2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
34 #include "diracdsp_mmx.h"
39 /* pixel operations */
/*
 * Constant vectors used as operands by the SIMD code in this file.
 * Every constant repeats one small value across all lanes:
 *   ff_pw_N  - decimal N in every 16-bit word
 *   ff_pb_N  - hexadecimal N in every byte
 *   ff_pd_N  - N.0 in both doubles
 * The 8-byte aligned uint64_t entries hold a single 64-bit vector; the
 * 16-byte aligned xmm_reg entries hold the same 64-bit pattern twice.
 */
/* ff_bone: 0x01 in every byte; ff_wtwo: 0x0002 in every 16-bit word. */
40 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
41 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* 0x80000000 in every 32-bit dword. */
43 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
44 { 0x8000000080000000ULL, 0x8000000080000000ULL };
/* Packed 16-bit word constants. */
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
60 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
67 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
/* Packed byte constants. */
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
77 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
79 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
81 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
82 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
83 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* Packed double-precision constants. */
85 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
86 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Emit a ".p2align 3" assembler directive at the point of use. */
88 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear register regd by XOR-ing it with itself. */
89 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/*
 * Fill every byte of regd with 0xFE without touching memory: comparing a
 * register with itself (pcmpeqd) sets all bits, and the byte-wise doubling
 * 0xFF + 0xFF wraps to 0xFE.
 */
91 #define MOVQ_BFE(regd) \
93 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
94 "paddb %%"#regd", %%"#regd" \n\t" ::)
/*
 * MOVQ_BONE(regd) puts 0x01 in every byte of regd; MOVQ_WTWO(regd) puts
 * 0x0002 in every 16-bit word. This first pair loads the values from the
 * ff_bone / ff_wtwo memory constants.
 */
97 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
98 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
100 // for shared library it's better to use this way for accessing constants
/*
 * Second pair: builds the same values in-register with no memory operand.
 * pcmpeqd sets all bits and psrlw $15 leaves 1 in each word; packuswb then
 * narrows that to 0x01 bytes (BONE), or psllw $1 doubles it to 2 (WTWO).
 * NOTE(review): both pairs define the same macro names; the preprocessor
 * conditional that presumably selects one of them is not visible here.
 */
102 #define MOVQ_BONE(regd) \
104 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
105 "psrlw $15, %%"#regd" \n\t" \
106 "packuswb %%"#regd", %%"#regd" \n\t" ::)
108 #define MOVQ_WTWO(regd) \
110 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
111 "psrlw $15, %%"#regd" \n\t" \
112 "psllw $1, %%"#regd" \n\t"::)
/*
 * Byte-wise average of rega and regb, rounded down, emitted as asm text:
 *   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * Masking with regfe clears the low bit of every byte first, so the 64-bit
 * shift cannot move a bit into the neighbouring byte.
 */
116 // using regr as temporary and for the output result
117 // first argument is unmodified and second is trashed
118 // regfe is supposed to contain 0xfefefefefefefefe
119 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
120 "movq "#rega", "#regr" \n\t" \
121 "pand "#regb", "#regr" \n\t" \
122 "pxor "#rega", "#regb" \n\t" \
123 "pand "#regfe", "#regb" \n\t" \
124 "psrlq $1, "#regb" \n\t" \
125 "paddb "#regb", "#regr" \n\t"
/*
 * Byte-wise average of rega and regb, rounded up, emitted as asm text:
 *   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * rega is left unchanged, regb is overwritten, and regfe is expected to
 * hold 0xFE in every byte (it masks the bit that the 64-bit shift would
 * otherwise carry across byte boundaries).
 */
127 #define PAVGB_MMX(rega, regb, regr, regfe) \
128 "movq "#rega", "#regr" \n\t" \
129 "por "#regb", "#regr" \n\t" \
130 "pxor "#rega", "#regb" \n\t" \
131 "pand "#regfe", "#regb" \n\t" \
132 "psrlq $1, "#regb" \n\t" \
133 "psubb "#regb", "#regr" \n\t"
/*
 * Paired round-down byte average, with the two computations interleaved:
 *   regr = (rega & regb) + (((rega ^ regb) & mm6) >> 1)
 *   regp = (regc & regd) + (((regc ^ regd) & mm6) >> 1)
 * rega and regc are left unchanged; regb and regd are overwritten.
 */
135 // mm6 is supposed to contain 0xfefefefefefefefe
136 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
137 "movq "#rega", "#regr" \n\t" \
138 "movq "#regc", "#regp" \n\t" \
139 "pand "#regb", "#regr" \n\t" \
140 "pand "#regd", "#regp" \n\t" \
141 "pxor "#rega", "#regb" \n\t" \
142 "pxor "#regc", "#regd" \n\t" \
143 "pand %%mm6, "#regb" \n\t" \
144 "pand %%mm6, "#regd" \n\t" \
145 "psrlq $1, "#regb" \n\t" \
146 "psrlq $1, "#regd" \n\t" \
147 "paddb "#regb", "#regr" \n\t" \
148 "paddb "#regd", "#regp" \n\t"
/*
 * Paired round-up byte average, with the two computations interleaved:
 *   regr = (rega | regb) - (((rega ^ regb) & mm6) >> 1)
 *   regp = (regc | regd) - (((regc ^ regd) & mm6) >> 1)
 * mm6 is used as the per-byte 0xFE mask. rega and regc are left unchanged;
 * regb and regd are overwritten.
 */
150 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
151 "movq "#rega", "#regr" \n\t" \
152 "movq "#regc", "#regp" \n\t" \
153 "por "#regb", "#regr" \n\t" \
154 "por "#regd", "#regp" \n\t" \
155 "pxor "#rega", "#regb" \n\t" \
156 "pxor "#regc", "#regd" \n\t" \
157 "pand %%mm6, "#regb" \n\t" \
158 "pand %%mm6, "#regd" \n\t" \
159 "psrlq $1, "#regd" \n\t" \
160 "psrlq $1, "#regb" \n\t" \
161 "psubb "#regb", "#regr" \n\t" \
162 "psubb "#regd", "#regp" \n\t"
/*
 * The sections below configure and include two template files several
 * times. Each section sets the name-building macro DEF() and the averaging
 * hooks before its #include.
 * NOTE(review): no #undef is visible between the sections in this listing,
 * yet the same macro names are redefined; confirm against the full file.
 */
164 /***********************************/
165 /* MMX no rounding */
/*
 * DEF(x, y) builds names of the form x_no_rnd_y_mmx. PAVGBP/PAVGB map to the
 * round-down macros while OP_AVG maps to the round-up PAVGB_MMX. SET_RND
 * expands to MOVQ_WONE, which is not defined in the visible part of this file.
 */
166 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
167 #define SET_RND MOVQ_WONE
168 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
169 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
170 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
172 #include "dsputil_mmx_rnd_template.c"
178 /***********************************/
/*
 * Rounding MMX variant: DEF(x, y) builds x_y_mmx, SET_RND is MOVQ_WTWO and
 * both averaging hooks map to the round-up macros.
 */
181 #define DEF(x, y) x ## _ ## y ## _mmx
182 #define SET_RND MOVQ_WTWO
183 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
184 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
186 #include "dsputil_mmx_rnd_template.c"
194 /***********************************/
/*
 * 3DNow! variant of the averaging template: DEF(x) builds x_3dnow and PAVGB
 * is the instruction mnemonic string "pavgusb".
 */
197 #define DEF(x) x ## _3dnow
198 #define PAVGB "pavgusb"
201 #include "dsputil_mmx_avg_template.c"
207 /***********************************/
/*
 * MMX2 variant of the averaging template: DEF(x) builds x_mmx2 and PAVGB is
 * the instruction mnemonic string "pavgb".
 */
210 #define DEF(x) x ## _mmx2
212 /* Introduced only in MMX2 set */
213 #define PAVGB "pavgb"
216 #include "dsputil_mmx_avg_template.c"
/*
 * Name aliases. The no_rnd, mmx2 and 3dnow spellings of the plain
 * put_pixels{4,8,16} routines all resolve (some through a second alias) to
 * the put_pixels*_mmx functions defined in this file.
 */
222 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
223 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
224 #define put_pixels16_mmx2 put_pixels16_mmx
225 #define put_pixels8_mmx2 put_pixels8_mmx
226 #define put_pixels4_mmx2 put_pixels4_mmx
227 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
228 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
229 #define put_pixels16_3dnow put_pixels16_mmx
230 #define put_pixels8_3dnow put_pixels8_mmx
231 #define put_pixels4_3dnow put_pixels4_mmx
232 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
233 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
235 /***********************************/
/*
 * Store 16-bit values as unsigned 8-bit pixels.
 * Each of the two instruction sequences below loads 64 bytes of 16-bit
 * values, narrows them to bytes with unsigned saturation (packuswb) and
 * writes four 8-byte rows at pix, pix + line_size, pix + 2 * line_size and
 * pix + 3 * line_size. pix advances by four rows between the two sequences,
 * giving eight rows in total.
 * NOTE(review): the rest of the parameter list, the declarations of pix and
 * p, and the asm statement openers are not visible in this listing.
 */
238 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
244 /* read the pixels */
/* First half: %3 is used directly as a memory operand (its definition is not visible here). */
249 "movq %3, %%mm0 \n\t"
250 "movq 8%3, %%mm1 \n\t"
251 "movq 16%3, %%mm2 \n\t"
252 "movq 24%3, %%mm3 \n\t"
253 "movq 32%3, %%mm4 \n\t"
254 "movq 40%3, %%mm5 \n\t"
255 "movq 48%3, %%mm6 \n\t"
256 "movq 56%3, %%mm7 \n\t"
257 "packuswb %%mm1, %%mm0 \n\t"
258 "packuswb %%mm3, %%mm2 \n\t"
259 "packuswb %%mm5, %%mm4 \n\t"
260 "packuswb %%mm7, %%mm6 \n\t"
261 "movq %%mm0, (%0) \n\t"
262 "movq %%mm2, (%0, %1) \n\t"
263 "movq %%mm4, (%0, %1, 2) \n\t"
264 "movq %%mm6, (%0, %2) \n\t"
265 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
268 pix += line_size * 4;
271 // If this were an exact copy of the code above, the
272 // compiler would generate some very strange code.
/* Second half: same operation, but the source is addressed through register %3 (p). */
275 "movq (%3), %%mm0 \n\t"
276 "movq 8(%3), %%mm1 \n\t"
277 "movq 16(%3), %%mm2 \n\t"
278 "movq 24(%3), %%mm3 \n\t"
279 "movq 32(%3), %%mm4 \n\t"
280 "movq 40(%3), %%mm5 \n\t"
281 "movq 48(%3), %%mm6 \n\t"
282 "movq 56(%3), %%mm7 \n\t"
283 "packuswb %%mm1, %%mm0 \n\t"
284 "packuswb %%mm3, %%mm2 \n\t"
285 "packuswb %%mm5, %%mm4 \n\t"
286 "packuswb %%mm7, %%mm6 \n\t"
287 "movq %%mm0, (%0) \n\t"
288 "movq %%mm2, (%0, %1) \n\t"
289 "movq %%mm4, (%0, %1, 2) \n\t"
290 "movq %%mm6, (%0, %2) \n\t"
291 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/*
 * Asm text for four output rows of a signed-to-pixel conversion.
 * Starting at byte offset `off` from %2, each row takes 16 bytes of 16-bit
 * values, narrows them to bytes with signed saturation (packsswb), adds mm0
 * to every byte and stores 8 bytes. Rows are written at (%0), (%0 + %3),
 * (%0 + 2 * %3) and (%0 + %1). mm0 must already hold the per-byte bias and
 * %1 the offset of the fourth row. Uses mm1-mm4 as scratch.
 */
295 #define put_signed_pixels_clamped_mmx_half(off) \
296 "movq "#off"(%2), %%mm1 \n\t" \
297 "movq 16 + "#off"(%2), %%mm2 \n\t" \
298 "movq 32 + "#off"(%2), %%mm3 \n\t" \
299 "movq 48 + "#off"(%2), %%mm4 \n\t" \
300 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
301 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
302 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
303 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
304 "paddb %%mm0, %%mm1 \n\t" \
305 "paddb %%mm0, %%mm2 \n\t" \
306 "paddb %%mm0, %%mm3 \n\t" \
307 "paddb %%mm0, %%mm4 \n\t" \
308 "movq %%mm1, (%0) \n\t" \
309 "movq %%mm2, (%0, %3) \n\t" \
310 "movq %%mm3, (%0, %3, 2) \n\t" \
311 "movq %%mm4, (%0, %1) \n\t"
/*
 * Convert signed 16-bit values from block into pixels: saturate to signed
 * bytes, add 0x80 to each byte (mm0 = ff_pb_80) and store eight 8-byte rows
 * at pixels, one row every line_skip bytes.
 * Operands: %0 = pixels (updated), %1 = line_skip3 (set to 3 * line_skip by
 * the lea), %2 = block, %3 = line_skip.
 * NOTE(review): the remainder of the parameter list and the declaration of
 * line_skip3 are not visible in this listing.
 */
313 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
316 x86_reg line_skip = line_size;
320 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
321 "lea (%3, %3, 2), %1 \n\t"
/* rows 0-3, read from block bytes 0..63 */
322 put_signed_pixels_clamped_mmx_half(0)
/* move the destination down four rows, then rows 4-7 from bytes 64..127 */
323 "lea (%0, %3, 4), %0 \n\t"
324 put_signed_pixels_clamped_mmx_half(64)
325 : "+&r"(pixels), "=&r"(line_skip3)
326 : "r"(block), "r"(line_skip)
/*
 * Add 16-bit values to existing pixels, with saturation.
 * The visible sequence handles two rows: it loads 32 bytes of 16-bit values
 * from %2 and 8 pixels each from *pix and *(pix + line_size), widens the
 * pixels to words by interleaving with mm7, adds with signed saturation
 * (paddsw), narrows back with unsigned saturation (packuswb) and stores
 * both rows. pix then advances by two rows.
 * NOTE(review): mm7 is presumably zeroed beforehand, and the sequence is
 * presumably repeated in a loop; neither the zeroing nor the loop nor the
 * %2 operand is visible in this listing.
 */
330 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
337 /* read the pixels */
344 "movq (%2), %%mm0 \n\t"
345 "movq 8(%2), %%mm1 \n\t"
346 "movq 16(%2), %%mm2 \n\t"
347 "movq 24(%2), %%mm3 \n\t"
348 "movq %0, %%mm4 \n\t"
349 "movq %1, %%mm6 \n\t"
350 "movq %%mm4, %%mm5 \n\t"
351 "punpcklbw %%mm7, %%mm4 \n\t"
352 "punpckhbw %%mm7, %%mm5 \n\t"
353 "paddsw %%mm4, %%mm0 \n\t"
354 "paddsw %%mm5, %%mm1 \n\t"
355 "movq %%mm6, %%mm5 \n\t"
356 "punpcklbw %%mm7, %%mm6 \n\t"
357 "punpckhbw %%mm7, %%mm5 \n\t"
358 "paddsw %%mm6, %%mm2 \n\t"
359 "paddsw %%mm5, %%mm3 \n\t"
360 "packuswb %%mm1, %%mm0 \n\t"
361 "packuswb %%mm3, %%mm2 \n\t"
362 "movq %%mm0, %0 \n\t"
363 "movq %%mm2, %1 \n\t"
364 : "+m"(*pix), "+m"(*(pix + line_size))
367 pix += line_size * 2;
/*
 * Copy rows of 4 bytes from pixels to block; both use line_size as stride.
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. REG_a holds
 * 2 * line_size. The visible body copies two rows, advances both pointers
 * by two rows, and repeats once, i.e. four rows per pass.
 * NOTE(review): the loop label, the update of h and the branch are not
 * visible in this listing.
 */
372 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
373 int line_size, int h)
376 "lea (%3, %3), %%"REG_a" \n\t"
379 "movd (%1 ), %%mm0 \n\t"
380 "movd (%1, %3), %%mm1 \n\t"
381 "movd %%mm0, (%2) \n\t"
382 "movd %%mm1, (%2, %3) \n\t"
383 "add %%"REG_a", %1 \n\t"
384 "add %%"REG_a", %2 \n\t"
385 "movd (%1 ), %%mm0 \n\t"
386 "movd (%1, %3), %%mm1 \n\t"
387 "movd %%mm0, (%2) \n\t"
388 "movd %%mm1, (%2, %3) \n\t"
389 "add %%"REG_a", %1 \n\t"
390 "add %%"REG_a", %2 \n\t"
393 : "+g"(h), "+r"(pixels), "+r"(block)
394 : "r"((x86_reg)line_size)
/*
 * Copy rows of 8 bytes from pixels to block; both use line_size as stride.
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. REG_a holds
 * 2 * line_size. The visible body copies two rows with movq, advances both
 * pointers by two rows, and repeats once, i.e. four rows per pass.
 * NOTE(review): the loop label, the update of h and the branch are not
 * visible in this listing.
 */
399 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
400 int line_size, int h)
403 "lea (%3, %3), %%"REG_a" \n\t"
406 "movq (%1 ), %%mm0 \n\t"
407 "movq (%1, %3), %%mm1 \n\t"
408 "movq %%mm0, (%2) \n\t"
409 "movq %%mm1, (%2, %3) \n\t"
410 "add %%"REG_a", %1 \n\t"
411 "add %%"REG_a", %2 \n\t"
412 "movq (%1 ), %%mm0 \n\t"
413 "movq (%1, %3), %%mm1 \n\t"
414 "movq %%mm0, (%2) \n\t"
415 "movq %%mm1, (%2, %3) \n\t"
416 "add %%"REG_a", %1 \n\t"
417 "add %%"REG_a", %2 \n\t"
420 : "+g"(h), "+r"(pixels), "+r"(block)
421 : "r"((x86_reg)line_size)
/*
 * Copy rows of 16 bytes from pixels to block; both use line_size as stride.
 * Each row is moved as two 8-byte halves (offsets 0 and 8).
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. REG_a holds
 * 2 * line_size. The visible body copies two rows, advances both pointers
 * by two rows, and repeats once, i.e. four rows per pass.
 * NOTE(review): the loop label, the update of h and the branch are not
 * visible in this listing.
 */
426 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
427 int line_size, int h)
430 "lea (%3, %3), %%"REG_a" \n\t"
433 "movq (%1 ), %%mm0 \n\t"
434 "movq 8(%1 ), %%mm4 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq 8(%1, %3), %%mm5 \n\t"
437 "movq %%mm0, (%2) \n\t"
438 "movq %%mm4, 8(%2) \n\t"
439 "movq %%mm1, (%2, %3) \n\t"
440 "movq %%mm5, 8(%2, %3) \n\t"
441 "add %%"REG_a", %1 \n\t"
442 "add %%"REG_a", %2 \n\t"
443 "movq (%1 ), %%mm0 \n\t"
444 "movq 8(%1 ), %%mm4 \n\t"
445 "movq (%1, %3), %%mm1 \n\t"
446 "movq 8(%1, %3), %%mm5 \n\t"
447 "movq %%mm0, (%2) \n\t"
448 "movq %%mm4, 8(%2) \n\t"
449 "movq %%mm1, (%2, %3) \n\t"
450 "movq %%mm5, 8(%2, %3) \n\t"
451 "add %%"REG_a", %1 \n\t"
452 "add %%"REG_a", %2 \n\t"
455 : "+g"(h), "+r"(pixels), "+r"(block)
456 : "r"((x86_reg)line_size)
/*
 * Copy rows of 16 bytes from pixels to block, four rows per pass.
 * Loads use movdqu, so pixels may be unaligned; stores use movdqa, so every
 * destination row (block and block + k * line_size) must be 16-byte aligned.
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
 * %4 = 3 * line_size. Both pointers advance by 4 * line_size per pass.
 * NOTE(review): the loop label, the update of h and the branch are not
 * visible in this listing.
 */
461 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
462 int line_size, int h)
466 "movdqu (%1 ), %%xmm0 \n\t"
467 "movdqu (%1, %3 ), %%xmm1 \n\t"
468 "movdqu (%1, %3, 2), %%xmm2 \n\t"
469 "movdqu (%1, %4 ), %%xmm3 \n\t"
470 "lea (%1, %3, 4), %1 \n\t"
471 "movdqa %%xmm0, (%2) \n\t"
472 "movdqa %%xmm1, (%2, %3) \n\t"
473 "movdqa %%xmm2, (%2, %3, 2) \n\t"
474 "movdqa %%xmm3, (%2, %4) \n\t"
476 "lea (%2, %3, 4), %2 \n\t"
478 : "+g"(h), "+r"(pixels), "+r"(block)
479 : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
/*
 * Average rows of 16 bytes from pixels into block, four rows per pass:
 * block[row] = pavgb(pixels[row], block[row]), a byte-wise rounding average.
 * Source loads use movdqu (may be unaligned); the pavgb memory operands and
 * the movdqa stores require every block row to be 16-byte aligned.
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
 * %4 = 3 * line_size. Both pointers advance by 4 * line_size per pass.
 * NOTE(review): the loop label, the update of h and the branch are not
 * visible in this listing.
 */
484 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
485 int line_size, int h)
489 "movdqu (%1 ), %%xmm0 \n\t"
490 "movdqu (%1, %3 ), %%xmm1 \n\t"
491 "movdqu (%1, %3, 2), %%xmm2 \n\t"
492 "movdqu (%1, %4 ), %%xmm3 \n\t"
493 "lea (%1, %3, 4), %1 \n\t"
494 "pavgb (%2 ), %%xmm0 \n\t"
495 "pavgb (%2, %3 ), %%xmm1 \n\t"
496 "pavgb (%2, %3, 2), %%xmm2 \n\t"
497 "pavgb (%2, %4), %%xmm3 \n\t"
498 "movdqa %%xmm0, (%2) \n\t"
499 "movdqa %%xmm1, (%2, %3) \n\t"
500 "movdqa %%xmm2, (%2, %3, 2) \n\t"
501 "movdqa %%xmm3, (%2, %4) \n\t"
503 "lea (%2, %3, 4), %2 \n\t"
505 : "+g"(h), "+r"(pixels), "+r"(block)
506 : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
/*
 * CLEAR_BLOCKS(name, n) defines `static void name(DCTELEM *blocks)`, which
 * writes zeros with MMX stores. %0 is the byte address 128 * n past blocks;
 * REG_a is an index added to it. Each pass stores 32 zero bytes at
 * %0 + REG_a and then adds 32 to REG_a.
 * NOTE(review): operand %1 (the initial REG_a value) and the loop branch are
 * not visible in this listing; presumably REG_a starts at -128 * n and the
 * loop ends when it reaches zero, clearing 128 * n bytes. Confirm.
 */
511 #define CLEAR_BLOCKS(name, n) \
512 static void name(DCTELEM *blocks) \
515 "pxor %%mm7, %%mm7 \n\t" \
516 "mov %1, %%"REG_a" \n\t" \
518 "movq %%mm7, (%0, %%"REG_a") \n\t" \
519 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
520 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
521 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
522 "add $32, %%"REG_a" \n\t" \
524 :: "r"(((uint8_t *)blocks) + 128 * n), \
/* clear_blocks_mmx uses n = 6, clear_block_mmx uses n = 1. */
529 CLEAR_BLOCKS(clear_blocks_mmx, 6)
530 CLEAR_BLOCKS(clear_block_mmx, 1)
/*
 * Zero 128 bytes starting at the address in %0, using eight 16-byte movaps
 * stores of a cleared xmm0. movaps requires that address to be 16-byte
 * aligned.
 * NOTE(review): the operand list is not visible in this listing; %0 is
 * presumably block.
 */
532 static void clear_block_sse(DCTELEM *block)
535 "xorps %%xmm0, %%xmm0 \n"
536 "movaps %%xmm0, (%0) \n"
537 "movaps %%xmm0, 16(%0) \n"
538 "movaps %%xmm0, 32(%0) \n"
539 "movaps %%xmm0, 48(%0) \n"
540 "movaps %%xmm0, 64(%0) \n"
541 "movaps %%xmm0, 80(%0) \n"
542 "movaps %%xmm0, 96(%0) \n"
543 "movaps %%xmm0, 112(%0) \n"
/*
 * Zero memory in 128-byte steps using SSE stores. %0 is the byte address
 * 128 * 6 past blocks and REG_a is an index added to it; each pass stores
 * 128 zero bytes at %0 + REG_a with movaps (16-byte alignment required) and
 * then adds 128 to REG_a.
 * NOTE(review): operand %1 (the initial REG_a value) and the loop branch are
 * not visible in this listing; presumably REG_a starts at -128 * 6 so that
 * 768 bytes are cleared in total. Confirm.
 */
549 static void clear_blocks_sse(DCTELEM *blocks)
552 "xorps %%xmm0, %%xmm0 \n"
553 "mov %1, %%"REG_a" \n"
555 "movaps %%xmm0, (%0, %%"REG_a") \n"
556 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
557 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
558 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
559 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
560 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
561 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
562 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
563 "add $128, %%"REG_a" \n"
565 :: "r"(((uint8_t *)blocks) + 128 * 6),
/*
 * dst[i] += src[i] with byte wrap-around.
 * The asm part adds 16 bytes per pass (two 8-byte paddb operations) at byte
 * offset %0 from src (%1) and dst (%2); %3 is w - 15. The C statement at the
 * end adds a single byte at index i.
 * NOTE(review): operand %0, the asm loop control and the C loop around the
 * final statement are not visible in this listing; presumably the asm
 * handles whole 16-byte groups and the scalar code the remainder.
 */
571 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
577 "movq (%1, %0), %%mm0 \n\t"
578 "movq (%2, %0), %%mm1 \n\t"
579 "paddb %%mm0, %%mm1 \n\t"
580 "movq %%mm1, (%2, %0) \n\t"
581 "movq 8(%1, %0), %%mm0 \n\t"
582 "movq 8(%2, %0), %%mm1 \n\t"
583 "paddb %%mm0, %%mm1 \n\t"
584 "movq %%mm1, 8(%2, %0) \n\t"
590 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
593 dst[i + 0] += src[i + 0];
/*
 * Median-prediction helper (per its name) working on dst, top and diff.
 * Only fragments are visible in this listing:
 *  - l and tl start as the low 8 bits of *left and *left_top;
 *  - operands: %0 = l, %1 = tl, %2 = t, %3 = x, %4 = w2, %5 = dst + w,
 *    %6 = diff + w, %7 = top + w;
 *  - a byte is loaded zero-extended from %3 + %4 into t;
 *  - the byte at diff + w + w2 is added to the low byte of l, and that low
 *    byte is stored at dst + w + w2.
 * NOTE(review): the prediction arithmetic between those steps, the loop
 * control, and the write-back of *left / *left_top are not visible here.
 * Presumably w2 starts negative and counts up to zero; confirm against the
 * full file.
 */
597 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
598 const uint8_t *diff, int w,
599 int *left, int *left_top)
603 int l = *left & 0xff;
604 int tl = *left_top & 0xff;
609 "movzbl (%3, %4), %2 \n"
622 "add (%6, %4), %b0 \n"
623 "mov %b0, (%5, %4) \n"
626 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
627 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
/*
 * Asm text shared by the two H.263 loop-filter functions in this file.
 *
 * Inputs (asm operands supplied by the user of the macro):
 *   %0..%3  four 8-byte memory operands, one per line across the edge
 *   %4      scalar whose low byte is broadcast to all 8 bytes of mm2
 *           (both callers pass 2 * strength)
 *   %5      8-byte mask applied before the final shift by 2
 *           (both callers pass ff_pb_FC)
 *
 * The first part widens the inputs to 16-bit words and forms
 * (%0 - %3) + 4 * (%2 - %1) per pixel; its sign is kept in mm6 and its
 * magnitude, shifted right by 3, is packed back to bytes. The rest limits
 * the correction using the broadcast %4 value and applies it.
 *
 * Outputs are left in registers, not stored:
 *   mm3 = new %1, mm4 = new %2, mm5 = new %0, mm6 = new %3
 * All of mm0-mm7 are overwritten.
 */
634 #define H263_LOOP_FILTER \
635 "pxor %%mm7, %%mm7 \n\t" \
636 "movq %0, %%mm0 \n\t" \
637 "movq %0, %%mm1 \n\t" \
638 "movq %3, %%mm2 \n\t" \
639 "movq %3, %%mm3 \n\t" \
640 "punpcklbw %%mm7, %%mm0 \n\t" \
641 "punpckhbw %%mm7, %%mm1 \n\t" \
642 "punpcklbw %%mm7, %%mm2 \n\t" \
643 "punpckhbw %%mm7, %%mm3 \n\t" \
644 "psubw %%mm2, %%mm0 \n\t" \
645 "psubw %%mm3, %%mm1 \n\t" \
646 "movq %1, %%mm2 \n\t" \
647 "movq %1, %%mm3 \n\t" \
648 "movq %2, %%mm4 \n\t" \
649 "movq %2, %%mm5 \n\t" \
650 "punpcklbw %%mm7, %%mm2 \n\t" \
651 "punpckhbw %%mm7, %%mm3 \n\t" \
652 "punpcklbw %%mm7, %%mm4 \n\t" \
653 "punpckhbw %%mm7, %%mm5 \n\t" \
654 "psubw %%mm2, %%mm4 \n\t" \
655 "psubw %%mm3, %%mm5 \n\t" \
656 "psllw $2, %%mm4 \n\t" \
657 "psllw $2, %%mm5 \n\t" \
658 "paddw %%mm0, %%mm4 \n\t" \
659 "paddw %%mm1, %%mm5 \n\t" \
660 "pxor %%mm6, %%mm6 \n\t" \
661 "pcmpgtw %%mm4, %%mm6 \n\t" \
662 "pcmpgtw %%mm5, %%mm7 \n\t" \
663 "pxor %%mm6, %%mm4 \n\t" \
664 "pxor %%mm7, %%mm5 \n\t" \
665 "psubw %%mm6, %%mm4 \n\t" \
666 "psubw %%mm7, %%mm5 \n\t" \
667 "psrlw $3, %%mm4 \n\t" \
668 "psrlw $3, %%mm5 \n\t" \
669 "packuswb %%mm5, %%mm4 \n\t" \
670 "packsswb %%mm7, %%mm6 \n\t" \
671 "pxor %%mm7, %%mm7 \n\t" \
672 "movd %4, %%mm2 \n\t" \
673 "punpcklbw %%mm2, %%mm2 \n\t" \
674 "punpcklbw %%mm2, %%mm2 \n\t" \
675 "punpcklbw %%mm2, %%mm2 \n\t" \
676 "psubusb %%mm4, %%mm2 \n\t" \
677 "movq %%mm2, %%mm3 \n\t" \
678 "psubusb %%mm4, %%mm3 \n\t" \
679 "psubb %%mm3, %%mm2 \n\t" \
680 "movq %1, %%mm3 \n\t" \
681 "movq %2, %%mm4 \n\t" \
682 "pxor %%mm6, %%mm3 \n\t" \
683 "pxor %%mm6, %%mm4 \n\t" \
684 "paddusb %%mm2, %%mm3 \n\t" \
685 "psubusb %%mm2, %%mm4 \n\t" \
686 "pxor %%mm6, %%mm3 \n\t" \
687 "pxor %%mm6, %%mm4 \n\t" \
688 "paddusb %%mm2, %%mm2 \n\t" \
689 "packsswb %%mm1, %%mm0 \n\t" \
690 "pcmpgtb %%mm0, %%mm7 \n\t" \
691 "pxor %%mm7, %%mm0 \n\t" \
692 "psubb %%mm7, %%mm0 \n\t" \
693 "movq %%mm0, %%mm1 \n\t" \
694 "psubusb %%mm2, %%mm0 \n\t" \
695 "psubb %%mm0, %%mm1 \n\t" \
696 "pand %5, %%mm1 \n\t" \
697 "psrlw $2, %%mm1 \n\t" \
698 "pxor %%mm7, %%mm1 \n\t" \
699 "psubb %%mm7, %%mm1 \n\t" \
700 "movq %0, %%mm5 \n\t" \
701 "movq %3, %%mm6 \n\t" \
702 "psubb %%mm1, %%mm5 \n\t" \
703 "paddb %%mm1, %%mm6 \n\t"
/*
 * H.263 loop filter working directly on four rows of 8 pixels around src.
 * The four rows are src - 2 * stride, src - stride, src and src + stride
 * (asm operands %0..%3). The strength comes from
 * ff_h263_loop_filter_strength[qscale] and is passed doubled as %4, with
 * ff_pb_FC as %5. The filtered registers are written straight back to
 * their rows. The body is only active when the H.263 decoder or encoder is
 * configured in.
 * NOTE(review): the asm opener and the H263_LOOP_FILTER expansion that
 * presumably precedes the stores are not visible in this listing.
 */
705 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
707 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
708 const int strength = ff_h263_loop_filter_strength[qscale];
713 "movq %%mm3, %1 \n\t"
714 "movq %%mm4, %2 \n\t"
715 "movq %%mm5, %0 \n\t"
716 "movq %%mm6, %3 \n\t"
717 : "+m"(*(uint64_t*)(src - 2 * stride)),
718 "+m"(*(uint64_t*)(src - 1 * stride)),
719 "+m"(*(uint64_t*)(src + 0 * stride)),
720 "+m"(*(uint64_t*)(src + 1 * stride))
721 : "g"(2 * strength), "m"(ff_pb_FC)
/*
 * H.263 loop filter for the other edge direction, done by transposing.
 * Visible steps:
 *  1. Two transpose4x4() calls copy a 4-byte-wide, 8-row region of src
 *     (rows 0-3, then rows 4-7) into the 32-byte scratch buffer temp, with
 *     a destination stride of 8.
 *  2. H263_LOOP_FILTER runs with 2 * strength and ff_pb_FC as its last two
 *     operands, leaving results in mm5, mm3, mm4, mm6.
 *  3. The second asm sequence interleaves those four registers back into
 *     row order and stores 4 bytes to each of eight rows: four rows from %0
 *     and four rows from %1 = src + 4 * stride (%2 = stride,
 *     %3 = 3 * stride).
 * The body is only active when the H.263 decoder or encoder is configured.
 * NOTE(review): the filter's memory operands, any adjustment of src before
 * the transposes, and operand %0 of the final asm are not visible in this
 * listing; presumably the filter reads temp[0..3] and %0 is src.
 */
726 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
728 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
729 const int strength = ff_h263_loop_filter_strength[qscale];
730 DECLARE_ALIGNED(8, uint64_t, temp)[4];
731 uint8_t *btemp = (uint8_t*)temp;
735 transpose4x4(btemp, src, 8, stride);
736 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
738 H263_LOOP_FILTER // 5 3 4 6
744 : "g"(2 * strength), "m"(ff_pb_FC)
/* Transpose mm5/mm3/mm4/mm6 back and write 4 bytes per row. */
748 "movq %%mm5, %%mm1 \n\t"
749 "movq %%mm4, %%mm0 \n\t"
750 "punpcklbw %%mm3, %%mm5 \n\t"
751 "punpcklbw %%mm6, %%mm4 \n\t"
752 "punpckhbw %%mm3, %%mm1 \n\t"
753 "punpckhbw %%mm6, %%mm0 \n\t"
754 "movq %%mm5, %%mm3 \n\t"
755 "movq %%mm1, %%mm6 \n\t"
756 "punpcklwd %%mm4, %%mm5 \n\t"
757 "punpcklwd %%mm0, %%mm1 \n\t"
758 "punpckhwd %%mm4, %%mm3 \n\t"
759 "punpckhwd %%mm0, %%mm6 \n\t"
760 "movd %%mm5, (%0) \n\t"
761 "punpckhdq %%mm5, %%mm5 \n\t"
762 "movd %%mm5, (%0, %2) \n\t"
763 "movd %%mm3, (%0, %2, 2) \n\t"
764 "punpckhdq %%mm3, %%mm3 \n\t"
765 "movd %%mm3, (%0, %3) \n\t"
766 "movd %%mm1, (%1) \n\t"
767 "punpckhdq %%mm1, %%mm1 \n\t"
768 "movd %%mm1, (%1, %2) \n\t"
769 "movd %%mm6, (%1, %2, 2) \n\t"
770 "punpckhdq %%mm6, %%mm6 \n\t"
771 "movd %%mm6, (%1, %3) \n\t"
773 "r"(src + 4 * stride),
774 "r"((x86_reg)stride),
775 "r"((x86_reg)(3 * stride))
/*
 * Replicate the border pixels of an image outwards. Visible steps:
 *
 *  - Left/right: two asm variants, one writing 8 bytes and one writing 16
 *    bytes on each side of a row. The first pixel of the row at %0 is
 *    broadcast to every byte of mm0 and stored just before the row; the last
 *    pixel (top byte of the 8 bytes ending at %0 + width) is broadcast to
 *    mm1 and stored just after the row. %1 = wrap, %2 = width, and
 *    %3 = ptr + wrap * height is an end address.
 *
 *  - EDGE_TOP / EDGE_BOTTOM: for i = 0, 4, 8, ... < h, 8 bytes at a time are
 *    read from %1 + %0 and written to four rows: %0, %0 + %2, %0 + 2 * %2
 *    and %0 + %3. For the top edge %2 = -wrap and %3 = -3 * wrap (rows going
 *    upwards); for the bottom edge %2 = wrap and %3 = 3 * wrap (rows going
 *    downwards from last_line). %4 = ptr + width + 2 * w is an end address.
 *
 * NOTE(review): the selection between the 8- and 16-byte variants, the loop
 * labels and branches, the pointer increments and the output operands are
 * not visible in this listing. Presumably %0 is ptr and the source of the
 * top/bottom copies is the first/last image row including its left and
 * right borders; confirm against the full file.
 */
780 /* Draw the edges of width 'w' of an image of size width, height
781 * this MMX version can only handle w == 8 || w == 16. */
782 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
783 int w, int h, int sides)
785 uint8_t *ptr, *last_line;
788 last_line = buf + (height - 1) * wrap;
/* 8-byte left/right border */
794 "movd (%0), %%mm0 \n\t"
795 "punpcklbw %%mm0, %%mm0 \n\t"
796 "punpcklwd %%mm0, %%mm0 \n\t"
797 "punpckldq %%mm0, %%mm0 \n\t"
798 "movq %%mm0, -8(%0) \n\t"
799 "movq -8(%0, %2), %%mm1 \n\t"
800 "punpckhbw %%mm1, %%mm1 \n\t"
801 "punpckhwd %%mm1, %%mm1 \n\t"
802 "punpckhdq %%mm1, %%mm1 \n\t"
803 "movq %%mm1, (%0, %2) \n\t"
808 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* 16-byte left/right border: same broadcast, stored twice per side */
813 "movd (%0), %%mm0 \n\t"
814 "punpcklbw %%mm0, %%mm0 \n\t"
815 "punpcklwd %%mm0, %%mm0 \n\t"
816 "punpckldq %%mm0, %%mm0 \n\t"
817 "movq %%mm0, -8(%0) \n\t"
818 "movq %%mm0, -16(%0) \n\t"
819 "movq -8(%0, %2), %%mm1 \n\t"
820 "punpckhbw %%mm1, %%mm1 \n\t"
821 "punpckhwd %%mm1, %%mm1 \n\t"
822 "punpckhdq %%mm1, %%mm1 \n\t"
823 "movq %%mm1, (%0, %2) \n\t"
824 "movq %%mm1, 8(%0, %2) \n\t"
829 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
833 /* top and bottom (and hopefully also the corners) */
834 if (sides & EDGE_TOP) {
835 for (i = 0; i < h; i += 4) {
836 ptr = buf - (i + 1) * wrap - w;
839 "movq (%1, %0), %%mm0 \n\t"
840 "movq %%mm0, (%0) \n\t"
841 "movq %%mm0, (%0, %2) \n\t"
842 "movq %%mm0, (%0, %2, 2) \n\t"
843 "movq %%mm0, (%0, %3) \n\t"
848 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
849 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
854 if (sides & EDGE_BOTTOM) {
855 for (i = 0; i < h; i += 4) {
856 ptr = last_line + (i + 1) * wrap - w;
859 "movq (%1, %0), %%mm0 \n\t"
860 "movq %%mm0, (%0) \n\t"
861 "movq %%mm0, (%0, %2) \n\t"
862 "movq %%mm0, (%0, %2, 2) \n\t"
863 "movq %%mm0, (%0, %3) \n\t"
868 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
869 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
870 "r"(ptr + width + 2 * w)
/*
 * Asm text for one output group of a four-term low-pass filter on 16-bit
 * words:
 *   out = OP(pack_u8((20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5))
 * where
 *   x1 = m3 + m4,  x2 = in2 + m5,  x3 = in1 + m6,  x4 = in0 + in7.
 * Side effects: m3 is reloaded with in7 at the end of the x1 step, and
 * mm4, mm5 and mm6 are used as scratch. The result is narrowed with
 * unsigned saturation and handed to OP as OP(%%mm5, out, %%mm7, d).
 * The pw_20 and pw_3 parameters are not referenced by the body; the
 * multipliers are taken from MANGLE(ff_pw_20) and MANGLE(ff_pw_3) directly.
 */
876 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
877 in0, in1, in2, in7, out, OP) \
878 "paddw "#m4", "#m3" \n\t" /* x1 */ \
879 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
880 "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
881 "movq "#in7", "#m3" \n\t" /* d */ \
882 "movq "#in0", %%mm5 \n\t" /* D */ \
883 "paddw "#m3", %%mm5 \n\t" /* x4 */ \
884 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
885 "movq "#in1", %%mm5 \n\t" /* C */ \
886 "movq "#in2", %%mm6 \n\t" /* B */ \
887 "paddw "#m6", %%mm5 \n\t" /* x3 */ \
888 "paddw "#m5", %%mm6 \n\t" /* x2 */ \
889 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
890 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
891 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
892 "paddw "#rnd", %%mm4 \n\t" /* x2 */ \
893 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
894 "psraw $5, %%mm5 \n\t" \
895 "packuswb %%mm5, %%mm5 \n\t" \
896 OP(%%mm5, out, %%mm7, d)
898 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \
899 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
908 "pxor %%mm7, %%mm7 \n\t" \
910 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
911 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
912 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
913 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
914 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
915 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
916 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
917 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
918 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
919 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
920 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
921 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
922 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
923 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
924 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
925 "paddw %%mm3, %%mm5 \n\t" /* b */ \
926 "paddw %%mm2, %%mm6 \n\t" /* c */ \
927 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
928 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
929 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
930 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
931 "paddw %%mm4, %%mm0 \n\t" /* a */ \
932 "paddw %%mm1, %%mm5 \n\t" /* d */ \
933 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
934 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
935 "paddw %6, %%mm6 \n\t" \
936 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
937 "psraw $5, %%mm0 \n\t" \
938 "movq %%mm0, %5 \n\t" \
939 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
941 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
942 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
943 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
944 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
945 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
946 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
947 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
948 "paddw %%mm0, %%mm2 \n\t" /* b */ \
949 "paddw %%mm5, %%mm3 \n\t" /* c */ \
950 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
951 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
952 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
953 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
954 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
955 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
956 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
957 "paddw %%mm2, %%mm1 \n\t" /* a */ \
958 "paddw %%mm6, %%mm4 \n\t" /* d */ \
959 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
960 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
961 "paddw %6, %%mm1 \n\t" \
962 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
963 "psraw $5, %%mm3 \n\t" \
964 "movq %5, %%mm1 \n\t" \
965 "packuswb %%mm3, %%mm1 \n\t" \
966 OP_MMX2(%%mm1, (%1), %%mm4, q) \
967 /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
969 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
970 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
971 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
972 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
973 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
974 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
975 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
976 "paddw %%mm1, %%mm5 \n\t" /* b */ \
977 "paddw %%mm4, %%mm0 \n\t" /* c */ \
978 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
979 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
980 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
981 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
982 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
983 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
984 "paddw %%mm3, %%mm2 \n\t" /* d */ \
985 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
986 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
987 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
988 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
989 "paddw %%mm2, %%mm6 \n\t" /* a */ \
990 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
991 "paddw %6, %%mm0 \n\t" \
992 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
993 "psraw $5, %%mm0 \n\t" \
994 /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
995 /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
997 "paddw %%mm5, %%mm3 \n\t" /* a */ \
998 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
999 "paddw %%mm4, %%mm6 \n\t" /* b */ \
1000 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
1001 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
1002 "paddw %%mm1, %%mm4 \n\t" /* c */ \
1003 "paddw %%mm2, %%mm5 \n\t" /* d */ \
1004 "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
1005 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
1006 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
1007 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
1008 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
1009 "paddw %6, %%mm4 \n\t" \
1010 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
1011 "psraw $5, %%mm4 \n\t" \
1012 "packuswb %%mm4, %%mm0 \n\t" \
1013 OP_MMX2(%%mm0, 8(%1), %%mm4, q) \
1019 : "+a"(src), "+c"(dst), "+D"(h) \
1020 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
1021 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
1026 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \
1034 /* quick HACK, XXX FIXME MUST be optimized */ \
1035 for (i = 0; i < h; i++) { \
1036 temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
1037 (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \
1038 temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
1039 (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \
1040 temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
1041 (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \
1042 temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
1043 (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \
1044 temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
1045 (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \
1046 temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
1047 (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \
1048 temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
1049 (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \
1050 temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
1051 (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \
1052 temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
1053 (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \
1054 temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
1055 (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \
1056 temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
1057 (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \
1058 temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
1059 (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \
1060 temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
1061 (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \
1062 temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
1063 (src[11] + src[16]) * 3 - (src[10] + src[16]); \
1064 temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
1065 (src[12] + src[16]) * 3 - (src[11] + src[15]); \
1066 temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
1067 (src[13] + src[15]) * 3 - (src[12] + src[14]); \
1068 __asm__ volatile ( \
1069 "movq (%0), %%mm0 \n\t" \
1070 "movq 8(%0), %%mm1 \n\t" \
1071 "paddw %2, %%mm0 \n\t" \
1072 "paddw %2, %%mm1 \n\t" \
1073 "psraw $5, %%mm0 \n\t" \
1074 "psraw $5, %%mm1 \n\t" \
1075 "packuswb %%mm1, %%mm0 \n\t" \
1076 OP_3DNOW(%%mm0, (%1), %%mm1, q) \
1077 "movq 16(%0), %%mm0 \n\t" \
1078 "movq 24(%0), %%mm1 \n\t" \
1079 "paddw %2, %%mm0 \n\t" \
1080 "paddw %2, %%mm1 \n\t" \
1081 "psraw $5, %%mm0 \n\t" \
1082 "psraw $5, %%mm1 \n\t" \
1083 "packuswb %%mm1, %%mm0 \n\t" \
1084 OP_3DNOW(%%mm0, 8(%1), %%mm1, q) \
1085 :: "r"(temp), "r"(dst), "m"(ROUNDER) \
1093 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, /* MMX2 8-pixel horizontal lowpass: out = (20a - 6b + 3c - d + ROUNDER) >> 5, packed to bytes */ \
1099 __asm__ volatile ( \
1100 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to widen bytes to words */ \
1102 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
1103 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
1104 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
1105 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
1106 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
1107 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
1108 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
1109 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
1110 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
1111 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
1112 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
1113 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
1114 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
1115 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
1116 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
1117 "paddw %%mm3, %%mm5 \n\t" /* b */ \
1118 "paddw %%mm2, %%mm6 \n\t" /* c */ \
1119 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
1120 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
1121 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
1122 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
1123 "paddw %%mm4, %%mm0 \n\t" /* a */ \
1124 "paddw %%mm1, %%mm5 \n\t" /* d */ \
1125 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
1126 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
1127 "paddw %5, %%mm6 \n\t" /* + ROUNDER (operand %5) */ \
1128 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
1129 "psraw $5, %%mm0 \n\t" /* >> 5: first four results, as words in mm0 */ \
1130 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
1132 "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
1133 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
1134 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
1135 "paddw %%mm5, %%mm1 \n\t" /* a */ \
1136 "paddw %%mm6, %%mm2 \n\t" /* b */ \
1137 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
1138 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
1139 "paddw %%mm6, %%mm3 \n\t" /* c */ \
1140 "paddw %%mm5, %%mm4 \n\t" /* d */ \
1141 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
1142 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
1143 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
1144 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
1145 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
1146 "paddw %5, %%mm1 \n\t" /* + ROUNDER (operand %5) */ \
1147 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
1148 "psraw $5, %%mm3 \n\t" /* >> 5: last four results, as words in mm3 */ \
1149 "packuswb %%mm3, %%mm0 \n\t" /* saturate the eight words to unsigned bytes */ \
1150 OP_MMX2(%%mm0, (%1), %%mm4, q) /* write mm0 to (dst) through the OP_MMX2 op; mm4 is its temp argument */ \
1156 : "+a"(src), "+c"(dst), "+d"(h) /* %0 = src, %1 = dst, %2 = h (all read-write) */ \
1157 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /* %3 = srcStride, %4 = dstStride */ \
1158 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) /* %5 = ROUNDER */ \
1163 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, /* 3DNow! variant: the filter taps are evaluated in C into temp[], the asm only rounds, shifts, packs and stores */ \
1171 /* quick HACK, XXX FIXME MUST be optimized */ \
1172 for (i = 0; i < h; i++) { /* h iterations; NOTE(review): the per-iteration src/dst advance is not visible here */ \
1173 temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + /* outputs 0..2 reuse src[0..2] for the taps that would fall left of src[0] */ \
1174 (src[1] + src[3]) * 3 - (src[2] + src[4]); \
1175 temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \
1176 (src[0] + src[4]) * 3 - (src[1] + src[5]); \
1177 temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \
1178 (src[0] + src[5]) * 3 - (src[0] + src[6]); \
1179 temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \
1180 (src[1] + src[6]) * 3 - (src[0] + src[7]); \
1181 temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \
1182 (src[2] + src[7]) * 3 - (src[1] + src[8]); \
1183 temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + /* outputs 5..7 never index past src[8]; taps beyond it reuse src[6..8] */ \
1184 (src[3] + src[8]) * 3 - (src[2] + src[8]); \
1185 temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \
1186 (src[4] + src[8]) * 3 - (src[3] + src[7]); \
1187 temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \
1188 (src[5] + src[7]) * 3 - (src[4] + src[6]); \
1189 __asm__ volatile ( \
1190 "movq (%0), %%mm0 \n\t" /* first 8 bytes of temp */ \
1191 "movq 8(%0), %%mm1 \n\t" /* next 8 bytes of temp */ \
1192 "paddw %2, %%mm0 \n\t" /* + ROUNDER (operand %2) */ \
1193 "paddw %2, %%mm1 \n\t" \
1194 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 on each word */ \
1195 "psraw $5, %%mm1 \n\t" \
1196 "packuswb %%mm1, %%mm0 \n\t" /* saturate the eight words to unsigned bytes */ \
1197 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* write mm0 to (dst) through the OP_3DNOW op; mm1 is its temp argument */ \
1198 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER */ \
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) emits one family of MPEG-4 qpel
 * functions, all suffixed with MMX and prefixed with OPNAME:
 *  - mpeg4_qpel16_v_lowpass / mpeg4_qpel8_v_lowpass: the source bytes are
 *    first widened to 16-bit words and stored in temp[], then a chain of
 *    QPEL_V_LOW steps reads temp[] and writes dst;
 *  - qpel8_mcXY / qpel16_mcXY: entry points that combine the horizontal and
 *    vertical lowpass helpers with the pixels*_l2 routines.
 * ROUNDER is handed to the asm as a memory operand, RND is pasted into the
 * names of the "put" helpers, and OP is forwarded to QPEL_V_LOW.
 */
1206 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1207 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, /* vertical lowpass, 16 pixels wide */ \
1212 uint64_t temp[17 * 4]; /* 4 column groups of 17 quadwords each (see the 17 * 8 store offsets below) */ \
1213 uint64_t *temp_ptr = temp; \
1216 /* FIXME unroll */ \
1217 __asm__ volatile ( \
1218 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to widen bytes to words */ \
1220 "movq (%0), %%mm0 \n\t" \
1221 "movq (%0), %%mm1 \n\t" \
1222 "movq 8(%0), %%mm2 \n\t" \
1223 "movq 8(%0), %%mm3 \n\t" \
1224 "punpcklbw %%mm7, %%mm0 \n\t" \
1225 "punpckhbw %%mm7, %%mm1 \n\t" \
1226 "punpcklbw %%mm7, %%mm2 \n\t" \
1227 "punpckhbw %%mm7, %%mm3 \n\t" \
1228 "movq %%mm0, (%1) \n\t" /* each group of four pixels goes to its own column group in temp */ \
1229 "movq %%mm1, 17 * 8(%1) \n\t" \
1230 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1231 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1236 : "+r"(src), "+r"(temp_ptr), "+r"(count) /* %0 = src, %1 = temp_ptr, %2 = count */ \
1237 : "r"((x86_reg)srcStride) /* %3 = srcStride */ \
1244 /* FIXME reorder for speed */ \
1245 __asm__ volatile ( \
1246 /* "pxor %%mm7, %%mm7 \n\t" */ \
1248 "movq (%0), %%mm0 \n\t" \
1249 "movq 8(%0), %%mm1 \n\t" \
1250 "movq 16(%0), %%mm2 \n\t" \
1251 "movq 24(%0), %%mm3 \n\t" \
1252 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1253 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1255 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1257 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1259 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1260 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1262 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1263 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1265 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1266 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1268 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1269 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1271 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
1273 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
1275 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1276 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1278 "add $136, %0 \n\t" /* 136 = 17 * 8: step temp_ptr to the next column group */ \
1283 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0 = temp_ptr, %1 = dst, %2 = count */ \
1284 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), /* %3 = dstStride, %4 = 2 * dstStride */ \
1285 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), /* %5 = ROUNDER */ \
1286 "g"(4 - 14 * (x86_reg)dstStride) /* %6; NOTE(review): presumably added to dst between column groups, the use is not visible here */ \
1291 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, /* vertical lowpass, 8 pixels wide */ \
1296 uint64_t temp[9 * 2]; /* 2 column groups of 9 quadwords each (see the 9*8 store offset below) */ \
1297 uint64_t *temp_ptr = temp; \
1300 /* FIXME unroll */ \
1301 __asm__ volatile ( \
1302 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to widen bytes to words */ \
1304 "movq (%0), %%mm0 \n\t" \
1305 "movq (%0), %%mm1 \n\t" \
1306 "punpcklbw %%mm7, %%mm0 \n\t" \
1307 "punpckhbw %%mm7, %%mm1 \n\t" \
1308 "movq %%mm0, (%1) \n\t" \
1309 "movq %%mm1, 9*8(%1) \n\t" \
1314 : "+r"(src), "+r"(temp_ptr), "+r"(count) /* %0 = src, %1 = temp_ptr, %2 = count */ \
1315 : "r"((x86_reg)srcStride) /* %3 = srcStride */ \
1322 /* FIXME reorder for speed */ \
1323 __asm__ volatile ( \
1324 /* "pxor %%mm7, %%mm7 \n\t" */ \
1326 "movq (%0), %%mm0 \n\t" \
1327 "movq 8(%0), %%mm1 \n\t" \
1328 "movq 16(%0), %%mm2 \n\t" \
1329 "movq 24(%0), %%mm3 \n\t" \
1330 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1331 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1333 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1335 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1337 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1339 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1341 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1342 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1344 "add $72, %0 \n\t" /* 72 = 9 * 8: step temp_ptr to the next column group */ \
1349 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0 = temp_ptr, %1 = dst, %2 = count */ \
1350 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), /* %3 = dstStride, %4 = 2 * dstStride */ \
1351 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), /* %5 = ROUNDER */ \
1352 "g"(4 - 6 * (x86_reg)dstStride) /* %6; NOTE(review): presumably added to dst between column groups, the use is not visible here */ \
1357 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, /* mc00: plain pixels8 op, no filtering */ \
1360 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
1363 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, /* mc10: src combined with its horizontal lowpass */ \
1367 uint8_t * const half = (uint8_t*)temp; \
1368 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1370 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1373 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, /* mc20: horizontal lowpass written straight to dst */ \
1376 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1380 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, /* mc30: like mc10, but combined with src + 1 */ \
1384 uint8_t * const half = (uint8_t*)temp; \
1385 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1387 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
1391 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, /* mc01: src combined with its vertical lowpass */ \
1395 uint8_t * const half = (uint8_t*)temp; \
1396 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1397 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1400 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, /* mc02: vertical lowpass written straight to dst */ \
1403 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1406 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, /* mc03: like mc01, but combined with src + stride */ \
1410 uint8_t * const half = (uint8_t*)temp; \
1411 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1412 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
1416 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, /* mc11: halfH = h-lowpass combined with src (9 rows), halfHV = v-lowpass of halfH, dst = op(halfH, halfHV) */ \
1419 uint64_t half[8 + 9]; \
1420 uint8_t * const halfH = ((uint8_t*)half) + 64; /* halfH starts 64 bytes into half, after the halfHV area */ \
1421 uint8_t * const halfHV = ((uint8_t*)half); \
1422 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1424 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1425 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1426 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1429 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, /* mc31: as mc11, but halfH is combined with src + 1 */ \
1432 uint64_t half[8 + 9]; \
1433 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1434 uint8_t * const halfHV = ((uint8_t*)half); \
1435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1437 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1439 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1440 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1443 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, /* mc13: as mc11, but the final combine reads halfH + 8 (one 8-byte row further) */ \
1446 uint64_t half[8 + 9]; \
1447 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1448 uint8_t * const halfHV = ((uint8_t*)half); \
1449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1451 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1452 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1453 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1456 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, /* mc33: src + 1 as in mc31, halfH + 8 as in mc13 */ \
1459 uint64_t half[8 + 9]; \
1460 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1461 uint8_t * const halfHV = ((uint8_t*)half); \
1462 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1464 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1466 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1467 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1470 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, /* mc21: halfH = h-lowpass, halfHV = v-lowpass of halfH, dst = op(halfH, halfHV) */ \
1473 uint64_t half[8 + 9]; \
1474 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1475 uint8_t * const halfHV = ((uint8_t*)half); \
1476 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1478 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1479 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1482 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, /* mc23: as mc21, but the final combine reads halfH + 8 */ \
1485 uint64_t half[8 + 9]; \
1486 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1487 uint8_t * const halfHV = ((uint8_t*)half); \
1488 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1490 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1491 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1494 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, /* mc12: halfH = h-lowpass combined with src, dst = v-lowpass of halfH */ \
1497 uint64_t half[8 + 9]; \
1498 uint8_t * const halfH = ((uint8_t*)half); \
1499 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1501 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1502 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1505 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, /* mc32: as mc12, but halfH is combined with src + 1 */ \
1508 uint64_t half[8 + 9]; \
1509 uint8_t * const halfH = ((uint8_t*)half); \
1510 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1512 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1514 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1517 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, /* mc22: v-lowpass of the h-lowpass */ \
1521 uint8_t * const halfH = ((uint8_t*)half); \
1522 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1524 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1527 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, /* mc00: plain pixels16 op, no filtering */ \
1530 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1533 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, /* mc10: src combined with its horizontal lowpass */ \
1536 uint64_t temp[32]; /* 32 * 8 = 256 bytes, used as a byte buffer through half */ \
1537 uint8_t * const half = (uint8_t*)temp; \
1538 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1540 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1543 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, /* mc20: horizontal lowpass written straight to dst */ \
1546 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1547 stride, stride, 16); \
1550 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, /* mc30: like mc10, but combined with src + 1 */ \
1553 uint64_t temp[32]; \
1554 uint8_t * const half = (uint8_t*)temp; \
1555 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1557 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1558 stride, stride, 16); \
1561 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, /* mc01: src combined with its vertical lowpass */ \
1564 uint64_t temp[32]; \
1565 uint8_t * const half = (uint8_t*)temp; \
1566 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1568 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1571 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, /* mc02: vertical lowpass written straight to dst */ \
1574 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1577 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, /* mc03: like mc01, but combined with src + stride */ \
1580 uint64_t temp[32]; \
1581 uint8_t * const half = (uint8_t*)temp; \
1582 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1584 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1585 stride, stride, 16); \
1588 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, /* mc11: halfH = h-lowpass combined with src, halfHV = v-lowpass of halfH, dst = op(halfH, halfHV) */ \
1591 uint64_t half[16 * 2 + 17 * 2]; \
1592 uint8_t * const halfH = ((uint8_t*)half) + 256; /* halfH starts 256 bytes into half, after the halfHV area */ \
1593 uint8_t * const halfHV = ((uint8_t*)half); \
1594 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1596 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1598 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1600 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1603 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, /* mc31: as mc11, but halfH is combined with src + 1 */ \
1606 uint64_t half[16 * 2 + 17 * 2]; \
1607 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1608 uint8_t * const halfHV = ((uint8_t*)half); \
1609 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1611 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1613 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1615 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1618 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, /* mc13: as mc11, but the final combine reads halfH + 16 */ \
1621 uint64_t half[16 * 2 + 17 * 2]; \
1622 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1623 uint8_t * const halfHV = ((uint8_t*)half); \
1624 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1626 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1628 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1630 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1634 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, /* mc33: src + 1 as in mc31, halfH + 16 as in mc13 */ \
1637 uint64_t half[16 * 2 + 17 * 2]; \
1638 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1639 uint8_t * const halfHV = ((uint8_t*)half); \
1640 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1642 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1644 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1646 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1650 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, /* mc21: halfH = h-lowpass, halfHV = v-lowpass of halfH, dst = op(halfH, halfHV) */ \
1653 uint64_t half[16 * 2 + 17 * 2]; \
1654 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1655 uint8_t * const halfHV = ((uint8_t*)half); \
1656 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1658 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1660 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1663 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, /* mc23: as mc21, but the final combine reads halfH + 16 */ \
1666 uint64_t half[16 * 2 + 17 * 2]; \
1667 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1668 uint8_t * const halfHV = ((uint8_t*)half); \
1669 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1671 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1673 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1677 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, /* mc12: halfH = h-lowpass combined with src, dst = v-lowpass of halfH */ \
1680 uint64_t half[17 * 2]; \
1681 uint8_t * const halfH = ((uint8_t*)half); \
1682 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1684 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1686 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1689 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, /* mc32: as mc12, but halfH is combined with src + 1 */ \
1692 uint64_t half[17 * 2]; \
1693 uint8_t * const halfH = ((uint8_t*)half); \
1694 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1696 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1698 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1701 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, /* mc22: v-lowpass of the h-lowpass */ \
1704 uint64_t half[17 * 2]; \
1705 uint8_t * const halfH = ((uint8_t*)half); \
1706 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1708 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
/* Store op: expands to a single "mov<size> a, b" asm line; temp is unused. */
1711 #define PUT_OP(a, b, temp, size) \
1712 "mov"#size" "#a", "#b" \n\t"
/*
 * Averaging op using the 3DNow! pavgusb instruction: loads b into temp,
 * averages temp into a, then stores a back to b.
 */
1714 #define AVG_3DNOW_OP(a, b, temp, size) \
1715 "mov"#size" "#b", "#temp" \n\t" \
1716 "pavgusb "#temp", "#a" \n\t" \
1717 "mov"#size" "#a", "#b" \n\t"
/*
 * Averaging op using the pavgb instruction: loads b into temp, averages
 * temp into a, then stores a back to b.
 */
1719 #define AVG_MMX2_OP(a, b, temp, size) \
1720 "mov"#size" "#b", "#temp" \n\t" \
1721 "pavgb "#temp", "#a" \n\t" \
1722 "mov"#size" "#a", "#b" \n\t"
/*
 * Instantiate the qpel function families. put_ and avg_ pass ff_pw_16 as
 * ROUNDER, put_no_rnd_ passes ff_pw_15. QPEL_BASE receives two ops per
 * variant (avg_ passes AVG_MMX2_OP and AVG_3DNOW_OP); QPEL_OP is expanded
 * once per variant for the 3dnow suffix and once for the mmx2 suffix.
 */
1724 QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP)
1725 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP)
1726 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1727 QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow)
1728 QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow)
1729 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1730 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2)
1731 QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2)
1732 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1734 /***********************************/
1735 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/*
 * Defines OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX> as a direct call to the
 * OPNAME pixels<SIZE><HPEL> routine with a height of SIZE rows.
 */
1737 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
1738 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1742 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
/*
 * Defines OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX> as a call to
 * OPNAME 2tap_qpel<SIZE>_l3_<MMX> that starts reading at src + S0.
 * NOTE(review): S1 and S2 are presumably the remaining offset arguments of
 * that call; their use is not visible here.
 */
1745 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
1746 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1750 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
/*
 * QPEL_2TAP(OPNAME, SIZE, MMX) emits the 2-tap (bilinear) qpel entry points
 * for one operation, block size and instruction-set suffix:
 *  - mc20 / mc02 use the _x2_ / _y2_ pixel routines of the given MMX suffix;
 *    mc22 always uses the _xy2_mmx routine;
 *  - mc00 is an alias of the regular qpel mc00, mc21 of 2tap mc20 and mc12
 *    of 2tap mc02;
 *  - mc32 is the _y2_ routine applied at src + 1, mc23 the _x2_ routine
 *    applied at src + stride;
 *  - the remaining positions are generated by QPEL_2TAP_L3 with the
 *    (S0, S1, S2) offsets listed per position.
 */
1754 #define QPEL_2TAP(OPNAME, SIZE, MMX) \
1755 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
1756 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
1757 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
1758 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
1759 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
1760 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
1761 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
1762 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
1763 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
1764 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
1768 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
1770 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
1774 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
1777 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
1778 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
1779 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
1780 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
1781 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
1782 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
1783 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
1784 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
/* Instantiate the 2-tap qpel functions: put_/avg_ x 16/8 x mmx2/3dnow. */
1786 QPEL_2TAP(put_, 16, mmx2)
1787 QPEL_2TAP(avg_, 16, mmx2)
1788 QPEL_2TAP(put_, 8, mmx2)
1789 QPEL_2TAP(avg_, 8, mmx2)
1790 QPEL_2TAP(put_, 16, 3dnow)
1791 QPEL_2TAP(avg_, 16, 3dnow)
1792 QPEL_2TAP(put_, 8, 3dnow)
1793 QPEL_2TAP(avg_, 8, 3dnow)
/* RV40 qpel mc33, 8-pixel put: forwards to put_pixels8_xy2_mmx with 8 rows. */
1795 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1797 put_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 qpel mc33, 16-pixel put: forwards to put_pixels16_xy2_mmx with 16 rows. */
1799 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1801 put_pixels16_xy2_mmx(dst, src, stride, 16);
/* RV40 qpel mc33, 8-pixel avg: forwards to avg_pixels8_xy2_mmx with 8 rows. */
1803 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1805 avg_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 qpel mc33, 16-pixel avg: forwards to avg_pixels16_xy2_mmx with 16 rows. */
1807 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1809 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/*
 * Signature shared by the edge-emulation core routines. Every size and
 * coordinate argument is passed as x86_reg; emulated_edge_mc() calls it
 * with (buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w).
 */
1813 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
1814 x86_reg linesize, x86_reg start_y,
1815 x86_reg end_y, x86_reg block_h,
1816 x86_reg start_x, x86_reg end_x,
/* Core implementations defined outside this file (presumably in assembly, given the HAVE_YASM guard that closes this section). */
1818 extern emu_edge_core_func ff_emu_edge_core_mmx;
1819 extern emu_edge_core_func ff_emu_edge_core_sse;
/*
 * Common front end of the edge-emulation routines.
 *
 * Moves the block position (src_x, src_y) so that the block still touches
 * the w x h picture, computes the part of the block_w x block_h block that
 * lies inside the picture as [start_x, end_x) x [start_y, end_y), points
 * src at the first in-picture sample and lets core_fn do the work.
 * NOTE(review): the conditions guarding the first branch of each clamp are
 * not visible here; presumably src_y >= h and src_x >= w.
 */
1821 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
1823 int block_w, int block_h,
1824 int src_x, int src_y,
1826 emu_edge_core_func *core_fn)
1828 int start_y, start_x, end_y, end_x, src_y_add = 0;
// vertical clamp: the row correction is remembered in src_y_add and applied to src further down
1831 src_y_add = h - 1 - src_y;
1833 } else if (src_y <= -block_h) {
1834 src_y_add = 1 - block_h - src_y;
1835 src_y = 1 - block_h;
// horizontal clamp: applied to src right away
1838 src += w - 1 - src_x;
1840 } else if (src_x <= -block_w) {
1841 src += 1 - block_w - src_x;
1842 src_x = 1 - block_w;
// intersection of the block with the picture, in block coordinates
1845 start_y = FFMAX(0, -src_y);
1846 start_x = FFMAX(0, -src_x);
1847 end_y = FFMIN(block_h, h-src_y);
1848 end_x = FFMIN(block_w, w-src_x);
1849 assert(start_x < end_x && block_w > 0);
1850 assert(start_y < end_y && block_h > 0);
1852 // fill in the to-be-copied part plus all above/below
1853 src += (src_y_add + start_y) * linesize + start_x;
1855 core_fn(buf, src, linesize, start_y, end_y,
1856 block_h, start_x, end_x, block_w);
/* Edge emulation bound to the MMX core: emulated_edge_mc() with ff_emu_edge_core_mmx. */
1860 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1862 int block_w, int block_h,
1863 int src_x, int src_y, int w, int h)
1865 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1866 w, h, &ff_emu_edge_core_mmx);
/* Edge emulation bound to the SSE core: emulated_edge_mc() with ff_emu_edge_core_sse. */
1870 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1872 int block_w, int block_h,
1873 int src_x, int src_y, int w, int h)
1875 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1876 w, h, &ff_emu_edge_core_sse);
1878 #endif /* HAVE_YASM */
/* Signature of the edge-emulation routine that gmc() receives as emu_edge_fn. */
1880 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1881 int linesize, int block_w, int block_h,
1882 int src_x, int src_y, int w, int h);
/*
 * MMX global motion compensation with bilinear interpolation.
 *
 * ox/oy are the fixed-point source coordinates of the block origin and
 * dxx/dxy/dyx/dyy their per-pixel increments; (16 + shift) fractional bits
 * separate the full-pel part from the sub-pel part. The block is handed to
 * ff_gmc_c() when the full-pel offset is not constant over the block or
 * when any increment has one of its low 4 bits set. Otherwise the block is
 * processed four pixels at a time, one column of four per outer iteration,
 * reading through emu_edge_fn's output when (ix, iy) is too close to the
 * picture border.
 * NOTE(review): w is used below but its definition is not visible here.
 */
1884 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1885 int stride, int h, int ox, int oy,
1886 int dxx, int dxy, int dyx, int dyy,
1887 int shift, int r, int width, int height,
1888 emulated_edge_mc_func *emu_edge_fn)
// full-pel part of the block origin
1891 const int ix = ox >> (16 + shift);
1892 const int iy = oy >> (16 + shift);
// origin and increments with their low 4 bits dropped, for the 16-bit word arithmetic below
1893 const int oxs = ox >> 4;
1894 const int oys = oy >> 4;
1895 const int dxxs = dxx >> 4;
1896 const int dxys = dxy >> 4;
1897 const int dyxs = dyx >> 4;
1898 const int dyys = dyy >> 4;
// r and the per-row increments replicated into four 16-bit lanes
1899 const uint16_t r4[4] = { r, r, r, r };
1900 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1901 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1902 const uint64_t shift2 = 2 * shift;
// NOTE(review): variable-length array sized from h and stride; confirm that callers bound both, otherwise this can exhaust the stack
1903 uint8_t edge_buf[(h + 1) * stride];
// total drift of the coordinates across the block width (w - 1 columns) and height (h - 1 rows)
1906 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1907 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1908 const int dxh = dxy * (h - 1);
1909 const int dyw = dyx * (w - 1);
1910 if ( // non-constant fullpel offset (3% of blocks)
1911 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1912 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1913 // uses more than 16 bits of subpel mv (only at huge resolution)
1914 || (dxx | dxy | dyx | dyy) & 15) {
1915 // FIXME could still use mmx for some of the rows
1916 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1917 shift, r, width, height);
1921 src += ix + iy * stride;
// the unsigned casts make negative ix/iy fail the range test as well
1922 if ((unsigned)ix >= width - w ||
1923 (unsigned)iy >= height - h) {
1924 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
// replicate the low 16 bits of operand %0 into all four words of mm6 (presumably the "s" of the comments below; the operand list is not visible here) and clear mm7
1929 "movd %0, %%mm6 \n\t"
1930 "pxor %%mm7, %%mm7 \n\t"
1931 "punpcklwd %%mm6, %%mm6 \n\t"
1932 "punpcklwd %%mm6, %%mm6 \n\t"
1936 for (x = 0; x < w; x += 4) {
// sub-pel x/y coordinates of the four pixels of this column group, one row-increment before the first row
1937 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1938 oxs - dxys + dxxs * (x + 1),
1939 oxs - dxys + dxxs * (x + 2),
1940 oxs - dxys + dxxs * (x + 3) };
1941 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1942 oys - dyys + dyxs * (x + 1),
1943 oys - dyys + dyxs * (x + 2),
1944 oys - dyys + dyxs * (x + 3) };
1946 for (y = 0; y < h; y++) {
// advance dx4/dy4 by one row, store them back, and keep the top 4 bits of each word in mm4 (dx) and mm5 (dy)
1948 "movq %0, %%mm4 \n\t"
1949 "movq %1, %%mm5 \n\t"
1950 "paddw %2, %%mm4 \n\t"
1951 "paddw %3, %%mm5 \n\t"
1952 "movq %%mm4, %0 \n\t"
1953 "movq %%mm5, %1 \n\t"
1954 "psrlw $12, %%mm4 \n\t"
1955 "psrlw $12, %%mm5 \n\t"
1956 : "+m"(*dx4), "+m"(*dy4)
1957 : "m"(*dxy4), "m"(*dyy4)
// the four bilinear weights, from mm6 (s), mm4 (dx) and mm5 (dy)
1961 "movq %%mm6, %%mm2 \n\t"
1962 "movq %%mm6, %%mm1 \n\t"
1963 "psubw %%mm4, %%mm2 \n\t"
1964 "psubw %%mm5, %%mm1 \n\t"
1965 "movq %%mm2, %%mm0 \n\t"
1966 "movq %%mm4, %%mm3 \n\t"
1967 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1968 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1969 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1970 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
// second source row (%3 = src[stride], %4 = src[stride + 1]), widened to words
1972 "movd %4, %%mm5 \n\t"
1973 "movd %3, %%mm4 \n\t"
1974 "punpcklbw %%mm7, %%mm5 \n\t"
1975 "punpcklbw %%mm7, %%mm4 \n\t"
1976 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1977 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
// first source row (%1 = src[0], %2 = src[1]), widened to words
1979 "movd %2, %%mm5 \n\t"
1980 "movd %1, %%mm4 \n\t"
1981 "punpcklbw %%mm7, %%mm5 \n\t"
1982 "punpcklbw %%mm7, %%mm4 \n\t"
1983 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1984 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
// add r4 (%5) and sum the four weighted samples
1985 "paddw %5, %%mm1 \n\t"
1986 "paddw %%mm3, %%mm2 \n\t"
1987 "paddw %%mm1, %%mm0 \n\t"
1988 "paddw %%mm2, %%mm0 \n\t"
// shift right by shift2 (%6 = 2 * shift), pack to bytes and store four pixels
1990 "psrlw %6, %%mm0 \n\t"
1991 "packuswb %%mm0, %%mm0 \n\t"
1992 "movd %%mm0, %0 \n\t"
1994 : "=m"(dst[x + y * stride])
1995 : "m"(src[0]), "m"(src[1]),
1996 "m"(src[stride]), "m"(src[stride + 1]),
1997 "m"(*r4), "m"(shift2)
// back up h rows and step four pixels right (the per-row advance of src is not visible here)
2001 src += 4 - h * stride;
/* gmc() bound to the MMX edge-emulation routine emulated_edge_mc_mmx. */
2007 static void gmc_mmx(uint8_t *dst, uint8_t *src,
2008 int stride, int h, int ox, int oy,
2009 int dxx, int dxy, int dyx, int dyy,
2010 int shift, int r, int width, int height)
2012 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2013 width, height, &emulated_edge_mc_mmx);
/* gmc() bound to the SSE edge-emulation routine emulated_edge_mc_sse. */
2016 static void gmc_sse(uint8_t *dst, uint8_t *src,
2017 int stride, int h, int ox, int oy,
2018 int dxx, int dxy, int dyx, int dyy,
2019 int shift, int r, int width, int height)
2021 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2022 width, height, &emulated_edge_mc_sse);
/*
 * gmc() bound to ff_emulated_edge_mc_8.
 * NOTE(review): this is a second definition of gmc_mmx; presumably it is
 * the alternative branch of a preprocessor conditional that is not visible
 * here.
 */
2025 static void gmc_mmx(uint8_t *dst, uint8_t *src,
2026 int stride, int h, int ox, int oy,
2027 int dxx, int dxy, int dyx, int dyy,
2028 int shift, int r, int width, int height)
2030 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2031 width, height, &ff_emulated_edge_mc_8);
/*
 * PREFETCH(name, op) defines name(mem, stride, h), which issues the
 * instruction op with *p as its memory operand, p starting at mem.
 * NOTE(review): stride and h presumably drive a per-row loop; it is not
 * visible here.
 */
2035 #define PREFETCH(name, op) \
2036 static void name(void *mem, int stride, int h) \
2038 const uint8_t *p = mem; \
2040 __asm__ volatile (#op" %0" :: "m"(*p)); \
/* prefetch_mmx2 uses the prefetcht0 instruction, prefetch_3dnow uses prefetch. */
2045 PREFETCH(prefetch_mmx2, prefetcht0)
2046 PREFETCH(prefetch_3dnow, prefetch)
2049 #include "h264_qpel_mmx.c"
/*
 * Prototypes of H.264 chroma motion-compensation routines; no bodies are
 * given here. All share the parameter list
 * (dst, src, stride, h, x, y).
 */
/* 8-pixel-wide, "_rnd" variants */
2051 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
2052 int stride, int h, int x, int y);
2053 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
2054 int stride, int h, int x, int y);
2055 void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
2056 int stride, int h, int x, int y);
/* 4-pixel-wide */
2058 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
2059 int stride, int h, int x, int y);
2060 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
2061 int stride, int h, int x, int y);
2062 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
2063 int stride, int h, int x, int y);
/* 2-pixel-wide */
2065 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
2066 int stride, int h, int x, int y);
2067 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
2068 int stride, int h, int x, int y);
/* SSSE3 put variants */
2070 void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2071 int stride, int h, int x, int y);
2072 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2073 int stride, int h, int x, int y);
/* SSSE3 avg variants */
2075 void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2076 int stride, int h, int x, int y);
2077 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2078 int stride, int h, int x, int y);
/*
 * CHROMA_MC(OP, NUM, DEPTH, OPT) declares the prototype
 * ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>(dst, src, stride, h, x, y).
 */
2080 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
2081 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
2082 (uint8_t *dst, uint8_t *src, \
2083 int stride, int h, int x, int y);
/* Prototypes with DEPTH = 10: mc2 and mc4 for mmxext, mc8 for sse2 and avx. */
2085 CHROMA_MC(put, 2, 10, mmxext)
2086 CHROMA_MC(avg, 2, 10, mmxext)
2087 CHROMA_MC(put, 4, 10, mmxext)
2088 CHROMA_MC(avg, 4, 10, mmxext)
2089 CHROMA_MC(put, 8, 10, sse2)
2090 CHROMA_MC(avg, 8, 10, sse2)
2091 CHROMA_MC(put, 8, 10, avx)
2092 CHROMA_MC(avg, 8, 10, avx)
/* CAVS qpel mc00, 8-pixel put: forwards to put_pixels8_mmx with 8 rows. */
2095 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2097 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel mc00, 8-pixel avg: forwards to avg_pixels8_mmx with 8 rows. */
2100 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2102 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel mc00, 16-pixel put: forwards to put_pixels16_mmx with 16 rows. */
2105 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2107 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS qpel mc00, 16-pixel avg: forwards to avg_pixels16_mmx with 16 rows. */
2110 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2112 avg_pixels16_mmx(dst, src, stride, 16);
/*
 * VC-1 mspel mc00 "put" entry point: forwards to put_pixels8_mmx() with 8 as
 * the final argument. The rnd parameter is not used by the visible statement.
 */
2116 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
2117 int stride, int rnd)
2119 put_pixels8_mmx(dst, src, stride, 8);
/*
 * VC-1 mspel mc00 "avg" entry point: forwards to avg_pixels8_mmx2() with 8 as
 * the final argument. The rnd parameter is not used by the visible statement.
 */
2122 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
2123 int stride, int rnd)
2125 avg_pixels8_mmx2(dst, src, stride, 8);
2128 /* only used in VP3/5/6 */
/*
 * Combine 8-byte rows of a and b with PAVGBP_MMX_NO_RND (defined elsewhere;
 * by its name a per-byte average without rounding - confirm) and store the
 * result to dst.
 *
 * Asm operands: %0 = h, %1 = a, %2 = b, %3 = dst, %4 = stride, %5 = 3*stride.
 * The visible body handles four rows (offsets 0, stride, 2*stride, 3*stride)
 * and advances a, b and dst by 4*stride.
 * NOTE(review): the asm opener, the loop label and the loop-control
 * instructions are not present in this extract.
 */
2129 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
/* rows 0 and 1: load from a and b, combine, store */
2135 "movq (%1), %%mm0 \n\t"
2136 "movq (%2), %%mm1 \n\t"
2137 "movq (%1,%4), %%mm2 \n\t"
2138 "movq (%2,%4), %%mm3 \n\t"
2139 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
2140 "movq %%mm4, (%3) \n\t"
2141 "movq %%mm5, (%3,%4) \n\t"
/* rows 2 and 3; a and b are advanced by 4*stride right after loading */
2143 "movq (%1,%4,2), %%mm0 \n\t"
2144 "movq (%2,%4,2), %%mm1 \n\t"
2145 "movq (%1,%5), %%mm2 \n\t"
2146 "movq (%2,%5), %%mm3 \n\t"
2147 "lea (%1,%4,4), %1 \n\t"
2148 "lea (%2,%4,4), %2 \n\t"
2149 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
2150 "movq %%mm4, (%3,%4,2) \n\t"
2151 "movq %%mm5, (%3,%5) \n\t"
/* dst is advanced only after its rows 2 and 3 have been written */
2152 "lea (%3,%4,4), %3 \n\t"
2155 :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
2156 :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
2158 // STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
/*
 * 16-byte-wide variant: runs the 8-byte routine once at dst/a/b and once more
 * with all three pointers advanced by 8 bytes, using the same stride and h.
 */
2160 static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
2162 put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
2163 put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
2166 #if CONFIG_DIRAC_DECODER
/*
 * Generate ff_<OPNAME>_dirac_pixels{8,16,32}_<EXT>() wrappers.
 * Each wrapper takes an array of five source pointers but only uses src[0],
 * forwarding to <OPNAME>_pixels8_<EXT>() or <OPNAME>_pixels16_<EXT>().
 * The 32-wide wrapper calls the 16-wide helper twice, the second time with
 * dst and src[0] both advanced by 16 bytes.
 */
2167 #define DIRAC_PIXOP(OPNAME, EXT)\
2168 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2170 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
2172 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2174 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
2176 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2178 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
2179 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
/* Instantiations: put/avg for mmx, avg only for mmx2. */
2182 DIRAC_PIXOP(put, mmx)
2183 DIRAC_PIXOP(avg, mmx)
2184 DIRAC_PIXOP(avg, mmx2)
/* Dirac 16-wide "put", SSE2: forwards src[0] to put_pixels16_sse2(); src[1..4] are unused. */
2186 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2188 put_pixels16_sse2(dst, src[0], stride, h);
/* Dirac 16-wide "avg", SSE2: forwards src[0] to avg_pixels16_sse2(); src[1..4] are unused. */
2190 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2192 avg_pixels16_sse2(dst, src[0], stride, h);
/*
 * Dirac 32-wide "put", SSE2: two put_pixels16_sse2() calls, the second with
 * dst and src[0] advanced by 16 bytes. Only src[0] is used.
 */
2194 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2196 put_pixels16_sse2(dst , src[0] , stride, h);
2197 put_pixels16_sse2(dst+16, src[0]+16, stride, h);
/*
 * Dirac 32-wide "avg", SSE2: two avg_pixels16_sse2() calls, the second with
 * dst and src[0] advanced by 16 bytes. Only src[0] is used.
 */
2199 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2201 avg_pixels16_sse2(dst , src[0] , stride, h);
2202 avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
2206 /* XXX: Those functions should be suppressed ASAP when all IDCTs are converted. */
/*
 * libmpeg2 MMX IDCT "put" wrapper.
 * NOTE(review): the rest of the parameter list and any statement preceding
 * the call below are not present in this extract. The visible statement
 * hands block, dest and line_size to ff_put_pixels_clamped_mmx().
 */
2209 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
2213 ff_put_pixels_clamped_mmx(block, dest, line_size);
/*
 * libmpeg2 MMX IDCT "add" wrapper.
 * NOTE(review): the rest of the parameter list and any statement preceding
 * the call below are not present in this extract. The visible statement
 * hands block, dest and line_size to ff_add_pixels_clamped_mmx().
 */
2216 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
2220 ff_add_pixels_clamped_mmx(block, dest, line_size);
/*
 * libmpeg2 MMX2 IDCT "put" wrapper: runs ff_mmxext_idct() on block in place,
 * then passes the block to ff_put_pixels_clamped_mmx() for dest/line_size.
 * NOTE(review): the rest of the parameter list is not present in this extract.
 */
2223 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
2226 ff_mmxext_idct(block);
2227 ff_put_pixels_clamped_mmx(block, dest, line_size);
/*
 * libmpeg2 MMX2 IDCT "add" wrapper: runs ff_mmxext_idct() on block in place,
 * then passes the block to ff_add_pixels_clamped_mmx() for dest/line_size.
 * NOTE(review): the rest of the parameter list is not present in this extract.
 */
2230 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
2233 ff_mmxext_idct(block);
2234 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX IDCT "put": transform block with ff_idct_xvid_mmx(), then hand it to ff_put_pixels_clamped_mmx(). */
2238 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2240 ff_idct_xvid_mmx(block);
2241 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX IDCT "add": transform block with ff_idct_xvid_mmx(), then hand it to ff_add_pixels_clamped_mmx(). */
2244 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2246 ff_idct_xvid_mmx(block);
2247 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX2 IDCT "put": transform block with ff_idct_xvid_mmx2(), then hand it to ff_put_pixels_clamped_mmx(). */
2250 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2252 ff_idct_xvid_mmx2(block);
2253 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX2 IDCT "add": transform block with ff_idct_xvid_mmx2(), then hand it to ff_add_pixels_clamped_mmx(). */
2256 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2258 ff_idct_xvid_mmx2(block);
2259 ff_add_pixels_clamped_mmx(block, dest, line_size);
/*
 * Vorbis inverse channel coupling, 3DNow! version.
 * Each loop pass reads two floats from mag[i] and two from ang[i] (m and a
 * below), and writes both pairs back in place. mm7 is zeroed once before the
 * loop and serves as the 0.0 comparand.
 *
 * Per lane, with a' = a whose sign bit is flipped when m >= 0.0:
 *   new ang = m + (a >= 0.0 ? a' : 0)
 *   new mag = m - (a >= 0.0 ? 0 : a')
 *
 * NOTE(review): the asm statement opener/tail and the braces are not present
 * in this extract.
 */
2262 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2265 __asm__ volatile ("pxor %%mm7, %%mm7":);
2266 for (i = 0; i < blocksize; i += 2) {
2268 "movq %0, %%mm0 \n\t"
2269 "movq %1, %%mm1 \n\t"
2270 "movq %%mm0, %%mm2 \n\t"
2271 "movq %%mm1, %%mm3 \n\t"
2272 "pfcmpge %%mm7, %%mm2 \n\t" // mask of lanes with m >= 0.0
2273 "pfcmpge %%mm7, %%mm3 \n\t" // mask of lanes with a >= 0.0
2274 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2275 "pxor %%mm2, %%mm1 \n\t" // a' = a with sign flipped where m >= 0.0
2276 "movq %%mm3, %%mm4 \n\t"
2277 "pand %%mm1, %%mm3 \n\t" // (a >= 0.0) & a'
2278 "pandn %%mm1, %%mm4 \n\t" // (a < 0.0) & a'
2279 "pfadd %%mm0, %%mm3 \n\t" // new ang = m + ((a >= 0.0) & a')
2280 "pfsub %%mm4, %%mm0 \n\t" // new mag = m - ((a < 0.0) & a')
2281 "movq %%mm3, %1 \n\t"
2282 "movq %%mm0, %0 \n\t"
2283 : "+m"(mag[i]), "+m"(ang[i])
/* leave the MMX/3DNow! register state after the loop */
2287 __asm__ volatile ("femms");
/*
 * Vorbis inverse channel coupling, SSE version.
 * Each loop pass reads four floats from mag[i] and four from ang[i] (m and a
 * below) with aligned moves, and writes both back in place. xmm5 is loaded
 * once with the sign-bit mask ff_pdw_80000000.
 *
 * Per lane, with a' = a whose sign bit is flipped when 0.0 <= m:
 *   new ang = m + (0.0 <= a ? a' : 0)
 *   new mag = m - (0.0 <= a ? 0 : a')
 *
 * NOTE(review): the asm statement openers/tails and the braces are not
 * present in this extract.
 */
2290 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2295 "movaps %0, %%xmm5 \n\t"
2296 :: "m"(ff_pdw_80000000[0])
2298 for (i = 0; i < blocksize; i += 4) {
2300 "movaps %0, %%xmm0 \n\t"
2301 "movaps %1, %%xmm1 \n\t"
2302 "xorps %%xmm2, %%xmm2 \n\t"
2303 "xorps %%xmm3, %%xmm3 \n\t"
2304 "cmpleps %%xmm0, %%xmm2 \n\t" // mask of lanes with 0.0 <= m
2305 "cmpleps %%xmm1, %%xmm3 \n\t" // mask of lanes with 0.0 <= a
2306 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2307 "xorps %%xmm2, %%xmm1 \n\t" // a' = a with sign flipped where 0.0 <= m
2308 "movaps %%xmm3, %%xmm4 \n\t"
2309 "andps %%xmm1, %%xmm3 \n\t" // (0.0 <= a) & a'
2310 "andnps %%xmm1, %%xmm4 \n\t" // (a < 0.0) & a'
2311 "addps %%xmm0, %%xmm3 \n\t" // new ang = m + ((0.0 <= a) & a')
2312 "subps %%xmm4, %%xmm0 \n\t" // new mag = m - ((a < 0.0) & a')
2313 "movaps %%xmm3, %1 \n\t"
2314 "movaps %%xmm0, %0 \n\t"
2315 : "+m"(mag[i]), "+m"(ang[i])
/*
 * Five-input downmix kernel used by ac3_downmix_sse().
 *
 * mono and stereo are macro arguments applied to optional instruction
 * strings, so one of the two output layouts is selected at expansion time
 * (presumably each either keeps or drops its argument - confirm at the
 * definitions of the macros passed in).
 *
 * Three coefficients are read from byte offsets 0, 8 and 24 of matrix and
 * broadcast into xmm5, xmm6 and xmm7. Five inputs spaced 0x400 bytes apart
 * are loaded; inputs 0 and 2 are scaled by xmm5, input 1 by xmm6, inputs 3
 * and 4 by xmm7. Then:
 *   stereo: out0 = in0 + in1 + in3,  out1 = in2 + in1 + in4
 *   mono:   out0 = in0 + in2 + in3 + in4 plus the terms above as emitted
 * The results overwrite input 0 (and, for stereo, the slot 0x400 bytes later).
 *
 * NOTE(review): the loop label, loop control and the output-operand line are
 * not present in this extract; operand numbering assumes a single output
 * operand precedes the two inputs listed here.
 */
2324 #define MIX5(mono, stereo) \
2325 __asm__ volatile ( \
2326 "movss 0(%2), %%xmm5 \n" \
2327 "movss 8(%2), %%xmm6 \n" \
2328 "movss 24(%2), %%xmm7 \n" \
2329 "shufps $0, %%xmm5, %%xmm5 \n" \
2330 "shufps $0, %%xmm6, %%xmm6 \n" \
2331 "shufps $0, %%xmm7, %%xmm7 \n" \
2333 "movaps (%0, %1), %%xmm0 \n" \
2334 "movaps 0x400(%0, %1), %%xmm1 \n" \
2335 "movaps 0x800(%0, %1), %%xmm2 \n" \
2336 "movaps 0xc00(%0, %1), %%xmm3 \n" \
2337 "movaps 0x1000(%0, %1), %%xmm4 \n" \
2338 "mulps %%xmm5, %%xmm0 \n" \
2339 "mulps %%xmm6, %%xmm1 \n" \
2340 "mulps %%xmm5, %%xmm2 \n" \
2341 "mulps %%xmm7, %%xmm3 \n" \
2342 "mulps %%xmm7, %%xmm4 \n" \
2343 stereo("addps %%xmm1, %%xmm0 \n") \
2344 "addps %%xmm1, %%xmm2 \n" \
2345 "addps %%xmm3, %%xmm0 \n" \
2346 "addps %%xmm4, %%xmm2 \n" \
2347 mono("addps %%xmm2, %%xmm0 \n") \
2348 "movaps %%xmm0, (%0, %1) \n" \
2349 stereo("movaps %%xmm2, 0x400(%0, %1) \n") \
2353 : "r"(samples[0] + len), "r"(matrix) \
2354 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2355 "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
/*
 * Generic downmix kernel used by ac3_downmix_sse() for channel layouts not
 * handled by MIX5.
 *
 * Asm operands: %0 = i, %1 = j, %2 = k, %3 = samples[0] + len,
 * %4 = matrix_simd + in_ch, %5 = -32 * (in_ch - 1).
 * The visible instructions load four floats at (%3, %0), scale them by xmm4
 * (and by xmm5 into a second accumulator when stereo is enabled), then
 * accumulate further inputs read through %1, which steps by 1024 bytes, each
 * scaled by coefficients read at (%4, %2) and 16(%4, %2). The accumulators
 * are stored back at (%3, %0) and, for stereo, 1024 bytes later.
 *
 * NOTE(review): loop labels, loop-control instructions and the code loading
 * xmm4/xmm5 and initialising k are not present in this extract.
 */
2359 #define MIX_MISC(stereo) \
2360 __asm__ volatile ( \
2362 "movaps (%3, %0), %%xmm0 \n" \
2363 stereo("movaps %%xmm0, %%xmm1 \n") \
2364 "mulps %%xmm4, %%xmm0 \n" \
2365 stereo("mulps %%xmm5, %%xmm1 \n") \
2366 "lea 1024(%3, %0), %1 \n" \
2369 "movaps (%1), %%xmm2 \n" \
2370 stereo("movaps %%xmm2, %%xmm3 \n") \
2371 "mulps (%4, %2), %%xmm2 \n" \
2372 stereo("mulps 16(%4, %2), %%xmm3 \n") \
2373 "addps %%xmm2, %%xmm0 \n" \
2374 stereo("addps %%xmm3, %%xmm1 \n") \
2375 "add $1024, %1 \n" \
2378 "movaps %%xmm0, (%3, %0) \n" \
2379 stereo("movaps %%xmm1, 1024(%3, %0) \n") \
2382 : "+&r"(i), "=&r"(j), "=&r"(k) \
2383 : "r"(samples[0] + len), "r"(matrix_simd + in_ch), \
2384 "g"((intptr_t) - 32 * (in_ch - 1)) \
/*
 * SSE downmix of in_ch input channels to out_ch output channels.
 *
 * samples: array of 256-float channel buffers, modified in place.
 * matrix:  per-input-channel pair of coefficients.
 * len:     number of samples; i starts at -len * sizeof(float) as a negative
 *          byte offset.
 *
 * Two 5-channel special cases are detected by inspecting the coefficient
 * bit patterns; all other layouts go through a generic path that first
 * expands every coefficient into a 4-lane vector in matrix_simd.
 *
 * NOTE(review): the local declarations, the statements inside the first two
 * branches and the tail of the function are not present in this extract.
 */
2388 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
2389 int out_ch, int in_ch, int len)
/* View the float coefficients as ints so they can be tested with |, ^ and ==.
 * NOTE(review): this reads float storage through an int lvalue - check
 * against the project's aliasing assumptions. */
2391 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2394 i = -len * sizeof(float);
/* 5 -> 2 case: entries [0][1], [2][0], [3][1], [4][0] must have all bits
 * clear, [1][0] must equal [1][1], and [0][0] must equal [2][1]. */
2395 if (in_ch == 5 && out_ch == 2 &&
2396 !(matrix_cmp[0][1] | matrix_cmp[2][0] |
2397 matrix_cmp[3][1] | matrix_cmp[4][0] |
2398 (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
2399 (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
/* 5 -> 1 case: [0][0] == [2][0] and [3][0] == [4][0]. */
2401 } else if (in_ch == 5 && out_ch == 1 &&
2402 matrix_cmp[0][0] == matrix_cmp[2][0] &&
2403 matrix_cmp[3][0] == matrix_cmp[4][0]) {
/* Generic path: one 4-lane vector per coefficient. */
2406 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2407 j = 2 * in_ch * sizeof(float);
/* Broadcast the two coefficients at matrix + %0 into matrix_simd + 4 * %0. */
2411 "movss (%2, %0), %%xmm4 \n"
2412 "movss 4(%2, %0), %%xmm5 \n"
2413 "shufps $0, %%xmm4, %%xmm4 \n"
2414 "shufps $0, %%xmm5, %%xmm5 \n"
2415 "movaps %%xmm4, (%1, %0, 4) \n"
2416 "movaps %%xmm5, 16(%1, %0, 4) \n"
2419 : "r"(matrix_simd), "r"(matrix)
/*
 * Windowed overlap of src0 and src1 into dst, 3DNow!ext version, two floats
 * per step from each end.
 *
 * i is a negative byte offset starting at -len * 4; j is a byte offset
 * starting at len * 4 - 8. Asm inputs: %2 = dst + len, %3 = src0 + len,
 * %4 = src1, %5 = win + len. Values addressed through j are loaded with
 * pswapd, i.e. with the two floats exchanged.
 *
 * NOTE(review): the final parameter, the asm opener, the loop control and
 * the output-operand line are not present in this extract; %0 and %1 are
 * presumably i and j.
 */
2431 static void vector_fmul_window_3dnow2(float *dst, const float *src0,
2432 const float *src1, const float *win,
2435 x86_reg i = -len * 4;
2436 x86_reg j = len * 4 - 8;
2439 "pswapd (%5, %1), %%mm1 \n"
2440 "movq (%5, %0), %%mm0 \n"
2441 "pswapd (%4, %1), %%mm5 \n"
2442 "movq (%3, %0), %%mm4 \n"
2443 "movq %%mm0, %%mm2 \n"
2444 "movq %%mm1, %%mm3 \n"
2445 "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
2446 "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
2447 "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
2448 "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
2449 "pfadd %%mm3, %%mm2 \n"
2450 "pfsub %%mm0, %%mm1 \n"
/* swap the sum back before storing it at the j side of dst */
2451 "pswapd %%mm2, %%mm2 \n"
2452 "movq %%mm1, (%2, %0) \n"
2453 "movq %%mm2, (%2, %1) \n"
2459 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
/*
 * Windowed overlap of src0 and src1 into dst, SSE version, four floats per
 * step from each end, using aligned loads and stores.
 *
 * i is a negative byte offset starting at -len * 4; j is a byte offset
 * starting at len * 4 - 16. Asm inputs: %2 = dst + len, %3 = src0 + len,
 * %4 = src1, %5 = win + len. Values addressed through j are reversed
 * lane-wise with shufps $0x1b after loading.
 *
 * NOTE(review): the asm opener, the loop control and the output-operand line
 * are not present in this extract; %0 and %1 are presumably i and j.
 */
2463 static void vector_fmul_window_sse(float *dst, const float *src0,
2464 const float *src1, const float *win, int len)
2466 x86_reg i = -len * 4;
2467 x86_reg j = len * 4 - 16;
2470 "movaps (%5, %1), %%xmm1 \n"
2471 "movaps (%5, %0), %%xmm0 \n"
2472 "movaps (%4, %1), %%xmm5 \n"
2473 "movaps (%3, %0), %%xmm4 \n"
2474 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2475 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2476 "movaps %%xmm0, %%xmm2 \n"
2477 "movaps %%xmm1, %%xmm3 \n"
2478 "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
2479 "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
2480 "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
2481 "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
2482 "addps %%xmm3, %%xmm2 \n"
2483 "subps %%xmm0, %%xmm1 \n"
/* reverse the sum back before storing it at the j side of dst */
2484 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2485 "movaps %%xmm1, (%2, %0) \n"
2486 "movaps %%xmm2, (%2, %1) \n"
2491 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2494 #endif /* HAVE_6REGS */
/*
 * Clamp floats from src into [min, max] and store them to dst, SSE version.
 *
 * min and max are broadcast into xmm4 and xmm5. Each pass of the visible
 * body handles 16 floats (four aligned 16-byte vectors): maxps against min,
 * then minps against max. i is a byte offset that starts at the last group
 * of 16 floats, (len - 16) * 4.
 *
 * NOTE(review): the loop label, the loop-control instructions and the
 * output-operand line are not present in this extract. The starting offset
 * and the 64-byte body suggest len is expected to be a multiple of 16 -
 * confirm against callers.
 */
2496 static void vector_clipf_sse(float *dst, const float *src,
2497 float min, float max, int len)
2499 x86_reg i = (len - 16) * 4;
2501 "movss %3, %%xmm4 \n\t"
2502 "movss %4, %%xmm5 \n\t"
2503 "shufps $0, %%xmm4, %%xmm4 \n\t"
2504 "shufps $0, %%xmm5, %%xmm5 \n\t"
2506 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
2507 "movaps 16(%2, %0), %%xmm1 \n\t"
2508 "movaps 32(%2, %0), %%xmm2 \n\t"
2509 "movaps 48(%2, %0), %%xmm3 \n\t"
/* lower bound */
2510 "maxps %%xmm4, %%xmm0 \n\t"
2511 "maxps %%xmm4, %%xmm1 \n\t"
2512 "maxps %%xmm4, %%xmm2 \n\t"
2513 "maxps %%xmm4, %%xmm3 \n\t"
/* upper bound */
2514 "minps %%xmm5, %%xmm0 \n\t"
2515 "minps %%xmm5, %%xmm1 \n\t"
2516 "minps %%xmm5, %%xmm2 \n\t"
2517 "minps %%xmm5, %%xmm3 \n\t"
2518 "movaps %%xmm0, (%1, %0) \n\t"
2519 "movaps %%xmm1, 16(%1, %0) \n\t"
2520 "movaps %%xmm2, 32(%1, %0) \n\t"
2521 "movaps %%xmm3, 48(%1, %0) \n\t"
2525 : "r"(dst), "r"(src), "m"(min), "m"(max)
2530 void ff_vp3_idct_mmx(int16_t *input_data);
2531 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2532 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2534 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
2535 const DCTELEM *block);
2537 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2538 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2540 void ff_vp3_idct_sse2(int16_t *input_data);
2541 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2542 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2544 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
2546 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2548 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
2550 int order, int mul);
2551 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2553 int order, int mul);
2554 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2556 int order, int mul);
2558 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2559 const int16_t *window, unsigned int len);
2560 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2561 const int16_t *window, unsigned int len);
2562 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2563 const int16_t *window, unsigned int len);
2564 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2565 const int16_t *window, unsigned int len);
2566 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2567 const int16_t *window, unsigned int len);
2568 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2569 const int16_t *window, unsigned int len);
2571 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2572 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2574 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
2575 const uint8_t *diff, int w,
2576 int *left, int *left_top);
2577 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2579 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2582 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2584 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2585 const float *src1, int len);
2586 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2587 const float *src1, int len);
2589 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2590 const float *src2, int len);
2591 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2592 const float *src2, int len);
2594 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2595 int32_t min, int32_t max, unsigned int len);
2596 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2597 int32_t min, int32_t max, unsigned int len);
2598 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2599 int32_t min, int32_t max, unsigned int len);
2600 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2601 int32_t min, int32_t max, unsigned int len);
2603 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2604 const float *src1, int len);
2605 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2606 const float *src1, int len);
/*
 * Fill all 16 entries of c-><PFX>_pixels_tab[IDX] with the functions named
 * <PREFIX><PFX><SIZE>_mcXY_<CPU>. Entry index is X + 4 * Y for X, Y in 0..3
 * (index 1 is mc10, index 4 is mc01, and so on). Expects a variable named c
 * to be in scope at the expansion site.
 * NOTE(review): the statement wrapper lines around the assignments are not
 * present in this extract.
 */
2608 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2610 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2611 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2612 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2613 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2614 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2615 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2616 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2617 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2618 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2619 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2620 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2621 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2622 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2623 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2624 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2625 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/*
 * Fill the four entries of c-><PFX>_pixels_tab[IDX] with
 * <PFX>_pixels<SIZE>_<CPU> and its _x2_, _y2_ and _xy2_ variants, in that
 * order. Expects a variable named c to be in scope at the expansion site.
 * NOTE(review): the statement wrapper lines around the assignments are not
 * present in this extract.
 */
2628 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2630 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2631 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2632 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2633 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/*
 * Install the mc<x><y> H.264 qpel functions for one CPU flavour into slot
 * x + y * 4 of both the put and avg tables, for table index 0 (qpel16) and
 * table index 1 (qpel8). Expects a variable named c to be in scope.
 * NOTE(review): the statement wrapper lines around the assignments are not
 * present in this extract.
 */
2636 #define H264_QPEL_FUNCS(x, y, CPU) \
2638 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2639 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2640 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2641 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
/*
 * Same table slots as H264_QPEL_FUNCS, but installs the ff_-prefixed
 * functions whose names carry the _10_ infix
 * (ff_{put,avg}_h264_qpel{16,8}_mc<x><y>_10_<CPU>).
 * NOTE(review): the statement wrapper lines around the assignments are not
 * present in this extract.
 */
2644 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2646 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2647 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2648 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2649 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
/*
 * Install the baseline MMX implementations into the DSPContext c.
 * Entries that operate on 8-bit pixel data are only set when
 * avctx->bits_per_raw_sample is at most 8. mm_flags is not read by the
 * statements visible here.
 * NOTE(review): braces, #endif lines and at least one guarded statement are
 * not present in this extract, so the exact nesting cannot be read from here.
 */
2652 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2654 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2656 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2657 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2658 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2660 if (!high_bit_depth) {
2661 c->clear_block = clear_block_mmx;
2662 c->clear_blocks = clear_blocks_mmx;
2663 c->draw_edges = draw_edges_mmx;
/* half-pel tables: index 0 holds the 16-wide, index 1 the 8-wide functions */
2665 SET_HPEL_FUNCS(put, 0, 16, mmx);
2666 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2667 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2668 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2669 SET_HPEL_FUNCS(put, 1, 8, mmx);
2670 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2671 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2672 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2675 #if ARCH_X86_32 || !HAVE_YASM
2678 #if ARCH_X86_32 && HAVE_YASM
2679 if (!high_bit_depth)
2680 c->emulated_edge_mc = emulated_edge_mc_mmx;
2683 c->add_bytes = add_bytes_mmx;
2685 c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
2686 c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
2688 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2689 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2690 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2694 if (!high_bit_depth && CONFIG_H264CHROMA) {
2695 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
2696 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2699 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/*
 * Install the MMX2 (MMXEXT) implementations into the DSPContext c.
 * Selection depends on avctx->bits_per_raw_sample (8-bit versus 10-bit
 * tables), on CODEC_FLAG_BITEXACT, on the codec id (VP3/Theora) and on the
 * CONFIG_ and HAVE_ build switches tested below.
 * NOTE(review): the last parameter, the braces and several preprocessor
 * lines are not present in this extract, so the exact nesting of the guarded
 * groups cannot be read from here.
 */
2704 static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
2707 const int bit_depth = avctx->bits_per_raw_sample;
2708 const int high_bit_depth = bit_depth > 8;
2710 c->prefetch = prefetch_mmx2;
2712 if (!high_bit_depth) {
2713 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2714 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2716 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2717 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2718 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2720 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2721 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2723 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2724 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2725 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
/* the following entries are only installed when bit-exact output is not requested */
2728 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2729 if (!high_bit_depth) {
2730 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2731 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2732 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2733 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2735 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2736 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2739 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2740 c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
2741 c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
2744 if (CONFIG_VP3_DECODER && HAVE_YASM)
2745 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
/* VP3/Theora: use the _exact_ variants for the 8-wide no-rounding x2/y2 entries */
2747 if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
2748 avctx->codec_id == CODEC_ID_THEORA)) {
2749 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2750 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2753 if (CONFIG_H264QPEL) {
2754 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2755 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2756 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2757 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2758 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2759 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
/* H.264 qpel: 8-bit tables from this file, 10-bit tables from ff_*_10_mmxext */
2761 if (!high_bit_depth) {
2762 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2763 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2764 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2765 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2766 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2767 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2768 } else if (bit_depth == 10) {
2771 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2772 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2773 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2774 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2776 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2777 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2781 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2782 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2783 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2784 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2788 if (!high_bit_depth && CONFIG_H264CHROMA) {
2789 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
2790 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
2791 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
2792 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
2794 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2795 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2796 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2797 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2798 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2801 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2803 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2804 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
/* bit-exact mode selects the _ba variant of the window function */
2806 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2807 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2809 c->apply_window_int16 = ff_apply_window_int16_mmxext;
/*
 * Install the 3DNow! implementations into the DSPContext c.
 * The layout mirrors the MMX2 initialiser: 8-bit pixel tables, non-bit-exact
 * extras, VP3/Theora exact overrides, qpel tables and H.264 chroma, followed
 * by the Vorbis coupling and HuffYUV median-prediction hooks.
 * NOTE(review): the last parameter, the braces and several preprocessor
 * lines are not present in this extract, so the exact nesting of the guarded
 * groups cannot be read from here.
 */
2814 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2817 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2819 c->prefetch = prefetch_3dnow;
2821 if (!high_bit_depth) {
2822 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2823 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2825 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2826 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2827 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2829 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2830 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2832 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2833 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2834 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
/* the following entries are only installed when bit-exact output is not requested */
2836 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2837 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2838 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2839 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2840 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2842 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2843 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
/* VP3/Theora: use the _exact_ variants for the 8-wide no-rounding x2/y2 entries */
2847 if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
2848 avctx->codec_id == CODEC_ID_THEORA)) {
2849 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2850 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2853 if (CONFIG_H264QPEL) {
2854 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2855 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2856 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2857 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2858 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2859 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2861 if (!high_bit_depth) {
2862 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2863 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2864 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2865 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2866 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2867 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2870 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2871 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2872 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2873 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2877 if (!high_bit_depth && CONFIG_H264CHROMA) {
2878 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
2879 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2883 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2886 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
/*
 * Install the 3DNow!ext implementation of vector_fmul_window into c.
 * NOTE(review): the last parameter and any preprocessor guard around the
 * assignment are not present in this extract.
 */
2890 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
2894 c->vector_fmul_window = vector_fmul_window_3dnow2;
/*
 * Install the SSE implementations into the DSPContext c: block clearing
 * (8-bit only, and not when XvMC acceleration above level 1 is in use),
 * Vorbis coupling, AC-3 downmix, float vector helpers and edge emulation.
 * mm_flags is not read by the statements visible here.
 * NOTE(review): braces and preprocessor guards are not present in this
 * extract, so which assignments are conditional on build switches cannot be
 * read from here.
 */
2898 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2900 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2902 if (!high_bit_depth) {
2903 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2904 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2905 c->clear_block = clear_block_sse;
2906 c->clear_blocks = clear_blocks_sse;
2910 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2911 c->ac3_downmix = ac3_downmix_sse;
2913 c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
2914 c->vector_fmul_add = ff_vector_fmul_add_sse;
2918 c->vector_fmul_window = vector_fmul_window_sse;
2921 c->vector_clipf = vector_clipf_sse;
2924 c->scalarproduct_float = ff_scalarproduct_float_sse;
2925 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2927 if (!high_bit_depth)
2928 c->emulated_edge_mc = emulated_edge_mc_sse;
/*
 * Install the SSE2 implementations into the DSPContext c.
 * Some entries are skipped when mm_flags carries AV_CPU_FLAG_SSE2SLOW;
 * H.264 qpel/chroma tables are chosen by bit depth (8-bit versus 10-bit);
 * AV_CPU_FLAG_ATOM selects a different vector_clip_int32 variant.
 * NOTE(review): the last parameter, the braces and preprocessor guards are
 * not present in this extract, so the exact nesting cannot be read from here.
 */
2933 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2936 const int bit_depth = avctx->bits_per_raw_sample;
2937 const int high_bit_depth = bit_depth > 8;
2939 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2940 // these functions are slower than mmx on AMD, but faster on Intel
2941 if (!high_bit_depth) {
2942 c->put_pixels_tab[0][0] = put_pixels16_sse2;
/* the no-rounding slot [0][0] receives the same function as the plain put slot */
2943 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2944 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2945 if (CONFIG_H264QPEL)
2946 H264_QPEL_FUNCS(0, 0, sse2);
2950 if (!high_bit_depth && CONFIG_H264QPEL) {
2951 H264_QPEL_FUNCS(0, 1, sse2);
2952 H264_QPEL_FUNCS(0, 2, sse2);
2953 H264_QPEL_FUNCS(0, 3, sse2);
2954 H264_QPEL_FUNCS(1, 1, sse2);
2955 H264_QPEL_FUNCS(1, 2, sse2);
2956 H264_QPEL_FUNCS(1, 3, sse2);
2957 H264_QPEL_FUNCS(2, 1, sse2);
2958 H264_QPEL_FUNCS(2, 2, sse2);
2959 H264_QPEL_FUNCS(2, 3, sse2);
2960 H264_QPEL_FUNCS(3, 1, sse2);
2961 H264_QPEL_FUNCS(3, 2, sse2);
2962 H264_QPEL_FUNCS(3, 3, sse2);
2966 if (bit_depth == 10) {
2967 if (CONFIG_H264QPEL) {
2968 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2969 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2970 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2971 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
/* the mc10/mc20/mc30 slots set just above are then replaced by the _sse2_cache64 variants */
2972 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2973 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2974 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2976 if (CONFIG_H264CHROMA) {
2977 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2978 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2982 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2983 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2984 if (mm_flags & AV_CPU_FLAG_ATOM) {
2985 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2987 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/* bit-exact mode selects the _ba variant; otherwise the SSE2 window function
 * is only installed when the CPU is not flagged SSE2SLOW */
2989 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2990 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2991 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2992 c->apply_window_int16 = ff_apply_window_int16_sse2;
2994 c->bswap_buf = ff_bswap32_buf_sse2;
/*
 * Install the SSSE3 implementations into the DSPContext c: H.264 qpel and
 * chroma tables (by bit depth), HuffYUV left prediction, the int16 window
 * function, scalarproduct_and_madd_int16 and bswap_buf.
 * NOTE(review): the last parameter, the braces and preprocessor guards are
 * not present in this extract, so the exact nesting cannot be read from here.
 */
2998 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
3002 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
3003 const int bit_depth = avctx->bits_per_raw_sample;
3005 if (!high_bit_depth && CONFIG_H264QPEL) {
3006 H264_QPEL_FUNCS(1, 0, ssse3);
3007 H264_QPEL_FUNCS(1, 1, ssse3);
3008 H264_QPEL_FUNCS(1, 2, ssse3);
3009 H264_QPEL_FUNCS(1, 3, ssse3);
3010 H264_QPEL_FUNCS(2, 0, ssse3);
3011 H264_QPEL_FUNCS(2, 1, ssse3);
3012 H264_QPEL_FUNCS(2, 2, ssse3);
3013 H264_QPEL_FUNCS(2, 3, ssse3);
3014 H264_QPEL_FUNCS(3, 0, ssse3);
3015 H264_QPEL_FUNCS(3, 1, ssse3);
3016 H264_QPEL_FUNCS(3, 2, ssse3);
3017 H264_QPEL_FUNCS(3, 3, ssse3);
3020 else if (bit_depth == 10 && CONFIG_H264QPEL) {
3021 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
3022 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
3023 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
3025 if (!high_bit_depth && CONFIG_H264CHROMA) {
3026 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
3027 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
3028 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
3029 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
/* the SSE4 flag is used here only as a CPU-generation hint, see the comment below */
3031 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
3032 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
3033 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
3035 if (mm_flags & AV_CPU_FLAG_ATOM)
3036 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
3038 c->apply_window_int16 = ff_apply_window_int16_ssse3;
/* installed only when neither the SSE4.2 nor the 3DNow! flag is set */
3039 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
3040 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
3041 c->bswap_buf = ff_bswap32_buf_ssse3;
/*
 * Install the SSE4 implementation of vector_clip_int32 into c.
 * NOTE(review): the last parameter and any preprocessor guard around the
 * assignment are not present in this extract.
 */
3046 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
3050 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
/*
 * Install the AVX implementations into the DSPContext c. The visible body is
 * compiled only when both HAVE_AVX and HAVE_YASM are set. For 10-bit content
 * it installs the plain _10_sse2 mc10/mc20/mc30 qpel functions and the AVX
 * mc8 chroma functions; it also sets three float vector helpers.
 * mm_flags is not read by the statements visible here.
 * NOTE(review): the braces and the closing #endif are not present in this
 * extract.
 */
3054 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
3056 #if HAVE_AVX && HAVE_YASM
3057 const int bit_depth = avctx->bits_per_raw_sample;
3059 if (bit_depth == 10) {
3060 // AVX implies !cache64.
3061 // TODO: Port cache(32|64) detection from x264.
3062 if (CONFIG_H264QPEL) {
3063 H264_QPEL_FUNCS_10(1, 0, sse2);
3064 H264_QPEL_FUNCS_10(2, 0, sse2);
3065 H264_QPEL_FUNCS_10(3, 0, sse2);
3068 if (CONFIG_H264CHROMA) {
3069 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
3070 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
3073 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
3074 c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
3075 c->vector_fmul_add = ff_vector_fmul_add_avx;
/**
 * Fill a DSPContext with x86 SIMD implementations according to the CPU
 * flags reported by av_get_cpu_flags().
 *
 * When AV_CPU_FLAG_MMX is set, an IDCT implementation is selected from
 * avctx->idct_algo (only if avctx->lowres == 0 and
 * avctx->bits_per_raw_sample <= 8). The per-instruction-set init helpers
 * are then called in the order MMX, MMX2, 3DNow!, 3DNowExt, SSE, SSE2,
 * SSSE3, SSE4, AVX, each one gated on its CPU flag, and finally
 * ff_dsputilenc_init_mmx() is called if CONFIG_ENCODERS.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context; idct_algo, lowres and bits_per_raw_sample
 *              are read, and it is also the av_log() context
 */
3079 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
3081 int mm_flags = av_get_cpu_flags();
/* Print the detected CPU flags at AV_LOG_INFO level.
 * NOTE(review): confirm whether this block is compiled in; a preprocessor
 * guard around it, if any, is not visible here. */
3084 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3085 if (mm_flags & AV_CPU_FLAG_MMX)
3086 av_log(avctx, AV_LOG_INFO, " mmx");
3087 if (mm_flags & AV_CPU_FLAG_MMX2)
3088 av_log(avctx, AV_LOG_INFO, " mmx2");
3089 if (mm_flags & AV_CPU_FLAG_3DNOW)
3090 av_log(avctx, AV_LOG_INFO, " 3dnow");
3091 if (mm_flags & AV_CPU_FLAG_SSE)
3092 av_log(avctx, AV_LOG_INFO, " sse");
3093 if (mm_flags & AV_CPU_FLAG_SSE2)
3094 av_log(avctx, AV_LOG_INFO, " sse2");
3095 av_log(avctx, AV_LOG_INFO, "\n");
/* Everything MMX-based requires at least the MMX flag. */
3098 if (mm_flags & AV_CPU_FLAG_MMX) {
3099 const int idct_algo = avctx->idct_algo;
/* SIMD IDCTs are only chosen for full-resolution decoding of samples
 * that are at most 8 bits deep. */
3101 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
/* FF_IDCT_AUTO picks the simple MMX IDCT, same as an explicit request. */
3102 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
3103 c->idct_put = ff_simple_idct_put_mmx;
3104 c->idct_add = ff_simple_idct_add_mmx;
3105 c->idct = ff_simple_idct_mmx;
3106 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
/* libmpeg2 IDCT: MMX2 variants when the CPU has MMX2. */
3108 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
3109 if (mm_flags & AV_CPU_FLAG_MMX2) {
3110 c->idct_put = ff_libmpeg2mmx2_idct_put;
3111 c->idct_add = ff_libmpeg2mmx2_idct_add;
3112 c->idct = ff_mmxext_idct;
/* Plain-MMX variants (presumably the else branch of the MMX2 test -- confirm). */
3114 c->idct_put = ff_libmpeg2mmx_idct_put;
3115 c->idct_add = ff_libmpeg2mmx_idct_add;
3116 c->idct = ff_mmx_idct;
3118 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
/* VP3 IDCT: considered only if a VP3/VP5/VP6 decoder is configured and
 * HAVE_YASM is set. */
3120 } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
3121 CONFIG_VP6_DECODER) &&
3122 idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
/* The SSE2 and MMX versions use different coefficient permutations. */
3123 if (mm_flags & AV_CPU_FLAG_SSE2) {
3124 c->idct_put = ff_vp3_idct_put_sse2;
3125 c->idct_add = ff_vp3_idct_add_sse2;
3126 c->idct = ff_vp3_idct_sse2;
3127 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
/* MMX variant (presumably the else branch of the SSE2 test -- confirm). */
3129 c->idct_put = ff_vp3_idct_put_mmx;
3130 c->idct_add = ff_vp3_idct_add_mmx;
3131 c->idct = ff_vp3_idct_mmx;
3132 c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
/* CAVS: only the permutation type is set here; no function pointers are
 * assigned in this branch. */
3134 } else if (idct_algo == FF_IDCT_CAVS) {
3135 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
/* Xvid IDCT: SSE2 preferred, then MMX2, then plain MMX. Only the SSE2
 * variant sets a permutation type in the visible code. */
3136 } else if (idct_algo == FF_IDCT_XVIDMMX) {
3137 if (mm_flags & AV_CPU_FLAG_SSE2) {
3138 c->idct_put = ff_idct_xvid_sse2_put;
3139 c->idct_add = ff_idct_xvid_sse2_add;
3140 c->idct = ff_idct_xvid_sse2;
3141 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
3142 } else if (mm_flags & AV_CPU_FLAG_MMX2) {
3143 c->idct_put = ff_idct_xvid_mmx2_put;
3144 c->idct_add = ff_idct_xvid_mmx2_add;
3145 c->idct = ff_idct_xvid_mmx2;
/* Plain-MMX variant (presumably the final else branch -- confirm). */
3147 c->idct_put = ff_idct_xvid_mmx_put;
3148 c->idct_add = ff_idct_xvid_mmx_add;
3149 c->idct = ff_idct_xvid_mmx;
/* Per-instruction-set initialisation, each call gated on its CPU flag.
 * Some calls are additionally gated on a build-time HAVE_* macro. */
3154 dsputil_init_mmx(c, avctx, mm_flags);
3157 if (mm_flags & AV_CPU_FLAG_MMX2)
3158 dsputil_init_mmx2(c, avctx, mm_flags);
3160 if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
3161 dsputil_init_3dnow(c, avctx, mm_flags);
3163 if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
3164 dsputil_init_3dnow2(c, avctx, mm_flags);
3166 if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
3167 dsputil_init_sse(c, avctx, mm_flags);
3169 if (mm_flags & AV_CPU_FLAG_SSE2)
3170 dsputil_init_sse2(c, avctx, mm_flags);
3172 if (mm_flags & AV_CPU_FLAG_SSSE3)
3173 dsputil_init_ssse3(c, avctx, mm_flags);
/* NOTE(review): the build-time guard here is HAVE_SSE, not an
 * SSE4-specific macro -- confirm this is intentional. */
3175 if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
3176 dsputil_init_sse4(c, avctx, mm_flags);
3178 if (mm_flags & AV_CPU_FLAG_AVX)
3179 dsputil_init_avx(c, avctx, mm_flags);
/* Encoder-side x86 initialisation, only when encoders are configured. */
3181 if (CONFIG_ENCODERS)
3182 ff_dsputilenc_init_mmx(c, avctx);