/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

/* pixel operations */
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1  ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9  ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};

DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
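/*
 * Illustrative usage sketch (not part of the original file): these constants
 * are brought into a register either through an "m" operand, or by symbol
 * name via MANGLE() as done further down in this file, e.g.
 *
 *     __asm__ volatile ("movq %0, %%mm6" :: "m" (ff_pw_20));
 *     __asm__ volatile ("movq "MANGLE(ff_pw_20)", %%mm6");
 *
 * The DECLARE_ALIGNED(8/16, ...) wrappers guarantee the alignment that the
 * movq/movdqa loads of these values rely on.
 */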
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// When building a shared library (PIC), it is better to synthesize these
// constants in registers than to reference them in memory:
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"
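/*
 * Scalar sketch of the identities the two macros above implement; masking
 * the xor with 0xfe before the shift is what the pand+psrlq pair does to
 * keep bits from leaking between byte lanes of the shifted quadword:
 */
#if 0
static inline int avg_no_rnd(int a, int b) /* = (a + b) >> 1 */
{
    return (a & b) + (((a ^ b) & 0xFE) >> 1);
}
static inline int avg_rnd(int a, int b)    /* = (a + b + 1) >> 1 */
{
    return (a | b) - (((a ^ b) & 0xFE) >> 1);
}
#endif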
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

/***********************************/

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)

    /* read the pixels */
        "movq %3, %%mm0 \n\t"
        "movq 8%3, %%mm1 \n\t"
        "movq 16%3, %%mm2 \n\t"
        "movq 24%3, %%mm3 \n\t"
        "movq 32%3, %%mm4 \n\t"
        "movq 40%3, %%mm5 \n\t"
        "movq 48%3, %%mm6 \n\t"
        "movq 56%3, %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, hence the "r" constraint below.
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t"\
    "movq 16+"#off"(%2), %%mm2 \n\t"\
    "movq 32+"#off"(%2), %%mm3 \n\t"\
    "movq 48+"#off"(%2), %%mm4 \n\t"\
    "packsswb 8+"#off"(%2), %%mm1 \n\t"\
    "packsswb 24+"#off"(%2), %%mm2 \n\t"\
    "packsswb 40+"#off"(%2), %%mm3 \n\t"\
    "packsswb 56+"#off"(%2), %%mm4 \n\t"\
    "paddb %%mm0, %%mm1 \n\t"\
    "paddb %%mm0, %%mm2 \n\t"\
    "paddb %%mm0, %%mm3 \n\t"\
    "paddb %%mm0, %%mm4 \n\t"\
    "movq %%mm1, (%0) \n\t"\
    "movq %%mm2, (%0, %3) \n\t"\
    "movq %%mm3, (%0, %3, 2) \n\t"\
    "movq %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        :"+&r" (pixels), "=&r" (line_skip3)
        :"r" (block), "r"(line_skip)
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)

    /* read the pixels */
        "movq (%2), %%mm0 \n\t"
        "movq 8(%2), %%mm1 \n\t"
        "movq 16(%2), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "movq %0, %%mm4 \n\t"
        "movq %1, %%mm6 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm4, %%mm0 \n\t"
        "paddsw %%mm5, %%mm1 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm6, %%mm2 \n\t"
        "paddsw %%mm5, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, %0 \n\t"
        "movq %%mm2, %1 \n\t"
        :"+m"(*pix), "+m"(*(pix+line_size))

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
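/*
 * Scalar reference for the copy kernels in this group (a sketch; the asm
 * loops above process two rows per iteration and use movd/movq/movdqu
 * according to the block width):
 */
#if 0
static void put_pixels_c_ref(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h, int w)
{
    int i;
    for (i = 0; i < h; i++) {
        memcpy(block, pixels, w); /* w = 4, 8 or 16 */
        block  += line_size;
        pixels += line_size;
    }
}
#endif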
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)

static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "lea (%1,%3,4), %1 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "lea (%2,%3,4), %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "lea (%1,%3,4), %1 \n\t"
        "pavgb (%2), %%xmm0 \n\t"
        "pavgb (%2,%3), %%xmm1 \n\t"
        "pavgb (%2,%3,2), %%xmm2 \n\t"
        "pavgb (%2,%4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "lea (%2,%3,4), %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)

#define CLEAR_BLOCKS(name,n) \
static void name(DCTELEM *blocks)\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov %1, %%"REG_a" \n\t"\
        "movq %%mm7, (%0, %%"REG_a") \n\t"\
        "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
        "add $32, %%"REG_a" \n\t"\
        : : "r" (((uint8_t *)blocks)+128*n),

CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(DCTELEM *block)
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"

static void clear_blocks_sse(DCTELEM *blocks)
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        : : "r" (((uint8_t *)blocks)+128*6),

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        : "r"(src), "r"(dst), "r"((x86_reg)w-15)
        dst[i+0] += src[i+0];

static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
        "movq (%2, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb (%3, %0), %%mm0 \n\t"
        "paddb 8(%3, %0), %%mm1 \n\t"
        "movq %%mm0, (%1, %0) \n\t"
        "movq %%mm1, 8(%1, %0) \n\t"
        : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
        dst[i] = src1[i] + src2[i];
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
    x86_reg w2 = -w;
    x86_reg x;
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
    int t;

        "movzbl (%3,%4), %2 \n"
        "add (%6,%4), %b0 \n"
        "mov %b0, (%5,%4) \n"
        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
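/*
 * Scalar reference for the cmov kernel above (a sketch of the HuffYUV
 * median predictor, using mid_pred() from libavcodec/mathops.h):
 */
#if 0
static void add_hfyu_median_prediction_c_ref(uint8_t *dst, const uint8_t *top,
                                             const uint8_t *diff, int w,
                                             int *left, int *left_top)
{
    int i, l = *left, tl = *left_top;
    for (i = 0; i < w; i++) {
        int t = top[i];
        /* median of left, top and left+top-topleft, plus the residual */
        l      = (mid_pred(l, t, l + t - tl) + diff[i]) & 0xFF;
        dst[i] = l;
        tl     = t;
    }
    *left     = l;
    *left_top = tl;
}
#endif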
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
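/*
 * Scalar view of what H263_LOOP_FILTER computes for each of the eight
 * columns (a sketch close to the C reference; p0..p3 are the four pixels
 * across the block edge that operands %0..%3 point at):
 */
#if 0
{
    int d = (p0 - p3 + 4 * (p2 - p1)) / 8, d1, d2, ad1;

    if      (d < -2 * strength) d1 = 0;
    else if (d <     -strength) d1 = -2 * strength - d;
    else if (d <      strength) d1 = d;
    else if (d <  2 * strength) d1 = 2 * strength - d;
    else                        d1 = 0;

    p1 = av_clip_uint8(p1 + d1);   /* inner pair moved towards each other */
    p2 = av_clip_uint8(p2 - d1);

    ad1 = FFABS(d1) >> 1;
    d2  = av_clip((p0 - p3) / 4, -ad1, ad1);
    p0 -= d2;                      /* outer pair adjusted by the clipped d2 */
    p3 += d2;
}
#endif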
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];

        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %0 \n\t"
        "movq %%mm6, %3 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp)[4];
    uint8_t *btemp= (uint8_t*)temp;

    transpose4x4(btemp , src , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);

        H263_LOOP_FILTER // 5 3 4 6

        : "g" (2*strength), "m"(ff_pb_FC)

        "movq %%mm5, %%mm1 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm4 \n\t"
        "punpckhbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm6, %%mm0 \n\t"
        "movq %%mm5, %%mm3 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "punpcklwd %%mm4, %%mm5 \n\t"
        "punpcklwd %%mm0, %%mm1 \n\t"
        "punpckhwd %%mm4, %%mm3 \n\t"
        "punpckhwd %%mm0, %%mm6 \n\t"
        "movd %%mm5, (%0) \n\t"
        "punpckhdq %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%0,%2) \n\t"
        "movd %%mm3, (%0,%2,2) \n\t"
        "punpckhdq %%mm3, %%mm3 \n\t"
        "movd %%mm3, (%0,%3) \n\t"
        "movd %%mm1, (%1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%1,%2) \n\t"
        "movd %%mm6, (%1,%2,2) \n\t"
        "punpckhdq %%mm6, %%mm6 \n\t"
        "movd %%mm6, (%1,%3) \n\t"
          "r" (src + 4*stride),
          "r" ((x86_reg) stride ),
          "r" ((x86_reg)(3*stride))
/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
    uint8_t *ptr, *last_line;

    last_line = buf + (height - 1) * wrap;

        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "punpckldq %%mm0, %%mm0 \n\t"
        "movq %%mm0, -8(%0) \n\t"
        "movq -8(%0, %2), %%mm1 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movq %%mm1, (%0, %2) \n\t"
        : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)

        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "punpckldq %%mm0, %%mm0 \n\t"
        "movq %%mm0, -8(%0) \n\t"
        "movq %%mm0, -16(%0) \n\t"
        "movq -8(%0, %2), %%mm1 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movq %%mm1, (%0, %2) \n\t"
        "movq %%mm1, 8(%0, %2) \n\t"
        : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)

    /* top and bottom (and hopefully also the corners) */
    if (sides&EDGE_TOP) {
        for(i = 0; i < h; i += 4) {
            ptr= buf - (i + 1) * wrap - w;
            "movq (%1, %0), %%mm0 \n\t"
            "movq %%mm0, (%0) \n\t"
            "movq %%mm0, (%0, %2) \n\t"
            "movq %%mm0, (%0, %2, 2) \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)

    if (sides&EDGE_BOTTOM) {
        for(i = 0; i < w; i += 4) {
            ptr= last_line + (i + 1) * wrap - w;
            "movq (%1, %0), %%mm0 \n\t"
            "movq %%mm0, (%0) \n\t"
            "movq %%mm0, (%0, %2) \n\t"
            "movq %%mm0, (%0, %2, 2) \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
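/*
 * Scalar equivalent of the side replication above (a sketch): every border
 * pixel is a copy of the nearest pixel inside the image.
 */
#if 0
for (i = 0; i < height; i++) {
    memset(buf + i * wrap - w,     buf[i * wrap],             w); /* left  */
    memset(buf + i * wrap + width, buf[i * wrap + width - 1], w); /* right */
}
#endif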
#define PAETH(cpu, abs3)\
static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
        "pxor %%mm7, %%mm7 \n"\
        "movd (%1,%0), %%mm0 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq %%mm1, %%mm2 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "movq %%mm2, %%mm3 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq %%mm2, %%mm4 \n"\
        "psubw %%mm1, %%mm3 \n"\
        "psubw %%mm0, %%mm4 \n"\
        "movq %%mm3, %%mm5 \n"\
        "paddw %%mm4, %%mm5 \n"\
        "movq %%mm4, %%mm6 \n"\
        "pminsw %%mm5, %%mm6 \n"\
        "pcmpgtw %%mm6, %%mm3 \n"\
        "pcmpgtw %%mm5, %%mm4 \n"\
        "movq %%mm4, %%mm6 \n"\
        "pand %%mm3, %%mm4 \n"\
        "pandn %%mm3, %%mm6 \n"\
        "pandn %%mm0, %%mm3 \n"\
        "movd (%3,%0), %%mm0 \n"\
        "pand %%mm1, %%mm6 \n"\
        "pand %%mm4, %%mm2 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "paddw %%mm6, %%mm0 \n"\
        "paddw %%mm2, %%mm3 \n"\
        "paddw %%mm3, %%mm0 \n"\
        "pand %%mm5, %%mm0 \n"\
        "movq %%mm0, %%mm3 \n"\
        "packuswb %%mm3, %%mm3 \n"\
        "movd %%mm3, (%1,%0) \n"\
        :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
}
#define ABS3_MMX2\
    "psubw %%mm5, %%mm7 \n"\
    "pmaxsw %%mm7, %%mm5 \n"\
    "pxor %%mm6, %%mm6 \n"\
    "pxor %%mm7, %%mm7 \n"\
    "psubw %%mm3, %%mm6 \n"\
    "psubw %%mm4, %%mm7 \n"\
    "pmaxsw %%mm6, %%mm3 \n"\
    "pmaxsw %%mm7, %%mm4 \n"\
    "pxor %%mm7, %%mm7 \n"

#define ABS3_SSSE3\
    "pabsw %%mm3, %%mm3 \n"\
    "pabsw %%mm4, %%mm4 \n"\
    "pabsw %%mm5, %%mm5 \n"

PAETH(mmx2, ABS3_MMX2)
PAETH(ssse3, ABS3_SSSE3)
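/*
 * Scalar reference for the Paeth kernels above (a sketch of the standard
 * PNG Paeth selector; the abs3 macros compute the three distances):
 */
#if 0
static int paeth_predict(int left, int top, int topleft)
{
    int p  = left + top - topleft;
    int pa = FFABS(p - left);
    int pb = FFABS(p - top);
    int pc = FFABS(p - topleft);
    if (pa <= pb && pa <= pc) return left;
    if (pb <= pc)             return top;
    return topleft;
}
#endif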
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
    "movq "#in7", " #m3 " \n\t" /* d */\
    "movq "#in0", %%mm5 \n\t" /* D */\
    "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
    "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5 \n\t" /* C */\
    "movq "#in2", %%mm6 \n\t" /* B */\
    "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
    "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
    "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)
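/*
 * QPEL_V_LOW evaluates one output sample of the MPEG-4 quarter-pel FIR
 * filter; in scalar terms (see the tap comments above):
 *
 *     out = clip_uint8((20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5)
 *
 * where x1..x4 are the symmetric tap sums around the output position.
 */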
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5 \n\t" /* b */\
        "paddw %%mm2, %%mm6 \n\t" /* c */\
        "paddw %%mm5, %%mm5 \n\t" /* 2b */\
        "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0 \n\t" /* a */\
        "paddw %%mm1, %%mm5 \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
        "paddw %6, %%mm6 \n\t"\
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0 \n\t"\
        "movq %%mm0, %5 \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2 \n\t" /* b */\
        "paddw %%mm5, %%mm3 \n\t" /* c */\
        "paddw %%mm2, %%mm2 \n\t" /* 2b */\
        "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1 \n\t" /* a */\
        "paddw %%mm6, %%mm4 \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3 \n\t"\
        "movq %5, %%mm1 \n\t"\
        "packuswb %%mm3, %%mm1 \n\t"\
        OP_MMX2(%%mm1, (%1), %%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
\
        "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5 \n\t" /* b */\
        "paddw %%mm4, %%mm0 \n\t" /* c */\
        "paddw %%mm5, %%mm5 \n\t" /* 2b */\
        "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2 \n\t" /* d */\
        "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6 \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0 \n\t"\
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0 \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
\
        "paddw %%mm5, %%mm3 \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6 \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4 \n\t" /* c */\
        "paddw %%mm2, %%mm5 \n\t" /* d */\
        "paddw %%mm6, %%mm6 \n\t" /* 2b */\
        "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4 \n\t"\
        "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm0 \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
\
        : "+a"(src), "+c"(dst), "+D"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
    temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
    temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
    temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
    temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
    temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
    temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
    temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
    temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
    temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
    temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
    temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
    temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
    temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
    temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
    temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "paddw %2, %%mm0 \n\t"\
        "paddw %2, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP_3DNOW(%%mm0, (%1), %%mm1, q)\
        "movq 16(%0), %%mm0 \n\t"\
        "movq 24(%0), %%mm1 \n\t"\
        "paddw %2, %%mm0 \n\t"\
        "paddw %2, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
        :: "r"(temp), "r"(dst), "m"(ROUNDER)\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5 \n\t" /* b */\
        "paddw %%mm2, %%mm6 \n\t" /* c */\
        "paddw %%mm5, %%mm5 \n\t" /* 2b */\
        "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0 \n\t" /* a */\
        "paddw %%mm1, %%mm5 \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
        "paddw %5, %%mm6 \n\t"\
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0 \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movd 5(%0), %%mm5 \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1 \n\t" /* a */\
        "paddw %%mm6, %%mm2 \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3 \n\t" /* c */\
        "paddw %%mm5, %%mm4 \n\t" /* d */\
        "paddw %%mm2, %%mm2 \n\t" /* 2b */\
        "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
        "paddw %5, %%mm1 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3 \n\t"\
        "packuswb %%mm3, %%mm0 \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
\
        : "+a"(src), "+c"(dst), "+d"(h)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
    temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
    temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
    temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
    temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
    temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
    temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
    temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "paddw %2, %%mm0 \n\t"\
        "paddw %2, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP_3DNOW(%%mm0, (%1), %%mm1, q)\
        :: "r"(temp), "r"(dst), "m"(ROUNDER)\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq (%0), %%mm1 \n\t"\
        "movq 8(%0), %%mm2 \n\t"\
        "movq 8(%0), %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "movq %%mm0, (%1) \n\t"\
        "movq %%mm1, 17*8(%1) \n\t"\
        "movq %%mm2, 2*17*8(%1) \n\t"\
        "movq %%mm3, 3*17*8(%1) \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((x86_reg)srcStride)\
\
    /*FIXME reorder for speed */\
        /*"pxor %%mm7, %%mm7 \n\t"*/\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "movq 16(%0), %%mm2 \n\t"\
        "movq 24(%0), %%mm3 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        "add $136, %0 \n\t"\
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq (%0), %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "movq %%mm0, (%1) \n\t"\
        "movq %%mm1, 9*8(%1) \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((x86_reg)srcStride)\
\
    /*FIXME reorder for speed */\
        /*"pxor %%mm7, %%mm7 \n\t"*/\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "movq 16(%0), %%mm2 \n\t"\
        "movq 24(%0), %%mm3 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"

QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
}\
\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
}\
\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_, 8, mmx2)
QPEL_2TAP(avg_, 8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_, 8, 3dnow)
QPEL_2TAP(avg_, 8, 3dnow)
#if HAVE_YASM
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
                                 x86_reg linesize, x86_reg start_y,
                                 x86_reg end_y, x86_reg block_h,
                                 x86_reg start_x, x86_reg end_x,
                                 x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;

static av_always_inline
void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
                      int block_w, int block_h,
                      int src_x, int src_y, int w, int h,
                      emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add=0;

    if(src_y>= h){
        src_y_add = h-1-src_y;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src_y_add = 1-block_h-src_y;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add+start_y)*linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
}
}

#if ARCH_X86_32
static av_noinline
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline
void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
1733 #endif /* HAVE_YASM */
1735 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1736 int linesize, int block_w, int block_h,
1737 int src_x, int src_y, int w, int h);
1739 static av_always_inline
1740 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1741 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1742 emulated_edge_mc_func *emu_edge_fn)
{
    const int w = 8;
    const int ix = ox>>(16+shift);
1746 const int iy = oy>>(16+shift);
1747 const int oxs = ox>>4;
1748 const int oys = oy>>4;
1749 const int dxxs = dxx>>4;
1750 const int dxys = dxy>>4;
1751 const int dyxs = dyx>>4;
1752 const int dyys = dyy>>4;
1753 const uint16_t r4[4] = {r,r,r,r};
1754 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1755 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1756 const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;
1760 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1761 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1762 const int dxh = dxy*(h-1);
1763 const int dyw = dyx*(w-1);
1764 if( // non-constant fullpel offset (3% of blocks)
1765 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1766 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1767 // uses more than 16 bits of subpel mv (only at huge resolution)
1768 || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }
1775 src += ix + iy*stride;
1776 if( (unsigned)ix >= width-w ||
1777 (unsigned)iy >= height-h )
    {
        emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }
1784 "movd %0, %%mm6 \n\t"
1785 "pxor %%mm7, %%mm7 \n\t"
1786 "punpcklwd %%mm6, %%mm6 \n\t"
1787 "punpcklwd %%mm6, %%mm6 \n\t"
1791 for(x=0; x<w; x+=4){
1792 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1793 oxs - dxys + dxxs*(x+1),
1794 oxs - dxys + dxxs*(x+2),
1795 oxs - dxys + dxxs*(x+3) };
1796 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1797 oys - dyys + dyxs*(x+1),
1798 oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            __asm__ volatile(
1803 "movq %0, %%mm4 \n\t"
1804 "movq %1, %%mm5 \n\t"
1805 "paddw %2, %%mm4 \n\t"
1806 "paddw %3, %%mm5 \n\t"
1807 "movq %%mm4, %0 \n\t"
1808 "movq %%mm5, %1 \n\t"
1809 "psrlw $12, %%mm4 \n\t"
1810 "psrlw $12, %%mm5 \n\t"
1811 : "+m"(*dx4), "+m"(*dy4)
1812 : "m"(*dxy4), "m"(*dyy4)
1816 "movq %%mm6, %%mm2 \n\t"
1817 "movq %%mm6, %%mm1 \n\t"
1818 "psubw %%mm4, %%mm2 \n\t"
1819 "psubw %%mm5, %%mm1 \n\t"
1820 "movq %%mm2, %%mm0 \n\t"
1821 "movq %%mm4, %%mm3 \n\t"
1822 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1823 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1824 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1825 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1827 "movd %4, %%mm5 \n\t"
1828 "movd %3, %%mm4 \n\t"
1829 "punpcklbw %%mm7, %%mm5 \n\t"
1830 "punpcklbw %%mm7, %%mm4 \n\t"
1831 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1832 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1834 "movd %2, %%mm5 \n\t"
1835 "movd %1, %%mm4 \n\t"
1836 "punpcklbw %%mm7, %%mm5 \n\t"
1837 "punpcklbw %%mm7, %%mm4 \n\t"
1838 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1839 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1840 "paddw %5, %%mm1 \n\t"
1841 "paddw %%mm3, %%mm2 \n\t"
1842 "paddw %%mm1, %%mm0 \n\t"
1843 "paddw %%mm2, %%mm0 \n\t"
1845 "psrlw %6, %%mm0 \n\t"
1846 "packuswb %%mm0, %%mm0 \n\t"
1847 "movd %%mm0, %0 \n\t"
1849 : "=m"(dst[x+y*stride])
1850 : "m"(src[0]), "m"(src[1]),
1851 "m"(src[stride]), "m"(src[stride+1]),
1852 "m"(*r4), "m"(shift2)
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}

static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        __asm__ volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
1892 PREFETCH(prefetch_mmx2, prefetcht0)
1893 PREFETCH(prefetch_3dnow, prefetch)
1896 #include "h264_qpel_mmx.c"
void ff_put_h264_chroma_mc8_mmx_rnd   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
                                      (uint8_t *dst, uint8_t *src,\
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
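/* Each CHROMA_MC() line above declares one yasm-implemented prototype; for
 * example CHROMA_MC(put, 8, 10, sse2) declares
 * ff_put_h264_chroma_mc8_10_sse2(), which dsputil_init_mmx() installs into
 * the chroma function tables below. */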
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_mmx2(dst, src, stride, 8);
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld $31, %%mm2 \n\t" // keep only the sign bit
            "pxor %%mm2, %%mm1 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"
            "pandn %%mm1, %%mm4 \n\t"
            "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    __asm__ volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile(
        "movaps %0, %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        __asm__ volatile(
            "movaps %0, %%xmm0 \n\t"
            "movaps %1, %%xmm1 \n\t"
            "xorps %%xmm2, %%xmm2 \n\t"
            "xorps %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps %%xmm2, %%xmm1 \n\t"
            "movaps %%xmm3, %%xmm4 \n\t"
            "andps %%xmm1, %%xmm3 \n\t"
            "andnps %%xmm1, %%xmm4 \n\t"
            "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps %%xmm3, %1 \n\t"
            "movaps %%xmm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}
#define MIX5(mono,stereo)\
    __asm__ volatile(\
        "movss 0(%2), %%xmm5 \n"\
        "movss 8(%2), %%xmm6 \n"\
        "movss 24(%2), %%xmm7 \n"\
        "shufps $0, %%xmm5, %%xmm5 \n"\
        "shufps $0, %%xmm6, %%xmm6 \n"\
        "shufps $0, %%xmm7, %%xmm7 \n"\
        "1: \n"\
        "movaps (%0,%1), %%xmm0 \n"\
        "movaps 0x400(%0,%1), %%xmm1 \n"\
        "movaps 0x800(%0,%1), %%xmm2 \n"\
        "movaps 0xc00(%0,%1), %%xmm3 \n"\
        "movaps 0x1000(%0,%1), %%xmm4 \n"\
        "mulps %%xmm5, %%xmm0 \n"\
        "mulps %%xmm6, %%xmm1 \n"\
        "mulps %%xmm5, %%xmm2 \n"\
        "mulps %%xmm7, %%xmm3 \n"\
        "mulps %%xmm7, %%xmm4 \n"\
 stereo("addps %%xmm1, %%xmm0 \n")\
        "addps %%xmm1, %%xmm2 \n"\
        "addps %%xmm3, %%xmm0 \n"\
        "addps %%xmm4, %%xmm2 \n"\
   mono("addps %%xmm2, %%xmm0 \n")\
        "movaps %%xmm0, (%0,%1) \n"\
 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i)\
        :"r"(samples[0]+len), "r"(matrix)\
        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
         "memory"\
    );
#define MIX_MISC(stereo)\
    __asm__ volatile(\
        "1: \n"\
        "movaps (%3,%0), %%xmm0 \n"\
 stereo("movaps %%xmm0, %%xmm1 \n")\
        "mulps %%xmm4, %%xmm0 \n"\
 stereo("mulps %%xmm5, %%xmm1 \n")\
        "lea 1024(%3,%0), %1 \n"\
        "mov %5, %2 \n"\
        "2: \n"\
        "movaps (%1), %%xmm2 \n"\
 stereo("movaps %%xmm2, %%xmm3 \n")\
        "mulps (%4,%2), %%xmm2 \n"\
 stereo("mulps 16(%4,%2), %%xmm3 \n")\
        "addps %%xmm2, %%xmm0 \n"\
 stereo("addps %%xmm3, %%xmm1 \n")\
        "add $1024, %1 \n"\
        "add $32, %2 \n"\
        "jl 2b \n"\
        "movaps %%xmm0, (%3,%0) \n"\
 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i), "=&r"(j), "=&r"(k)\
        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
        :"memory"\
    );
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i,j,k;

    i = -len*sizeof(float);
    /* check for the symmetric 5->2 and 5->1 coefficient layouts that MIX5 handles */
    if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
        MIX5(IF0,IF1);
    } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
        MIX5(IF1,IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2*in_ch*sizeof(float);
        __asm__ volatile(
            "1: \n"
            "sub $8, %0 \n"
            "movss (%2,%0), %%xmm4 \n"
            "movss 4(%2,%0), %%xmm5 \n"
            "shufps $0, %%xmm4, %%xmm4 \n"
            "shufps $0, %%xmm5, %%xmm5 \n"
            "movaps %%xmm4, (%1,%0,4) \n"
            "movaps %%xmm5, 16(%1,%0,4) \n"
            "jg 1b \n"
            :"+&r"(j)
            :"r"(matrix_simd), "r"(matrix)
            :"memory"
        );
        if(out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}
static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2,%0), %%mm0 \n\t"
        "movq 8(%2,%0), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq %%mm0, (%1,%0) \n\t"
        "movq %%mm1, 8(%1,%0) \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        "femms \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps (%2,%0), %%xmm0 \n\t"
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1)
        :"memory"
    );
}
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = len*4-16;
    __asm__ volatile(
        "1: \n\t"
        "pswapd 8(%1), %%mm0 \n\t"
        "pswapd (%1), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq %%mm0, (%2,%0) \n\t"
        "movq %%mm1, 8(%2,%0) \n\t"
        "add $16, %1 \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
    __asm__ volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = len*4-32;
    __asm__ volatile(
        "1: \n\t"
        "movaps 16(%1), %%xmm0 \n\t"
        "movaps (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%2,%0) \n\t"
        "movaps %%xmm1, 16(%2,%0) \n\t"
        "add $32, %1 \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
}
static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
                                  const float *src2, int len){
    x86_reg i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2,%0), %%mm0 \n\t"
        "movq 8(%2,%0), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "pfadd (%4,%0), %%mm0 \n\t"
        "pfadd 8(%4,%0), %%mm1 \n\t"
        "movq %%mm0, (%1,%0) \n\t"
        "movq %%mm1, 8(%1,%0) \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
        :"memory"
    );
    __asm__ volatile("femms");
}
static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                                const float *src2, int len){
    x86_reg i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps (%2,%0), %%xmm0 \n\t"
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "addps (%4,%0), %%xmm0 \n\t"
        "addps 16(%4,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
        :"memory"
    );
}
#if HAVE_6REGS
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, int len){
    x86_reg i = -len*4;
    x86_reg j = len*4-8;
    __asm__ volatile(
        "1: \n"
        "pswapd (%5,%1), %%mm1 \n"
        "movq (%5,%0), %%mm0 \n"
        "pswapd (%4,%1), %%mm5 \n"
        "movq (%3,%0), %%mm4 \n"
        "movq %%mm0, %%mm2 \n"
        "movq %%mm1, %%mm3 \n"
        "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
        "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
        "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
        "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
        "pfadd %%mm3, %%mm2 \n"
        "pfsub %%mm0, %%mm1 \n"
        "pswapd %%mm2, %%mm2 \n"
        "movq %%mm1, (%2,%0) \n"
        "movq %%mm2, (%2,%1) \n"
        "sub $8, %1 \n"
        "add $8, %0 \n"
        "jl 1b \n"
        "femms \n"
        :"+r"(i), "+r"(j)
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
    );
}
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, int len){
    x86_reg i = -len*4;
    x86_reg j = len*4-16;
    __asm__ volatile(
        "1: \n"
        "movaps (%5,%1), %%xmm1 \n"
        "movaps (%5,%0), %%xmm0 \n"
        "movaps (%4,%1), %%xmm5 \n"
        "movaps (%3,%0), %%xmm4 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
        "movaps %%xmm0, %%xmm2 \n"
        "movaps %%xmm1, %%xmm3 \n"
        "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
        "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
        "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
        "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
        "addps %%xmm3, %%xmm2 \n"
        "subps %%xmm0, %%xmm1 \n"
        "shufps $0x1b, %%xmm2, %%xmm2 \n"
        "movaps %%xmm1, (%2,%0) \n"
        "movaps %%xmm2, (%2,%1) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(i), "+r"(j)
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
    );
}
2344 #endif /* HAVE_6REGS */
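/* The two window functions above compute the MDCT overlap-add window,
 * walking the buffer from both ends at once; in scalar form (cf. the C
 * version of vector_fmul_window), roughly:
 *
 *   dst += len; win += len; src0 += len;
 *   for (i = -len, j = len-1; i < 0; i++, j--) {
 *       float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
 *       dst[i] = s0*wj - s1*wi;
 *       dst[j] = s0*wi + s1*wj;
 *   }
 */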
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
                             int len)
{
    x86_reg i = (len-16)*4;
    __asm__ volatile(
        "movss %3, %%xmm4 \n"
        "movss %4, %%xmm5 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "shufps $0, %%xmm5, %%xmm5 \n"
        "1: \n\t"
        "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "movaps 32(%2,%0), %%xmm2 \n\t"
        "movaps 48(%2,%0), %%xmm3 \n\t"
        "maxps %%xmm4, %%xmm0 \n\t"
        "maxps %%xmm4, %%xmm1 \n\t"
        "maxps %%xmm4, %%xmm2 \n\t"
        "maxps %%xmm4, %%xmm3 \n\t"
        "minps %%xmm5, %%xmm0 \n\t"
        "minps %%xmm5, %%xmm1 \n\t"
        "minps %%xmm5, %%xmm2 \n\t"
        "minps %%xmm5, %%xmm3 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "movaps %%xmm2, 32(%1,%0) \n\t"
        "movaps %%xmm3, 48(%1,%0) \n\t"
        "sub $64, %0 \n\t"
        "jge 1b \n\t"
        :"+&r"(i)
        :"r"(dst), "r"(src), "m"(min), "m"(max)
        :"memory"
    );
}
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);

void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);

void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);

extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);
2432 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();
2435 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2436 const int bit_depth = avctx->bits_per_raw_sample;
    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }
2446 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2447 if (mm_flags & AV_CPU_FLAG_MMX)
2448 av_log(avctx, AV_LOG_INFO, " mmx");
2449 if (mm_flags & AV_CPU_FLAG_MMX2)
2450 av_log(avctx, AV_LOG_INFO, " mmx2");
2451 if (mm_flags & AV_CPU_FLAG_3DNOW)
2452 av_log(avctx, AV_LOG_INFO, " 3dnow");
2453 if (mm_flags & AV_CPU_FLAG_SSE)
2454 av_log(avctx, AV_LOG_INFO, " sse");
2455 if (mm_flags & AV_CPU_FLAG_SSE2)
2456 av_log(avctx, AV_LOG_INFO, " sse2");
2457 av_log(avctx, AV_LOG_INFO, "\n");
2460 if (mm_flags & AV_CPU_FLAG_MMX) {
2461 const int idct_algo= avctx->idct_algo;
2463 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2464 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2465 c->idct_put= ff_simple_idct_put_mmx;
2466 c->idct_add= ff_simple_idct_add_mmx;
2467 c->idct = ff_simple_idct_mmx;
2468 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & AV_CPU_FLAG_MMX2){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2482 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2483 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
                if(mm_flags & AV_CPU_FLAG_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
2495 }else if(idct_algo==FF_IDCT_CAVS){
2496 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2497 }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & AV_CPU_FLAG_SSE2){
                    c->idct_put= ff_idct_xvid_sse2_put;
                    c->idct_add= ff_idct_xvid_sse2_add;
                    c->idct    = ff_idct_xvid_sse2;
                    c->idct_permutation_type= FF_SSE2_IDCT_PERM;
                }else if(mm_flags & AV_CPU_FLAG_MMX2){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;
                    c->idct_add= ff_idct_xvid_mmx_add;
                    c->idct    = ff_idct_xvid_mmx;
                }
            }
        }
2515 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2516 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2517 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2518 if (!high_bit_depth) {
2519 c->clear_block = clear_block_mmx;
2520 c->clear_blocks = clear_blocks_mmx;
2521 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2522 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2523 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
                c->clear_block  = clear_block_sse;
                c->clear_blocks = clear_blocks_sse;
            }
        }
2529 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2530 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2531 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2532 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2533 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
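/* For example, SET_HPEL_FUNCS(put, 0, 16, mmx) expands to assignments of
 * put_pixels16_mmx, put_pixels16_x2_mmx, put_pixels16_y2_mmx and
 * put_pixels16_xy2_mmx to c->put_pixels_tab[0][0..3], i.e. the plain copy
 * plus the three half-pel interpolations. */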
2535 if (!high_bit_depth) {
2536 SET_HPEL_FUNCS(put, 0, 16, mmx);
2537 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2538 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2539 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2540 SET_HPEL_FUNCS(put, 1, 8, mmx);
2541 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2542 SET_HPEL_FUNCS(avg, 1, 8, mmx);
            SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
        }
#if ARCH_X86_32 || !HAVE_YASM
        c->gmc= gmc_mmx;
#endif
#if ARCH_X86_32 && HAVE_YASM
        if (!high_bit_depth)
            c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif
2554 c->add_bytes= add_bytes_mmx;
2555 c->add_bytes_l2= add_bytes_l2_mmx;
2557 if (!high_bit_depth)
2558 c->draw_edges = draw_edges_mmx;
2560 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2561 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
            c->h263_h_loop_filter= h263_h_loop_filter_mmx;
        }
2566 if (!high_bit_depth && CONFIG_H264CHROMA) {
2567 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
            c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
        }
2571 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2574 if (mm_flags & AV_CPU_FLAG_MMX2) {
2575 c->prefetch = prefetch_mmx2;
2577 if (!high_bit_depth) {
2578 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2579 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2581 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2582 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2583 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2585 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2586 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2588 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2589 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
                c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
            }
2593 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2594 if (!high_bit_depth) {
2595 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2596 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2597 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2598 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
                }

                if (CONFIG_VP3_DECODER && HAVE_YASM) {
                    c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
                    c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
                }
            }

            if (CONFIG_VP3_DECODER && HAVE_YASM) {
                c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
            }
2612 if (CONFIG_VP3_DECODER
2613 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
            }
2618 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2619 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2620 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2621 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2622 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2623 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2624 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2625 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2626 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2627 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2628 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2629 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2630 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2631 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2632 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2633 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2634 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
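/* For example, SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ) fills
 * c->put_qpel_pixels_tab[0][0..15] with put_qpel16_mc00_mmx2 through
 * put_qpel16_mc33_mmx2, indexed as x + y*4 over the 4x4 grid of
 * quarter-pel positions. */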
2636 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2637 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2638 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2639 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2640 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2641 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2643 if (!high_bit_depth) {
2644 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2645 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2646 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2647 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2648 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
            }
            else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
#endif
                SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
#endif
            }
2664 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2665 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2666 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2667 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2670 if (!high_bit_depth && CONFIG_H264CHROMA) {
2671 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2672 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2673 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
            }
2676 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2677 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
2678 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
2679 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
            }
            c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

#if HAVE_7REGS
            if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
                c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

            c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2691 } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
2692 c->prefetch = prefetch_3dnow;
2694 if (!high_bit_depth) {
2695 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2696 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2698 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2699 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2700 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2702 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2703 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2705 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2706 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2707 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2709 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2710 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2711 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2712 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2713 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2714 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
                }
            }
2719 if (CONFIG_VP3_DECODER
2720 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2721 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
            }
2725 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2726 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2727 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2728 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2729 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2730 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2732 if (!high_bit_depth) {
2733 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2734 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2735 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2736 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2737 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2738 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2741 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2742 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2743 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2744 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2747 if (!high_bit_depth && CONFIG_H264CHROMA) {
2748 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
            }
        }
2756 #define H264_QPEL_FUNCS(x, y, CPU)\
2757 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2758 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2759 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2760 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
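/* Unlike SET_QPEL_FUNCS, H264_QPEL_FUNCS(x, y, CPU) patches a single subpel
 * position (table index x + y*4) in both the 16x16 and 8x8 tables, so the
 * SSE2/SSSE3 code below can replace only the positions where it is actually
 * faster. */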
2761 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2762 // these functions are slower than mmx on AMD, but faster on Intel
2763 if (!high_bit_depth) {
2764 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2765 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2766 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
                H264_QPEL_FUNCS(0, 0, sse2);
            }
        }
2770 if(mm_flags & AV_CPU_FLAG_SSE2){
2771 if (!high_bit_depth) {
2772 H264_QPEL_FUNCS(0, 1, sse2);
2773 H264_QPEL_FUNCS(0, 2, sse2);
2774 H264_QPEL_FUNCS(0, 3, sse2);
2775 H264_QPEL_FUNCS(1, 1, sse2);
2776 H264_QPEL_FUNCS(1, 2, sse2);
2777 H264_QPEL_FUNCS(1, 3, sse2);
2778 H264_QPEL_FUNCS(2, 1, sse2);
2779 H264_QPEL_FUNCS(2, 2, sse2);
2780 H264_QPEL_FUNCS(2, 3, sse2);
2781 H264_QPEL_FUNCS(3, 1, sse2);
2782 H264_QPEL_FUNCS(3, 2, sse2);
                H264_QPEL_FUNCS(3, 3, sse2);
            }
2786 #define H264_QPEL_FUNCS_10(x, y, CPU)\
2787 c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
2788 c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
2789 c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
2790 c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
2791 if (bit_depth == 10) {
2792 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2793 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2794 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2795 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2796 H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
2797 H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
2798 H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
2800 if (CONFIG_H264CHROMA) {
2801 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
                    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
                }
            }
        }
2808 if(mm_flags & AV_CPU_FLAG_SSSE3){
2809 if (!high_bit_depth) {
2810 H264_QPEL_FUNCS(1, 0, ssse3);
2811 H264_QPEL_FUNCS(1, 1, ssse3);
2812 H264_QPEL_FUNCS(1, 2, ssse3);
2813 H264_QPEL_FUNCS(1, 3, ssse3);
2814 H264_QPEL_FUNCS(2, 0, ssse3);
2815 H264_QPEL_FUNCS(2, 1, ssse3);
2816 H264_QPEL_FUNCS(2, 2, ssse3);
2817 H264_QPEL_FUNCS(2, 3, ssse3);
2818 H264_QPEL_FUNCS(3, 0, ssse3);
2819 H264_QPEL_FUNCS(3, 1, ssse3);
2820 H264_QPEL_FUNCS(3, 2, ssse3);
                H264_QPEL_FUNCS(3, 3, ssse3);
            }
            else if (bit_depth == 10) {
2825 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
2826 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
                H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
            }
            c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
#if HAVE_YASM
2832 if (!high_bit_depth && CONFIG_H264CHROMA) {
2833 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2834 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2835 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
            }
            c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
            if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
                c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
#endif
        }
        if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
            c->vector_fmul = vector_fmul_3dnow;
        }
        if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
        }
        if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
            } else {
                c->apply_window_int16 = ff_apply_window_int16_mmxext;
            }
#endif
        }
2866 if(mm_flags & AV_CPU_FLAG_SSE){
2867 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2868 c->ac3_downmix = ac3_downmix_sse;
2869 c->vector_fmul = vector_fmul_sse;
2870 c->vector_fmul_reverse = vector_fmul_reverse_sse;
            c->vector_fmul_add = vector_fmul_add_sse;
#if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_sse;
#endif
            c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
            c->scalarproduct_float = ff_scalarproduct_float_sse;
            c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

            if (!high_bit_depth)
                c->emulated_edge_mc = emulated_edge_mc_sse;
#endif
        }
2885 if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
2886 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
        if(mm_flags & AV_CPU_FLAG_SSE2){
#if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
            if (mm_flags & AV_CPU_FLAG_ATOM) {
                c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
            } else {
                c->vector_clip_int32 = ff_vector_clip_int32_sse2;
            }
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
            } else {
                if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
                    c->apply_window_int16 = ff_apply_window_int16_sse2;
                }
            }
#endif
        }
        if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_YASM
            if (mm_flags & AV_CPU_FLAG_ATOM) {
                c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
            } else {
                c->apply_window_int16 = ff_apply_window_int16_ssse3;
            }
            if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
                c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
            }
#endif
        }
        if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
#if HAVE_YASM
            c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
        }
2924 #if HAVE_AVX && HAVE_YASM
2925 if (mm_flags & AV_CPU_FLAG_AVX) {
2926 if (bit_depth == 10) {
2927 //AVX implies !cache64.
2928 //TODO: Port cache(32|64) detection from x264.
2929 H264_QPEL_FUNCS_10(1, 0, sse2)
2930 H264_QPEL_FUNCS_10(2, 0, sse2)
2931 H264_QPEL_FUNCS_10(3, 0, sse2)
2933 if (CONFIG_H264CHROMA) {
2934 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
                c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
            }
        }
        c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    }
#endif
2943 if (CONFIG_ENCODERS)
        dsputilenc_init_mmx(c, avctx);
}