2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/* pixel operations */
/* Packed arithmetic constants used by the inline asm below.
 * ff_pw_* are packed 16-bit words, ff_pb_* packed bytes, ff_pd_* packed
 * doubles.  The 8-byte-aligned uint64_t forms are MMX-sized; the 16-byte
 * xmm_reg pairs replicate the same pattern for SSE/SSE2. */
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Emit ".p2align 3" so the following loop label is 8-byte aligned. */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Zero an MMX register by xoring it with itself. */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

/* Build 0xFE in every byte of regd: all-ones (pcmpeqd) doubled (paddb).
 * NOTE(review): the asm-statement opener of this macro is not visible in
 * this chunk of the file. */
#define MOVQ_BFE(regd) \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

/* Memory-load forms of the 0x01-byte / 0x0002-word constants. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))

// for shared library it's better to use this way for accessing constants
// NOTE(review): the preprocessor conditional selecting between the two
// MOVQ_BONE/MOVQ_WTWO definitions is not visible in this chunk.

/* Synthesize 0x01 in every byte of regd without a memory load. */
#define MOVQ_BONE(regd) \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* Synthesize 0x0002 in every word of regd without a memory load. */
#define MOVQ_WTWO(regd) \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte average without rounding: (a & b) + (((a ^ b) & 0xfe) >> 1). */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* Byte average with rounding: (a | b) - (((a ^ b) & 0xfe) >> 1). */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
/* Paired variants of PAVGB_MMX_NO_RND / PAVGB_MMX: average two register
 * pairs (rega/regb into regr, regc/regd into regp) in interleaved order,
 * using %%mm6 as the 0xFE mask. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/* Rounding variant: (a | b) - (((a ^ b) & 0xfe) >> 1), two pairs at once. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
/***********************************/
/* MMX no rounding */
/* Instantiate the rounding-template with "no rounding" primitives.
 * NOTE(review): the matching #undef lines normally found between template
 * inclusions are not visible in this chunk. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

/***********************************/
/* Standard (rounding) MMX instantiation of the same template. */
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

/***********************************/
/* 3DNow! instantiation: pavgusb averages bytes in one instruction. */
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg_template.c"

/***********************************/
/* MMX2 instantiation. */
#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg_template.c"

/* For full-pel copies "no rounding" is identical to the rounded version,
 * and the plain MMX put routines are reused in the MMX2/3DNow! tables. */
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
/***********************************/

/* Store 16-bit coefficients as clamped 8-bit pixels: packuswb saturates
 * pairs of 8-coefficient rows to unsigned bytes and four 8-pixel rows are
 * written per asm statement (row offsets 0, line_size, 2*line_size,
 * 3*line_size).
 * NOTE(review): the function's local declarations, asm-statement
 * openers/closers, loop structure and braces are not visible in this
 * chunk of the file. */
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
    /* read the pixels */
        "movq %3, %%mm0 \n\t"
        "movq 8%3, %%mm1 \n\t"
        "movq 16%3, %%mm2 \n\t"
        "movq 24%3, %%mm3 \n\t"
        "movq 32%3, %%mm4 \n\t"
        "movq 40%3, %%mm5 \n\t"
        "movq 48%3, %%mm6 \n\t"
        "movq 56%3, %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    /* Second half: same pattern but with "(%3)" register addressing. */
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* Eight 0x80 bytes; added with paddb below to re-bias signed samples into
 * the unsigned 0..255 pixel range. */
DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
/* Pack one half (64 coefficients at offset "off") with signed saturation
 * (packsswb), add the 0x80 bias in %%mm0 and store four 8-pixel rows.
 * Operands: %0 = pixels, %1 = 3*line_skip, %2 = block, %3 = line_skip. */
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t"\
    "movq 16+"#off"(%2), %%mm2 \n\t"\
    "movq 32+"#off"(%2), %%mm3 \n\t"\
    "movq 48+"#off"(%2), %%mm4 \n\t"\
    "packsswb 8+"#off"(%2), %%mm1 \n\t"\
    "packsswb 24+"#off"(%2), %%mm2 \n\t"\
    "packsswb 40+"#off"(%2), %%mm3 \n\t"\
    "packsswb 56+"#off"(%2), %%mm4 \n\t"\
    "paddb %%mm0, %%mm1 \n\t"\
    "paddb %%mm0, %%mm2 \n\t"\
    "paddb %%mm0, %%mm3 \n\t"\
    "paddb %%mm0, %%mm4 \n\t"\
    "movq %%mm1, (%0) \n\t"\
    "movq %%mm2, (%0, %3) \n\t"\
    "movq %%mm3, (%0, %3, 2) \n\t"\
    "movq %%mm4, (%0, %1) \n\t"
/* Like ff_put_pixels_clamped_mmx but for signed coefficients: signed
 * saturation plus a +128 (ff_vector128) bias.  lea computes 3*line_skip
 * into %1, then the two macro halves each write four rows.
 * NOTE(review): the declaration of line_skip3, the asm opener and the
 * function's closing are not visible in this chunk of the file. */
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
    x86_reg line_skip = line_size;
        "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        :"+&r" (pixels), "=&r" (line_skip3)
        :"r" (block), "r"(line_skip)
/* Add 16-bit coefficients to existing pixels with unsigned clamping:
 * pixels are widened against a zero register (%%mm7) via punpck{l,h}bw,
 * added with signed saturation (paddsw), then repacked with packuswb.
 * NOTE(review): local declarations, the loop over the block rows and the
 * asm statement open/close are not visible in this chunk of the file. */
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
    /* read the pixels */
        "movq (%2), %%mm0 \n\t"
        "movq 8(%2), %%mm1 \n\t"
        "movq 16(%2), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "movq %0, %%mm4 \n\t"
        "movq %1, %%mm6 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm4, %%mm0 \n\t"
        "paddsw %%mm5, %%mm1 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm6, %%mm2 \n\t"
        "paddsw %%mm5, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, %0 \n\t"
        "movq %%mm2, %1 \n\t"
        :"+m"(*pix), "+m"(*(pix+line_size))
/* Copy a 4-pixel-wide block: movd moves 4 bytes per row, two rows per
 * load/store pair, four rows per unrolled step (REG_a = 2*line_size).
 * NOTE(review): the asm opener, loop label, height decrement/branch and
 * the function's closing are not visible in this chunk of the file. */
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
/* Copy an 8-pixel-wide block: movq moves 8 bytes per row, two rows per
 * load/store pair, four rows per unrolled step (REG_a = 2*line_size).
 * NOTE(review): the asm opener, loop label, height decrement/branch and
 * the function's closing are not visible in this chunk of the file. */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
/* Copy a 16-pixel-wide block using two movq pairs per row (offsets 0 and
 * 8), two rows per step, four rows per unrolled iteration.
 * NOTE(review): the asm opener, loop label, height decrement/branch and
 * the function's closing are not visible in this chunk of the file. */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size)
/* Copy 16 pixels per row, four rows per iteration.  Loads are unaligned
 * (movdqu); stores use movdqa, so dst must be 16-byte aligned.  %4 holds
 * 3*line_size for the fourth row.
 * NOTE(review): the asm opener, loop control and the function's closing
 * are not visible in this chunk of the file. */
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "lea (%1,%3,4), %1 \n\t"
        "lea (%2,%3,4), %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* As put_pixels16_sse2, but averages the loaded rows with dst via pavgb
 * (rounding byte average) before storing back.
 * NOTE(review): the asm opener, loop control and the function's closing
 * are not visible in this chunk of the file. */
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1,%3), %%xmm1 \n\t"
        "movdqu (%1,%3,2), %%xmm2 \n\t"
        "movdqu (%1,%4), %%xmm3 \n\t"
        "pavgb (%2), %%xmm0 \n\t"
        "pavgb (%2,%3), %%xmm1 \n\t"
        "pavgb (%2,%3,2), %%xmm2 \n\t"
        "pavgb (%2,%4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2,%3) \n\t"
        "movdqa %%xmm2, (%2,%3,2) \n\t"
        "movdqa %%xmm3, (%2,%4) \n\t"
        "lea (%1,%3,4), %1 \n\t"
        "lea (%2,%3,4), %2 \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* Generate a function zeroing n consecutive 8x8 DCTELEM blocks (128 bytes
 * each) with movq stores; REG_a counts 32 bytes per iteration.
 * NOTE(review): the asm opener, loop label/branch and the macro's trailing
 * operand lines are not visible in this chunk of the file. */
#define CLEAR_BLOCKS(name,n) \
static void name(DCTELEM *blocks)\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov %1, %%"REG_a" \n\t"\
        "movq %%mm7, (%0, %%"REG_a") \n\t"\
        "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
        "add $32, %%"REG_a" \n\t"\
        : : "r" (((uint8_t *)blocks)+128*n),\
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 8x8 block (128 bytes) with eight aligned 16-byte SSE stores.
 * NOTE(review): the asm statement open/close and the function's closing
 * brace are not visible in this chunk of the file. */
static void clear_block_sse(DCTELEM *block)
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 8x8 blocks (6*128 bytes), one 128-byte block per
 * loop iteration, with aligned SSE stores.
 * NOTE(review): the asm opener, loop label/branch and the function's
 * closing are not visible in this chunk of the file. */
static void clear_blocks_sse(DCTELEM *blocks)
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        : : "r" (((uint8_t *)blocks)+128*6),
/* dst[i] += src[i]: 16 bytes per MMX iteration (two movq/paddb pairs),
 * with a scalar tail for the remaining i (operand %3 is w-15, the vector
 * loop bound).
 * NOTE(review): the asm opener, loop control, the scalar tail loop header
 * and the function's closing are not visible in this chunk of the file. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        : "r"(src), "r"(dst), "r"((x86_reg)w-15)
        dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i]: 16 bytes per MMX iteration, scalar tail for
 * the remainder (%3 is w-15, the vector loop bound).
 * NOTE(review): the asm opener, loop control, the scalar tail loop header
 * and the function's closing are not visible in this chunk of the file. */
static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
        "movq (%2, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb (%3, %0), %%mm0 \n\t"
        "paddb 8(%3, %0), %%mm1 \n\t"
        "movq %%mm0, (%1, %0) \n\t"
        "movq %%mm1, 8(%1, %0) \n\t"
        : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
        dst[i] = src1[i] + src2[i];
#if HAVE_7REGS && HAVE_TEN_OPERANDS
/* HuffYUV median prediction using cmov; gated on having 7 spare GPRs and
 * 10 asm operands.  l and tl carry the running left / top-left bytes.
 * NOTE(review): most of this function's asm body (the median computation
 * between the visible load and store) and its closing are not visible in
 * this chunk of the file. */
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
        "movzbl (%3,%4), %2 \n"
        "add (%6,%4), %b0 \n"
        "mov %b0, (%5,%4) \n"
        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* Core of the H.263 deblocking filter over four 8-pixel columns.
 * Inputs: %0..%3 = the four rows (p0..p3), %4 = 2*strength, %5 = ff_pb_FC.
 * Computes the filter delta from (p0 - p3) and 4*(p2 - p1), clips it
 * against the strength, and leaves the filtered middle rows in
 * %%mm3/%%mm4 and the adjusted outer rows in %%mm5/%%mm6 for the caller
 * to store. */
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
/* H.263 vertical loop filter: runs H263_LOOP_FILTER on the four rows
 * src-2*stride .. src+stride and writes the filtered rows back.
 * NOTE(review): the asm opener and the H263_LOOP_FILTER invocation line
 * are not visible in this chunk of the file. */
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %0 \n\t"
        "movq %%mm6, %3 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
/* H.263 horizontal loop filter: transposes two 4x4 tiles into a temp
 * buffer, runs H263_LOOP_FILTER on the transposed rows, then the second
 * asm block re-transposes the filtered registers back into the image
 * (movd stores at stride offsets 0, 1, 2, 3 for each tile half).
 * NOTE(review): the asm statement openers/closers and some operand lines
 * are not visible in this chunk of the file. */
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp)[4];
    uint8_t *btemp= (uint8_t*)temp;

    transpose4x4(btemp , src , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);

    H263_LOOP_FILTER // 5 3 4 6

        : "g" (2*strength), "m"(ff_pb_FC)

        "movq %%mm5, %%mm1 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm4 \n\t"
        "punpckhbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm6, %%mm0 \n\t"
        "movq %%mm5, %%mm3 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "punpcklwd %%mm4, %%mm5 \n\t"
        "punpcklwd %%mm0, %%mm1 \n\t"
        "punpckhwd %%mm4, %%mm3 \n\t"
        "punpckhwd %%mm0, %%mm6 \n\t"
        "movd %%mm5, (%0) \n\t"
        "punpckhdq %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%0,%2) \n\t"
        "movd %%mm3, (%0,%2,2) \n\t"
        "punpckhdq %%mm3, %%mm3 \n\t"
        "movd %%mm3, (%0,%3) \n\t"
        "movd %%mm1, (%1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%1,%2) \n\t"
        "movd %%mm6, (%1,%2,2) \n\t"
        "punpckhdq %%mm6, %%mm6 \n\t"
        "movd %%mm6, (%1,%3) \n\t"
          "r" (src + 4*stride),
          "r" ((x86_reg) stride ),
          "r" ((x86_reg)(3*stride))
/* draw the edges of width 'w' of an image of size width, height
   this mmx version can only handle w==8 || w==16 */
/* Left/right edges: the first byte (resp. last byte) of each row is
 * splatted across w pixels with punpck{l,h} broadcasts; top/bottom edges
 * are then replicated by copying whole rows, four rows per asm block.
 * NOTE(review): the asm openers/closers, loop labels, the per-side loop
 * structure and the w==8/w==16 branch are not visible in this chunk. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
    uint8_t *ptr, *last_line;
    last_line = buf + (height - 1) * wrap;
    /* w == 8 case: broadcast first/last byte into one 8-byte edge. */
        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "punpckldq %%mm0, %%mm0 \n\t"
        "movq %%mm0, -8(%0) \n\t"
        "movq -8(%0, %2), %%mm1 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movq %%mm1, (%0, %2) \n\t"
        : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
    /* w == 16 case: same broadcast, written twice per side. */
        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "punpckldq %%mm0, %%mm0 \n\t"
        "movq %%mm0, -8(%0) \n\t"
        "movq %%mm0, -16(%0) \n\t"
        "movq -8(%0, %2), %%mm1 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movq %%mm1, (%0, %2) \n\t"
        "movq %%mm1, 8(%0, %2) \n\t"
        : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
    /* top and bottom (and hopefully also the corners) */
    ptr= buf - (i + 1) * wrap - w;
        "movq (%1, %0), %%mm0 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm0, (%0, %2) \n\t"
        "movq %%mm0, (%0, %2, 2) \n\t"
        "movq %%mm0, (%0, %3) \n\t"
        : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
    ptr= last_line + (i + 1) * wrap - w;
        "movq (%1, %0), %%mm0 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm0, (%0, %2) \n\t"
        "movq %%mm0, (%0, %2, 2) \n\t"
        "movq %%mm0, (%0, %3) \n\t"
        : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* Generate add_png_paeth_prediction_<cpu>: PNG Paeth predictor in MMX,
 * with the three absolute differences computed by the abs3 macro argument
 * (emulated for MMX2, pabsw for SSSE3).  Operands: %1 = dst, %2 = top,
 * %3 = src, per the visible operand list.
 * NOTE(review): the asm opener, the abs3 invocation point, loop control
 * and the function/macro closing are not visible in this chunk; the
 * #define lines of ABS3_MMX2 / ABS3_SSSE3 below are also missing. */
#define PAETH(cpu, abs3)\
static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
        "pxor %%mm7, %%mm7 \n"\
        "movd (%1,%0), %%mm0 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq %%mm1, %%mm2 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "movq %%mm2, %%mm3 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq %%mm2, %%mm4 \n"\
        "psubw %%mm1, %%mm3 \n"\
        "psubw %%mm0, %%mm4 \n"\
        "movq %%mm3, %%mm5 \n"\
        "paddw %%mm4, %%mm5 \n"\
        "movq %%mm4, %%mm6 \n"\
        "pminsw %%mm5, %%mm6 \n"\
        "pcmpgtw %%mm6, %%mm3 \n"\
        "pcmpgtw %%mm5, %%mm4 \n"\
        "movq %%mm4, %%mm6 \n"\
        "pand %%mm3, %%mm4 \n"\
        "pandn %%mm3, %%mm6 \n"\
        "pandn %%mm0, %%mm3 \n"\
        "movd (%3,%0), %%mm0 \n"\
        "pand %%mm1, %%mm6 \n"\
        "pand %%mm4, %%mm2 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "paddw %%mm6, %%mm0 \n"\
        "paddw %%mm2, %%mm3 \n"\
        "paddw %%mm3, %%mm0 \n"\
        "pand %%mm5, %%mm0 \n"\
        "movq %%mm0, %%mm3 \n"\
        "packuswb %%mm3, %%mm3 \n"\
        "movd %%mm3, (%1,%0) \n"\
        :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
        "psubw %%mm5, %%mm7 \n"\
        "pmaxsw %%mm7, %%mm5 \n"\
        "pxor %%mm6, %%mm6 \n"\
        "pxor %%mm7, %%mm7 \n"\
        "psubw %%mm3, %%mm6 \n"\
        "psubw %%mm4, %%mm7 \n"\
        "pmaxsw %%mm6, %%mm3 \n"\
        "pmaxsw %%mm7, %%mm4 \n"\
        "pxor %%mm7, %%mm7 \n"
        "pabsw %%mm3, %%mm3 \n"\
        "pabsw %%mm4, %%mm4 \n"\
        "pabsw %%mm5, %%mm5 \n"
/* Instantiate the MMX2 (emulated abs) and SSSE3 (pabsw) variants. */
PAETH(mmx2, ABS3_MMX2)
PAETH(ssse3, ABS3_SSSE3)
/* One output row of the MPEG-4 vertical qpel lowpass filter:
 * result = (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, packed to bytes and
 * stored through OP.  m3..m6 hold already-widened rows; in0/in1/in2/in7
 * are memory operands for the remaining taps. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
    "movq "#in7", " #m3 " \n\t" /* d */\
    "movq "#in0", %%mm5 \n\t" /* D */\
    "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
    "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5 \n\t" /* C */\
    "movq "#in2", %%mm6 \n\t" /* B */\
    "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
    "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
    "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)
970 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
971 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
975 "pxor %%mm7, %%mm7 \n\t"\
977 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
978 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
979 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
980 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
981 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
982 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
983 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
984 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
985 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
986 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
987 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
988 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
989 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
990 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
991 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
992 "paddw %%mm3, %%mm5 \n\t" /* b */\
993 "paddw %%mm2, %%mm6 \n\t" /* c */\
994 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
995 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
996 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
997 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
998 "paddw %%mm4, %%mm0 \n\t" /* a */\
999 "paddw %%mm1, %%mm5 \n\t" /* d */\
1000 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1001 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1002 "paddw %6, %%mm6 \n\t"\
1003 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1004 "psraw $5, %%mm0 \n\t"\
1005 "movq %%mm0, %5 \n\t"\
1006 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1008 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1009 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1010 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1011 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1012 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1013 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1014 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1015 "paddw %%mm0, %%mm2 \n\t" /* b */\
1016 "paddw %%mm5, %%mm3 \n\t" /* c */\
1017 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1018 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1019 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1020 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1021 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1022 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1023 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1024 "paddw %%mm2, %%mm1 \n\t" /* a */\
1025 "paddw %%mm6, %%mm4 \n\t" /* d */\
1026 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1027 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1028 "paddw %6, %%mm1 \n\t"\
1029 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1030 "psraw $5, %%mm3 \n\t"\
1031 "movq %5, %%mm1 \n\t"\
1032 "packuswb %%mm3, %%mm1 \n\t"\
1033 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1034 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1036 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1037 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1038 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1039 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1040 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1041 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1042 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1043 "paddw %%mm1, %%mm5 \n\t" /* b */\
1044 "paddw %%mm4, %%mm0 \n\t" /* c */\
1045 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1046 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1047 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1048 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1049 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1050 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1051 "paddw %%mm3, %%mm2 \n\t" /* d */\
1052 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1053 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1054 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1055 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1056 "paddw %%mm2, %%mm6 \n\t" /* a */\
1057 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1058 "paddw %6, %%mm0 \n\t"\
1059 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1060 "psraw $5, %%mm0 \n\t"\
1061 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1063 "paddw %%mm5, %%mm3 \n\t" /* a */\
1064 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1065 "paddw %%mm4, %%mm6 \n\t" /* b */\
1066 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1067 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1068 "paddw %%mm1, %%mm4 \n\t" /* c */\
1069 "paddw %%mm2, %%mm5 \n\t" /* d */\
1070 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
1071 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1072 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1073 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1074 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1075 "paddw %6, %%mm4 \n\t"\
1076 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1077 "psraw $5, %%mm4 \n\t"\
1078 "packuswb %%mm4, %%mm0 \n\t"\
1079 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1085 : "+a"(src), "+c"(dst), "+D"(h)\
1086 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1091 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1094 /* quick HACK, XXX FIXME MUST be optimized */\
1097 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1098 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1099 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1100 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1101 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1102 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1103 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1104 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1105 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1106 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1107 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1108 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1109 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1110 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1111 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1112 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1114 "movq (%0), %%mm0 \n\t"\
1115 "movq 8(%0), %%mm1 \n\t"\
1116 "paddw %2, %%mm0 \n\t"\
1117 "paddw %2, %%mm1 \n\t"\
1118 "psraw $5, %%mm0 \n\t"\
1119 "psraw $5, %%mm1 \n\t"\
1120 "packuswb %%mm1, %%mm0 \n\t"\
1121 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1122 "movq 16(%0), %%mm0 \n\t"\
1123 "movq 24(%0), %%mm1 \n\t"\
1124 "paddw %2, %%mm0 \n\t"\
1125 "paddw %2, %%mm1 \n\t"\
1126 "psraw $5, %%mm0 \n\t"\
1127 "psraw $5, %%mm1 \n\t"\
1128 "packuswb %%mm1, %%mm0 \n\t"\
1129 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1130 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1138 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1140 "pxor %%mm7, %%mm7 \n\t"\
1142 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1143 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1144 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1145 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1146 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1147 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1148 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1149 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1150 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1151 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1152 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1153 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1154 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1155 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1156 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1157 "paddw %%mm3, %%mm5 \n\t" /* b */\
1158 "paddw %%mm2, %%mm6 \n\t" /* c */\
1159 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1160 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1161 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1162 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1163 "paddw %%mm4, %%mm0 \n\t" /* a */\
1164 "paddw %%mm1, %%mm5 \n\t" /* d */\
1165 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1166 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1167 "paddw %5, %%mm6 \n\t"\
1168 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1169 "psraw $5, %%mm0 \n\t"\
1170 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1172 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1173 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1174 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1175 "paddw %%mm5, %%mm1 \n\t" /* a */\
1176 "paddw %%mm6, %%mm2 \n\t" /* b */\
1177 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1178 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1179 "paddw %%mm6, %%mm3 \n\t" /* c */\
1180 "paddw %%mm5, %%mm4 \n\t" /* d */\
1181 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1182 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1183 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1184 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1185 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1186 "paddw %5, %%mm1 \n\t"\
1187 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1188 "psraw $5, %%mm3 \n\t"\
1189 "packuswb %%mm3, %%mm0 \n\t"\
1190 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1196 : "+a"(src), "+c"(dst), "+d"(h)\
1197 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1202 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1205 /* quick HACK, XXX FIXME MUST be optimized */\
1208 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1209 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1210 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1211 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1212 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1213 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1214 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1215 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1217 "movq (%0), %%mm0 \n\t"\
1218 "movq 8(%0), %%mm1 \n\t"\
1219 "paddw %2, %%mm0 \n\t"\
1220 "paddw %2, %%mm1 \n\t"\
1221 "psraw $5, %%mm0 \n\t"\
1222 "psraw $5, %%mm1 \n\t"\
1223 "packuswb %%mm1, %%mm0 \n\t"\
1224 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1225 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1233 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1235 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1236 uint64_t temp[17*4];\
1237 uint64_t *temp_ptr= temp;\
1242 "pxor %%mm7, %%mm7 \n\t"\
1244 "movq (%0), %%mm0 \n\t"\
1245 "movq (%0), %%mm1 \n\t"\
1246 "movq 8(%0), %%mm2 \n\t"\
1247 "movq 8(%0), %%mm3 \n\t"\
1248 "punpcklbw %%mm7, %%mm0 \n\t"\
1249 "punpckhbw %%mm7, %%mm1 \n\t"\
1250 "punpcklbw %%mm7, %%mm2 \n\t"\
1251 "punpckhbw %%mm7, %%mm3 \n\t"\
1252 "movq %%mm0, (%1) \n\t"\
1253 "movq %%mm1, 17*8(%1) \n\t"\
1254 "movq %%mm2, 2*17*8(%1) \n\t"\
1255 "movq %%mm3, 3*17*8(%1) \n\t"\
1260 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1261 : "r" ((x86_reg)srcStride)\
1268 /*FIXME reorder for speed */\
1270 /*"pxor %%mm7, %%mm7 \n\t"*/\
1272 "movq (%0), %%mm0 \n\t"\
1273 "movq 8(%0), %%mm1 \n\t"\
1274 "movq 16(%0), %%mm2 \n\t"\
1275 "movq 24(%0), %%mm3 \n\t"\
1276 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1277 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1279 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1281 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1283 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1284 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1286 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1287 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1289 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1290 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1292 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1293 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1295 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1297 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1299 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1300 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1302 "add $136, %0 \n\t"\
1307 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1308 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
1313 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1314 uint64_t temp[9*2];\
1315 uint64_t *temp_ptr= temp;\
1320 "pxor %%mm7, %%mm7 \n\t"\
1322 "movq (%0), %%mm0 \n\t"\
1323 "movq (%0), %%mm1 \n\t"\
1324 "punpcklbw %%mm7, %%mm0 \n\t"\
1325 "punpckhbw %%mm7, %%mm1 \n\t"\
1326 "movq %%mm0, (%1) \n\t"\
1327 "movq %%mm1, 9*8(%1) \n\t"\
1332 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1333 : "r" ((x86_reg)srcStride)\
1340 /*FIXME reorder for speed */\
1342 /*"pxor %%mm7, %%mm7 \n\t"*/\
1344 "movq (%0), %%mm0 \n\t"\
1345 "movq 8(%0), %%mm1 \n\t"\
1346 "movq 16(%0), %%mm2 \n\t"\
1347 "movq 24(%0), %%mm3 \n\t"\
1348 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1349 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1351 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1353 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1355 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1357 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1359 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1360 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1367 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1368 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
1373 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1374 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1377 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1379 uint8_t * const half= (uint8_t*)temp;\
1380 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1381 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1384 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1385 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1388 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1390 uint8_t * const half= (uint8_t*)temp;\
1391 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1392 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1395 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1397 uint8_t * const half= (uint8_t*)temp;\
1398 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1399 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1402 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1403 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1406 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1408 uint8_t * const half= (uint8_t*)temp;\
1409 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1410 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
1412 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1413 uint64_t half[8 + 9];\
1414 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1415 uint8_t * const halfHV= ((uint8_t*)half);\
1416 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1417 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1418 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1419 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1421 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1422 uint64_t half[8 + 9];\
1423 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1424 uint8_t * const halfHV= ((uint8_t*)half);\
1425 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1426 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1427 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1428 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1430 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1431 uint64_t half[8 + 9];\
1432 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1433 uint8_t * const halfHV= ((uint8_t*)half);\
1434 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1435 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1436 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1437 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1439 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1440 uint64_t half[8 + 9];\
1441 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1442 uint8_t * const halfHV= ((uint8_t*)half);\
1443 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1444 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1445 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1446 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1448 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1449 uint64_t half[8 + 9];\
1450 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1451 uint8_t * const halfHV= ((uint8_t*)half);\
1452 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1453 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1454 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1456 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1457 uint64_t half[8 + 9];\
1458 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1459 uint8_t * const halfHV= ((uint8_t*)half);\
1460 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1461 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1462 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1464 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1465 uint64_t half[8 + 9];\
1466 uint8_t * const halfH= ((uint8_t*)half);\
1467 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1468 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1469 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1471 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1472 uint64_t half[8 + 9];\
1473 uint8_t * const halfH= ((uint8_t*)half);\
1474 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1475 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1476 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1478 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1480 uint8_t * const halfH= ((uint8_t*)half);\
1481 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1482 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1484 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1485 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1488 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1490 uint8_t * const half= (uint8_t*)temp;\
1491 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1492 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1495 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1496 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1499 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1501 uint8_t * const half= (uint8_t*)temp;\
1502 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1503 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1506 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1508 uint8_t * const half= (uint8_t*)temp;\
1509 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1510 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1513 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1514 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1517 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1519 uint8_t * const half= (uint8_t*)temp;\
1520 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1521 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1523 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1524 uint64_t half[16*2 + 17*2];\
1525 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1526 uint8_t * const halfHV= ((uint8_t*)half);\
1527 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1528 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1529 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1530 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1532 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1533 uint64_t half[16*2 + 17*2];\
1534 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1535 uint8_t * const halfHV= ((uint8_t*)half);\
1536 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1537 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1538 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1539 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1541 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1542 uint64_t half[16*2 + 17*2];\
1543 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1544 uint8_t * const halfHV= ((uint8_t*)half);\
1545 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1546 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1547 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1548 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1550 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1551 uint64_t half[16*2 + 17*2];\
1552 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1553 uint8_t * const halfHV= ((uint8_t*)half);\
1554 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1555 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1556 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1557 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1559 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1560 uint64_t half[16*2 + 17*2];\
1561 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1562 uint8_t * const halfHV= ((uint8_t*)half);\
1563 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1564 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1565 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1567 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1568 uint64_t half[16*2 + 17*2];\
1569 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1570 uint8_t * const halfHV= ((uint8_t*)half);\
1571 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1572 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1573 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1575 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1576 uint64_t half[17*2];\
1577 uint8_t * const halfH= ((uint8_t*)half);\
1578 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1579 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1580 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1582 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1583 uint64_t half[17*2];\
1584 uint8_t * const halfH= ((uint8_t*)half);\
1585 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1586 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1587 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1589 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1590 uint64_t half[17*2];\
1591 uint8_t * const halfH= ((uint8_t*)half);\
1592 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1593 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* PUT_OP(a, b, temp, size): emit a plain store — move register operand `a`
 * into memory operand `b` with a mov of the requested size (e.g. "q" for
 * movq). `temp` is unused; present only so PUT_OP and the AVG_*_OP macros
 * share one signature. */
1596 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG_3DNOW_OP(a, b, temp, size): emit an averaging store for 3DNow! CPUs —
 * load the current destination `b` into scratch register `temp`, byte-wise
 * average it into `a` with pavgusb, then write the result back to `b`.
 * Used as the OP argument of the qpel macros to build the avg_ variants. */
1597 #define AVG_3DNOW_OP(a,b,temp, size) \
1598 "mov" #size " " #b ", " #temp " \n\t"\
1599 "pavgusb " #temp ", " #a " \n\t"\
1600 "mov" #size " " #a ", " #b " \n\t"
/* AVG_MMX2_OP(a, b, temp, size): emit an averaging store for MMX2-capable
 * CPUs — load the current destination `b` into scratch register `temp`,
 * byte-wise average it into `a` with pavgb, then write the result back to
 * `b`. Same contract as AVG_3DNOW_OP, differing only in the average insn. */
1601 #define AVG_MMX2_OP(a,b,temp, size) \
1602 "mov" #size " " #b ", " #temp " \n\t"\
1603 "pavgb " #temp ", " #a " \n\t"\
1604 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the MPEG-4 quarter-pel motion-compensation functions.
 * QPEL_OP (defined above) expands to the per-ISA qpel8/qpel16 mc** wrappers
 * and v_lowpass kernels; QPEL_BASE is defined earlier in this file —
 * presumably it emits the ISA-shared h_lowpass kernels (confirm against the
 * full file). Three flavours are generated for each of 3dnow and mmx2:
 *   put_        — rounded store,   rounder ff_pw_16
 *   avg_        — rounded average, rounder ff_pw_16, per-ISA pavg op
 *   put_no_rnd_ — no-round store,  rounder ff_pw_15 */
1606 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1607 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1608 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1609 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1610 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1611 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1612 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1613 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1614 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1616 /***********************************/
1617 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
1619 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1620 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1621 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1623 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1624 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1625 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
1628 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
1629 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1630 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1631 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1632 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1633 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1634 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1635 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1636 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1637 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1638 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1639 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1641 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1642 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1644 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1645 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1646 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1647 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1648 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1649 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1650 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1651 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the fast, non-spec-compliant bilinear ("2-tap") qpel
 * approximations (see comment above: only used with -lavdopts fast) for
 * put/avg at block sizes 16 and 8, on mmx2 and 3dnow. Each QPEL_2TAP
 * expansion maps the 16 mc positions onto half-pel and 3-point-average
 * helpers instead of the full 6-tap filter. */
1653 QPEL_2TAP(put_, 16, mmx2)
1654 QPEL_2TAP(avg_, 16, mmx2)
1655 QPEL_2TAP(put_, 8, mmx2)
1656 QPEL_2TAP(avg_, 8, mmx2)
1657 QPEL_2TAP(put_, 16, 3dnow)
1658 QPEL_2TAP(avg_, 16, 3dnow)
1659 QPEL_2TAP(put_, 8, 3dnow)
1660 QPEL_2TAP(avg_, 8, 3dnow)
/* Deliberate no-op. NOTE(review): its use is not visible in this chunk —
 * presumably installed as a placeholder function pointer; confirm against
 * the rest of the file before removing. */
1664 static void just_return(void) { return; }
1668 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1669 x86_reg linesize, x86_reg start_y,
1670 x86_reg end_y, x86_reg block_h,
1671 x86_reg start_x, x86_reg end_x,
1673 extern emu_edge_core_func ff_emu_edge_core_mmx;
1674 extern emu_edge_core_func ff_emu_edge_core_sse;
1676 static av_always_inline
1677 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1678 int block_w, int block_h,
1679 int src_x, int src_y, int w, int h,
1680 emu_edge_core_func *core_fn)
1682 int start_y, start_x, end_y, end_x, src_y_add=0;
1685 src_y_add = h-1-src_y;
1687 }else if(src_y<=-block_h){
1688 src_y_add = 1-block_h-src_y;
1694 }else if(src_x<=-block_w){
1695 src+= (1-block_w-src_x);
1699 start_y= FFMAX(0, -src_y);
1700 start_x= FFMAX(0, -src_x);
1701 end_y= FFMIN(block_h, h-src_y);
1702 end_x= FFMIN(block_w, w-src_x);
1703 assert(start_x < end_x && block_w > 0);
1704 assert(start_y < end_y && block_h > 0);
1706 // fill in the to-be-copied part plus all above/below
1707 src += (src_y_add+start_y)*linesize + start_x;
1709 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
1714 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1715 int block_w, int block_h,
1716 int src_x, int src_y, int w, int h)
1718 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1719 w, h, &ff_emu_edge_core_mmx);
1723 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1724 int block_w, int block_h,
1725 int src_x, int src_y, int w, int h)
1727 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1728 w, h, &ff_emu_edge_core_sse);
1730 #endif /* HAVE_YASM */
1732 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1733 int linesize, int block_w, int block_h,
1734 int src_x, int src_y, int w, int h);
1736 static av_always_inline
1737 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1738 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1739 emulated_edge_mc_func *emu_edge_fn)
1742 const int ix = ox>>(16+shift);
1743 const int iy = oy>>(16+shift);
1744 const int oxs = ox>>4;
1745 const int oys = oy>>4;
1746 const int dxxs = dxx>>4;
1747 const int dxys = dxy>>4;
1748 const int dyxs = dyx>>4;
1749 const int dyys = dyy>>4;
1750 const uint16_t r4[4] = {r,r,r,r};
1751 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1752 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1753 const uint64_t shift2 = 2*shift;
1754 uint8_t edge_buf[(h+1)*stride];
1757 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1758 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1759 const int dxh = dxy*(h-1);
1760 const int dyw = dyx*(w-1);
1761 if( // non-constant fullpel offset (3% of blocks)
1762 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1763 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1764 // uses more than 16 bits of subpel mv (only at huge resolution)
1765 || (dxx|dxy|dyx|dyy)&15 )
1767 //FIXME could still use mmx for some of the rows
1768 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1772 src += ix + iy*stride;
1773 if( (unsigned)ix >= width-w ||
1774 (unsigned)iy >= height-h )
1776 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
1781 "movd %0, %%mm6 \n\t"
1782 "pxor %%mm7, %%mm7 \n\t"
1783 "punpcklwd %%mm6, %%mm6 \n\t"
1784 "punpcklwd %%mm6, %%mm6 \n\t"
1788 for(x=0; x<w; x+=4){
1789 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1790 oxs - dxys + dxxs*(x+1),
1791 oxs - dxys + dxxs*(x+2),
1792 oxs - dxys + dxxs*(x+3) };
1793 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1794 oys - dyys + dyxs*(x+1),
1795 oys - dyys + dyxs*(x+2),
1796 oys - dyys + dyxs*(x+3) };
1800 "movq %0, %%mm4 \n\t"
1801 "movq %1, %%mm5 \n\t"
1802 "paddw %2, %%mm4 \n\t"
1803 "paddw %3, %%mm5 \n\t"
1804 "movq %%mm4, %0 \n\t"
1805 "movq %%mm5, %1 \n\t"
1806 "psrlw $12, %%mm4 \n\t"
1807 "psrlw $12, %%mm5 \n\t"
1808 : "+m"(*dx4), "+m"(*dy4)
1809 : "m"(*dxy4), "m"(*dyy4)
1813 "movq %%mm6, %%mm2 \n\t"
1814 "movq %%mm6, %%mm1 \n\t"
1815 "psubw %%mm4, %%mm2 \n\t"
1816 "psubw %%mm5, %%mm1 \n\t"
1817 "movq %%mm2, %%mm0 \n\t"
1818 "movq %%mm4, %%mm3 \n\t"
1819 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1820 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1821 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1822 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1824 "movd %4, %%mm5 \n\t"
1825 "movd %3, %%mm4 \n\t"
1826 "punpcklbw %%mm7, %%mm5 \n\t"
1827 "punpcklbw %%mm7, %%mm4 \n\t"
1828 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1829 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1831 "movd %2, %%mm5 \n\t"
1832 "movd %1, %%mm4 \n\t"
1833 "punpcklbw %%mm7, %%mm5 \n\t"
1834 "punpcklbw %%mm7, %%mm4 \n\t"
1835 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1836 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1837 "paddw %5, %%mm1 \n\t"
1838 "paddw %%mm3, %%mm2 \n\t"
1839 "paddw %%mm1, %%mm0 \n\t"
1840 "paddw %%mm2, %%mm0 \n\t"
1842 "psrlw %6, %%mm0 \n\t"
1843 "packuswb %%mm0, %%mm0 \n\t"
1844 "movd %%mm0, %0 \n\t"
1846 : "=m"(dst[x+y*stride])
1847 : "m"(src[0]), "m"(src[1]),
1848 "m"(src[stride]), "m"(src[stride+1]),
1849 "m"(*r4), "m"(shift2)
1859 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1860 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1862 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1863 width, height, &emulated_edge_mc_mmx);
1866 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1867 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1869 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1870 width, height, &emulated_edge_mc_sse);
1873 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1874 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1876 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1877 width, height, &ff_emulated_edge_mc);
1881 #define PREFETCH(name, op) \
1882 static void name(void *mem, int stride, int h){\
1883 const uint8_t *p= mem;\
1885 __asm__ volatile(#op" %0" :: "m"(*p));\
1889 PREFETCH(prefetch_mmx2, prefetcht0)
1890 PREFETCH(prefetch_3dnow, prefetch)
1893 #include "h264_qpel_mmx.c"
// Prototypes for chroma motion-compensation primitives implemented in
// external assembly.  Naming: {put,avg}_{h264,vc1,rv40}_chroma_mc{8,4,2}
// per CPU flavour; "_rnd"/"_nornd" selects rounding behaviour.
1895 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1896 int stride, int h, int x, int y);
1897 void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src,
1898 int stride, int h, int x, int y);
1899 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1900 int stride, int h, int x, int y);
1901 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1902 int stride, int h, int x, int y);
1903 void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src,
1904 int stride, int h, int x, int y);
1905 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1906 int stride, int h, int x, int y);
1907 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1908 int stride, int h, int x, int y);
1909 void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src,
1910 int stride, int h, int x, int y);
1911 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1912 int stride, int h, int x, int y);
// 4x-wide variants.
1914 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1915 int stride, int h, int x, int y);
1916 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1917 int stride, int h, int x, int y);
1918 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1919 int stride, int h, int x, int y);
1920 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1921 int stride, int h, int x, int y);
1922 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1923 int stride, int h, int x, int y);
1924 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1925 int stride, int h, int x, int y);
// 2x-wide variants (MMX2 only).
1927 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1928 int stride, int h, int x, int y);
1929 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1930 int stride, int h, int x, int y);
// SSSE3 variants.
1932 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1933 int stride, int h, int x, int y);
1934 void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
1935 int stride, int h, int x, int y);
1936 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1937 int stride, int h, int x, int y);
1939 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1940 int stride, int h, int x, int y);
1941 void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
1942 int stride, int h, int x, int y);
1943 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1944 int stride, int h, int x, int y);
// CAVS qpel mc00 (zero sub-pel offset) wrappers: a full-pel copy/average,
// so they forward straight to the plain MMX pixel routines.
1948 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1949 put_pixels8_mmx(dst, src, stride, 8);
1951 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1952 avg_pixels8_mmx(dst, src, stride, 8);
1954 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1955 put_pixels16_mmx(dst, src, stride, 16);
1957 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1958 avg_pixels16_mmx(dst, src, stride, 16);
// VC-1 mspel mc00 wrappers: zero sub-pel offset means a plain copy/average;
// the rounding parameter 'rnd' is irrelevant here and intentionally unused.
1962 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1963 put_pixels8_mmx(dst, src, stride, 8);
1965 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1966 avg_pixels8_mmx2(dst, src, stride, 8);
// Glue wrappers that pair an IDCT kernel with the MMX clamped put/add
// routines, giving the idct_put/idct_add signatures DSPContext expects.
1969 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1972 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1974 ff_mmx_idct (block);
1975 ff_put_pixels_clamped_mmx(block, dest, line_size);
1977 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1979 ff_mmx_idct (block);
1980 ff_add_pixels_clamped_mmx(block, dest, line_size);
1982 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1984 ff_mmxext_idct (block);
1985 ff_put_pixels_clamped_mmx(block, dest, line_size);
1987 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1989 ff_mmxext_idct (block);
1990 ff_add_pixels_clamped_mmx(block, dest, line_size);
// Same pattern for the Xvid IDCTs (MMX and MMX2 flavours).
1993 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1995 ff_idct_xvid_mmx (block);
1996 ff_put_pixels_clamped_mmx(block, dest, line_size);
1998 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2000 ff_idct_xvid_mmx (block);
2001 ff_add_pixels_clamped_mmx(block, dest, line_size);
2003 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2005 ff_idct_xvid_mmx2 (block);
2006 ff_put_pixels_clamped_mmx(block, dest, line_size);
2008 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2010 ff_idct_xvid_mmx2 (block);
2011 ff_add_pixels_clamped_mmx(block, dest, line_size);
// 3DNow! Vorbis inverse channel coupling: reconstructs two channels from
// the (magnitude, angle) pair in-place, two floats per loop iteration.
// mm7 is pre-zeroed once and used as the 0.0 comparison operand.
2014 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2017 __asm__ volatile("pxor %%mm7, %%mm7":);
2018 for(i=0; i<blocksize; i+=2) {
2020 "movq %0, %%mm0 \n\t"
2021 "movq %1, %%mm1 \n\t"
2022 "movq %%mm0, %%mm2 \n\t"
2023 "movq %%mm1, %%mm3 \n\t"
2024 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2025 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2026 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2027 "pxor %%mm2, %%mm1 \n\t"
2028 "movq %%mm3, %%mm4 \n\t"
2029 "pand %%mm1, %%mm3 \n\t"
2030 "pandn %%mm1, %%mm4 \n\t"
2031 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2032 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2033 "movq %%mm3, %1 \n\t"
2034 "movq %%mm0, %0 \n\t"
2035 :"+m"(mag[i]), "+m"(ang[i])
// Leave MMX/3DNow! state so following FPU code works.
2039 __asm__ volatile("femms");
// SSE version of the Vorbis inverse coupling above: four floats per
// iteration; xmm5 holds the sign-bit mask (ff_pdw_80000000) for the whole
// loop instead of deriving it with a shift as the 3DNow! version does.
2041 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2046 "movaps %0, %%xmm5 \n\t"
2047 ::"m"(ff_pdw_80000000[0])
2049 for(i=0; i<blocksize; i+=4) {
2051 "movaps %0, %%xmm0 \n\t"
2052 "movaps %1, %%xmm1 \n\t"
2053 "xorps %%xmm2, %%xmm2 \n\t"
2054 "xorps %%xmm3, %%xmm3 \n\t"
2055 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2056 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2057 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2058 "xorps %%xmm2, %%xmm1 \n\t"
2059 "movaps %%xmm3, %%xmm4 \n\t"
2060 "andps %%xmm1, %%xmm3 \n\t"
2061 "andnps %%xmm1, %%xmm4 \n\t"
2062 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2063 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2064 "movaps %%xmm3, %1 \n\t"
2065 "movaps %%xmm0, %0 \n\t"
2066 :"+m"(mag[i]), "+m"(ang[i])
// MIX5(mono,stereo): SSE downmix of 5 channels (planar, 0x400 bytes apart)
// to 1 or 2 channels.  Three distinct matrix coefficients are loaded from
// (%2) and splatted into xmm5-7; the 'mono'/'stereo' macro arguments splice
// in the statements specific to each output layout.  NOTE(review): do not
// insert lines inside the macro body — every line is '\'-continued.
2075 #define MIX5(mono,stereo)\
2077 "movss 0(%2), %%xmm5 \n"\
2078 "movss 8(%2), %%xmm6 \n"\
2079 "movss 24(%2), %%xmm7 \n"\
2080 "shufps $0, %%xmm5, %%xmm5 \n"\
2081 "shufps $0, %%xmm6, %%xmm6 \n"\
2082 "shufps $0, %%xmm7, %%xmm7 \n"\
2084 "movaps (%0,%1), %%xmm0 \n"\
2085 "movaps 0x400(%0,%1), %%xmm1 \n"\
2086 "movaps 0x800(%0,%1), %%xmm2 \n"\
2087 "movaps 0xc00(%0,%1), %%xmm3 \n"\
2088 "movaps 0x1000(%0,%1), %%xmm4 \n"\
2089 "mulps %%xmm5, %%xmm0 \n"\
2090 "mulps %%xmm6, %%xmm1 \n"\
2091 "mulps %%xmm5, %%xmm2 \n"\
2092 "mulps %%xmm7, %%xmm3 \n"\
2093 "mulps %%xmm7, %%xmm4 \n"\
2094 stereo("addps %%xmm1, %%xmm0 \n")\
2095 "addps %%xmm1, %%xmm2 \n"\
2096 "addps %%xmm3, %%xmm0 \n"\
2097 "addps %%xmm4, %%xmm2 \n"\
2098 mono("addps %%xmm2, %%xmm0 \n")\
2099 "movaps %%xmm0, (%0,%1) \n"\
2100 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2104 :"r"(samples[0]+len), "r"(matrix)\
2105 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2106 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
// MIX_MISC(stereo): generic SSE downmix for arbitrary channel counts.
// Outer code iterates over sample groups (index %0); the inner loop walks
// the input channels (1024 bytes apart) accumulating matrix_simd-weighted
// sums into xmm0 (and xmm1 for stereo).  Lines are '\'-continued — body
// must stay contiguous.
2110 #define MIX_MISC(stereo)\
2113 "movaps (%3,%0), %%xmm0 \n"\
2114 stereo("movaps %%xmm0, %%xmm1 \n")\
2115 "mulps %%xmm4, %%xmm0 \n"\
2116 stereo("mulps %%xmm5, %%xmm1 \n")\
2117 "lea 1024(%3,%0), %1 \n"\
2120 "movaps (%1), %%xmm2 \n"\
2121 stereo("movaps %%xmm2, %%xmm3 \n")\
2122 "mulps (%4,%2), %%xmm2 \n"\
2123 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2124 "addps %%xmm2, %%xmm0 \n"\
2125 stereo("addps %%xmm3, %%xmm1 \n")\
2129 "movaps %%xmm0, (%3,%0) \n"\
2130 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2133 :"+&r"(i), "=&r"(j), "=&r"(k)\
2134 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
// AC-3 downmix dispatcher.  The float matrix is reinterpreted as ints
// (matrix_cmp) so exact-zero and exact-equal coefficients can be detected
// with integer or/xor — this selects the fast MIX5 paths for the common
// 5->2 and 5->1 layouts; otherwise coefficients are splatted into
// matrix_simd and the generic MIX_MISC path is used.
2138 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2140 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2143 i = -len*sizeof(float);
2144 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2146 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2149 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2150 j = 2*in_ch*sizeof(float);
// Broadcast each pair of matrix coefficients to 4-wide vectors.
2154 "movss (%2,%0), %%xmm4 \n"
2155 "movss 4(%2,%0), %%xmm5 \n"
2156 "shufps $0, %%xmm4, %%xmm4 \n"
2157 "shufps $0, %%xmm5, %%xmm5 \n"
2158 "movaps %%xmm4, (%1,%0,4) \n"
2159 "movaps %%xmm5, 16(%1,%0,4) \n"
2162 :"r"(matrix_simd), "r"(matrix)
// dst[i] = src0[i] * src1[i] using 3DNow! pfmul, 4 floats (two mm regs)
// per iteration, counting i down from the end of the buffers.
2173 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2174 x86_reg i = (len-4)*4;
2177 "movq (%2,%0), %%mm0 \n\t"
2178 "movq 8(%2,%0), %%mm1 \n\t"
2179 "pfmul (%3,%0), %%mm0 \n\t"
2180 "pfmul 8(%3,%0), %%mm1 \n\t"
2181 "movq %%mm0, (%1,%0) \n\t"
2182 "movq %%mm1, 8(%1,%0) \n\t"
2187 :"r"(dst), "r"(src0), "r"(src1)
// dst[i] = src0[i] * src1[i] using SSE mulps, 8 floats (two xmm regs)
// per iteration; buffers are assumed 16-byte aligned (movaps).
2191 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2192 x86_reg i = (len-8)*4;
2195 "movaps (%2,%0), %%xmm0 \n\t"
2196 "movaps 16(%2,%0), %%xmm1 \n\t"
2197 "mulps (%3,%0), %%xmm0 \n\t"
2198 "mulps 16(%3,%0), %%xmm1 \n\t"
2199 "movaps %%xmm0, (%1,%0) \n\t"
2200 "movaps %%xmm1, 16(%1,%0) \n\t"
2204 :"r"(dst), "r"(src0), "r"(src1)
// dst[i] = src0[i] * src1[len-1-i]: src1 is read forward while pswapd
// reverses each pair in-register; dst/src0 are indexed from the end.
2209 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2210 x86_reg i = len*4-16;
2213 "pswapd 8(%1), %%mm0 \n\t"
2214 "pswapd (%1), %%mm1 \n\t"
2215 "pfmul (%3,%0), %%mm0 \n\t"
2216 "pfmul 8(%3,%0), %%mm1 \n\t"
2217 "movq %%mm0, (%2,%0) \n\t"
2218 "movq %%mm1, 8(%2,%0) \n\t"
2222 :"+r"(i), "+r"(src1)
2223 :"r"(dst), "r"(src0)
// Exit MMX state before returning to FPU code.
2225 __asm__ volatile("femms");
// SSE version of vector_fmul_reverse: shufps $0x1b reverses the four
// floats of each xmm register loaded from src1.
2227 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2228 x86_reg i = len*4-32;
2231 "movaps 16(%1), %%xmm0 \n\t"
2232 "movaps (%1), %%xmm1 \n\t"
2233 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2234 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2235 "mulps (%3,%0), %%xmm0 \n\t"
2236 "mulps 16(%3,%0), %%xmm1 \n\t"
2237 "movaps %%xmm0, (%2,%0) \n\t"
2238 "movaps %%xmm1, 16(%2,%0) \n\t"
2242 :"+r"(i), "+r"(src1)
2243 :"r"(dst), "r"(src0)
// dst[i] = src0[i] * src1[i] + src2[i] (fused multiply-add pattern),
// 3DNow! flavour, 4 floats per iteration.
2247 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2248 const float *src2, int len){
2249 x86_reg i = (len-4)*4;
2252 "movq (%2,%0), %%mm0 \n\t"
2253 "movq 8(%2,%0), %%mm1 \n\t"
2254 "pfmul (%3,%0), %%mm0 \n\t"
2255 "pfmul 8(%3,%0), %%mm1 \n\t"
2256 "pfadd (%4,%0), %%mm0 \n\t"
2257 "pfadd 8(%4,%0), %%mm1 \n\t"
2258 "movq %%mm0, (%1,%0) \n\t"
2259 "movq %%mm1, 8(%1,%0) \n\t"
2263 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2266 __asm__ volatile("femms");
// dst[i] = src0[i] * src1[i] + src2[i], SSE flavour, 8 floats per
// iteration on 16-byte-aligned buffers.
2268 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2269 const float *src2, int len){
2270 x86_reg i = (len-8)*4;
2273 "movaps (%2,%0), %%xmm0 \n\t"
2274 "movaps 16(%2,%0), %%xmm1 \n\t"
2275 "mulps (%3,%0), %%xmm0 \n\t"
2276 "mulps 16(%3,%0), %%xmm1 \n\t"
2277 "addps (%4,%0), %%xmm0 \n\t"
2278 "addps 16(%4,%0), %%xmm1 \n\t"
2279 "movaps %%xmm0, (%1,%0) \n\t"
2280 "movaps %%xmm1, 16(%1,%0) \n\t"
2284 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
// MDCT-style windowing overlap: walks dst/src0/win from the middle
// outward with two indices (i negative, j positive), multiplying by the
// window forward and reversed (pswapd) simultaneously — see the per-line
// product comments below for the exact combination.
2290 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2291 const float *win, int len){
2293 x86_reg j = len*4-8;
2296 "pswapd (%5,%1), %%mm1 \n"
2297 "movq (%5,%0), %%mm0 \n"
2298 "pswapd (%4,%1), %%mm5 \n"
2299 "movq (%3,%0), %%mm4 \n"
2300 "movq %%mm0, %%mm2 \n"
2301 "movq %%mm1, %%mm3 \n"
2302 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2303 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2304 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2305 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2306 "pfadd %%mm3, %%mm2 \n"
2307 "pfsub %%mm0, %%mm1 \n"
2308 "pswapd %%mm2, %%mm2 \n"
2309 "movq %%mm1, (%2,%0) \n"
2310 "movq %%mm2, (%2,%1) \n"
2316 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
// SSE version of the windowing overlap above: same two-index scheme,
// four floats at a time, with shufps $0x1b providing the in-register
// reversal that pswapd provides in the 3DNow! version.
2320 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2321 const float *win, int len){
2323 x86_reg j = len*4-16;
2326 "movaps (%5,%1), %%xmm1 \n"
2327 "movaps (%5,%0), %%xmm0 \n"
2328 "movaps (%4,%1), %%xmm5 \n"
2329 "movaps (%3,%0), %%xmm4 \n"
2330 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2331 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2332 "movaps %%xmm0, %%xmm2 \n"
2333 "movaps %%xmm1, %%xmm3 \n"
2334 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2335 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2336 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2337 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2338 "addps %%xmm3, %%xmm2 \n"
2339 "subps %%xmm0, %%xmm1 \n"
2340 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2341 "movaps %%xmm1, (%2,%0) \n"
2342 "movaps %%xmm2, (%2,%1) \n"
2347 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2350 #endif /* HAVE_6REGS */
// dst[i] = src[i] * mul, converting int32 -> float.  SSE1 has no packed
// int->float from memory, so cvtpi2ps converts two ints at a time via MMX
// and movlhps merges pairs into full xmm registers.
2352 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
2356 "movss %3, %%xmm4 \n"
2357 "shufps $0, %%xmm4, %%xmm4 \n"
2359 "cvtpi2ps (%2,%0), %%xmm0 \n"
2360 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
2361 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
2362 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
2363 "movlhps %%xmm1, %%xmm0 \n"
2364 "movlhps %%xmm3, %%xmm2 \n"
2365 "mulps %%xmm4, %%xmm0 \n"
2366 "mulps %%xmm4, %%xmm2 \n"
2367 "movaps %%xmm0, (%1,%0) \n"
2368 "movaps %%xmm2, 16(%1,%0) \n"
2372 :"r"(dst+len), "r"(src+len), "m"(mul)
// SSE2 version: cvtdq2ps converts four ints per instruction, halving the
// instruction count versus the SSE1 path.
2376 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
2380 "movss %3, %%xmm4 \n"
2381 "shufps $0, %%xmm4, %%xmm4 \n"
2383 "cvtdq2ps (%2,%0), %%xmm0 \n"
2384 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
2385 "mulps %%xmm4, %%xmm0 \n"
2386 "mulps %%xmm4, %%xmm1 \n"
2387 "movaps %%xmm0, (%1,%0) \n"
2388 "movaps %%xmm1, 16(%1,%0) \n"
2392 :"r"(dst+len), "r"(src+len), "m"(mul)
// Clamp src[i] into [min, max] via maxps/minps, 16 floats per iteration;
// min/max are splatted into xmm4/xmm5 once before the loop.
2396 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2399 x86_reg i = (len-16)*4;
2401 "movss %3, %%xmm4 \n"
2402 "movss %4, %%xmm5 \n"
2403 "shufps $0, %%xmm4, %%xmm4 \n"
2404 "shufps $0, %%xmm5, %%xmm5 \n"
2406 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2407 "movaps 16(%2,%0), %%xmm1 \n\t"
2408 "movaps 32(%2,%0), %%xmm2 \n\t"
2409 "movaps 48(%2,%0), %%xmm3 \n\t"
2410 "maxps %%xmm4, %%xmm0 \n\t"
2411 "maxps %%xmm4, %%xmm1 \n\t"
2412 "maxps %%xmm4, %%xmm2 \n\t"
2413 "maxps %%xmm4, %%xmm3 \n\t"
2414 "minps %%xmm5, %%xmm0 \n\t"
2415 "minps %%xmm5, %%xmm1 \n\t"
2416 "minps %%xmm5, %%xmm2 \n\t"
2417 "minps %%xmm5, %%xmm3 \n\t"
2418 "movaps %%xmm0, (%1,%0) \n\t"
2419 "movaps %%xmm1, 16(%1,%0) \n\t"
2420 "movaps %%xmm2, 32(%1,%0) \n\t"
2421 "movaps %%xmm3, 48(%1,%0) \n\t"
2425 :"r"(dst), "r"(src), "m"(min), "m"(max)
// float -> int16 conversion via pf2id + packssdw, 8 samples per
// iteration.  The rounding caveat below is why this variant is only
// installed when CODEC_FLAG_BITEXACT is not set (see dsputil_init_mmx).
2430 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2431 x86_reg reglen = len;
2432 // not bit-exact: pf2id uses different rounding than C and SSE
2435 "lea (%2,%0,2) , %2 \n\t"
2439 "pf2id (%2,%0,2) , %%mm0 \n\t"
2440 "pf2id 8(%2,%0,2) , %%mm1 \n\t"
2441 "pf2id 16(%2,%0,2) , %%mm2 \n\t"
2442 "pf2id 24(%2,%0,2) , %%mm3 \n\t"
2443 "packssdw %%mm1 , %%mm0 \n\t"
2444 "packssdw %%mm3 , %%mm2 \n\t"
2445 "movq %%mm0 , (%1,%0) \n\t"
2446 "movq %%mm2 , 8(%1,%0) \n\t"
2450 :"+r"(reglen), "+r"(dst), "+r"(src)
// float -> int16 via SSE cvtps2pi (two floats per convert, result in MMX
// registers) with packssdw saturation, 8 samples per iteration.
2453 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
2454 x86_reg reglen = len;
2457 "lea (%2,%0,2) , %2 \n\t"
2461 "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
2462 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
2463 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
2464 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
2465 "packssdw %%mm1 , %%mm0 \n\t"
2466 "packssdw %%mm3 , %%mm2 \n\t"
2467 "movq %%mm0 , (%1,%0) \n\t"
2468 "movq %%mm2 , 8(%1,%0) \n\t"
2472 :"+r"(reglen), "+r"(dst), "+r"(src)
// float -> int16 via SSE2 cvtps2dq, fully in xmm registers: 8 samples per
// iteration with one packssdw and a single 16-byte store.
2476 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
2477 x86_reg reglen = len;
2480 "lea (%2,%0,2) , %2 \n\t"
2484 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
2485 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
2486 "packssdw %%xmm1 , %%xmm0 \n\t"
2487 "movdqa %%xmm0 , (%1,%0) \n\t"
2490 :"+r"(reglen), "+r"(dst), "+r"(src)
// Prototypes for routines implemented in external (yasm) assembly:
// VP3 IDCT/loop filters, 6-channel float->int16 interleavers, int16
// scalar products, and HuffYUV prediction helpers.
2494 void ff_vp3_idct_mmx(int16_t *input_data);
2495 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2496 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2498 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2500 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2501 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2503 void ff_vp3_idct_sse2(int16_t *input_data);
2504 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2505 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2507 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
2508 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
2509 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
2510 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2511 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2512 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2513 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2514 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2515 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2516 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2517 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
// Fallback mappings when the yasm interleavers are unavailable: route the
// 6-channel case through the generic misc interleaver with channels=6.
// NOTE(review): presumably guarded by a HAVE_YASM #else not visible here.
2520 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
2521 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
2522 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
2524 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
// FLOAT_TO_INT16_INTERLEAVE(cpu, body): generates two functions per CPU
// flavour — a scalar misc interleaver (any channel count, converts into a
// stack temp then scatters) and a dispatcher that special-cases mono
// (straight conversion), stereo (the asm 'body' argument), and 6 channels
// (the yasm interleaver).  Body lines are '\'-continued; do not split.
2526 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
2527 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
2528 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
2529 DECLARE_ALIGNED(16, int16_t, tmp)[len];\
2531 for(c=0; c<channels; c++){\
2532 float_to_int16_##cpu(tmp, src[c], len);\
2533 for(i=0, j=c; i<len; i++, j+=channels)\
2538 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
2540 float_to_int16_##cpu(dst, src[0], len);\
2541 else if(channels==2){\
2542 x86_reg reglen = len; \
2543 const float *src0 = src[0];\
2544 const float *src1 = src[1];\
2552 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
2554 }else if(channels==6){\
2555 ff_float_to_int16_interleave6_##cpu(dst, src, len);\
2557 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
// 3DNow! stereo body: pf2id converts, packssdw saturates, punpckl/hwd
// interleaves left/right words before storing.
2560 FLOAT_TO_INT16_INTERLEAVE(3dnow,
2562 "pf2id (%2,%0), %%mm0 \n"
2563 "pf2id 8(%2,%0), %%mm1 \n"
2564 "pf2id (%3,%0), %%mm2 \n"
2565 "pf2id 8(%3,%0), %%mm3 \n"
2566 "packssdw %%mm1, %%mm0 \n"
2567 "packssdw %%mm3, %%mm2 \n"
2568 "movq %%mm0, %%mm1 \n"
2569 "punpcklwd %%mm2, %%mm0 \n"
2570 "punpckhwd %%mm2, %%mm1 \n"
2571 "movq %%mm0, (%1,%0)\n"
2572 "movq %%mm1, 8(%1,%0)\n"
// SSE stereo body: same interleave scheme as the 3DNow! version but using
// cvtps2pi for the float->int conversion (C-compatible rounding).
2578 FLOAT_TO_INT16_INTERLEAVE(sse,
2580 "cvtps2pi (%2,%0), %%mm0 \n"
2581 "cvtps2pi 8(%2,%0), %%mm1 \n"
2582 "cvtps2pi (%3,%0), %%mm2 \n"
2583 "cvtps2pi 8(%3,%0), %%mm3 \n"
2584 "packssdw %%mm1, %%mm0 \n"
2585 "packssdw %%mm3, %%mm2 \n"
2586 "movq %%mm0, %%mm1 \n"
2587 "punpcklwd %%mm2, %%mm0 \n"
2588 "punpckhwd %%mm2, %%mm1 \n"
2589 "movq %%mm0, (%1,%0)\n"
2590 "movq %%mm1, 8(%1,%0)\n"
// SSE2 stereo body: pure-xmm path — cvtps2dq, pack, then movhlps +
// punpcklwd to interleave the channels in one 16-byte store.
2596 FLOAT_TO_INT16_INTERLEAVE(sse2,
2598 "cvtps2dq (%2,%0), %%xmm0 \n"
2599 "cvtps2dq (%3,%0), %%xmm1 \n"
2600 "packssdw %%xmm1, %%xmm0 \n"
2601 "movhlps %%xmm0, %%xmm1 \n"
2602 "punpcklwd %%xmm1, %%xmm0 \n"
2603 "movdqa %%xmm0, (%1,%0) \n"
// 3DNowExt interleave: only the 6-channel case has a dedicated 3dn2
// implementation; everything else reuses the plain 3DNow! dispatcher.
2608 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
2610 ff_float_to_int16_interleave6_3dn2(dst, src, len);
2612 float_to_int16_interleave_3dnow(dst, src, len, channels);
// Implemented in external assembly.
2615 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2617 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2619 int mm_flags = av_get_cpu_flags();
2621 if (avctx->dsp_mask) {
2622 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2623 mm_flags |= (avctx->dsp_mask & 0xffff);
2625 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2629 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2630 if (mm_flags & AV_CPU_FLAG_MMX)
2631 av_log(avctx, AV_LOG_INFO, " mmx");
2632 if (mm_flags & AV_CPU_FLAG_MMX2)
2633 av_log(avctx, AV_LOG_INFO, " mmx2");
2634 if (mm_flags & AV_CPU_FLAG_3DNOW)
2635 av_log(avctx, AV_LOG_INFO, " 3dnow");
2636 if (mm_flags & AV_CPU_FLAG_SSE)
2637 av_log(avctx, AV_LOG_INFO, " sse");
2638 if (mm_flags & AV_CPU_FLAG_SSE2)
2639 av_log(avctx, AV_LOG_INFO, " sse2");
2640 av_log(avctx, AV_LOG_INFO, "\n");
2643 if (mm_flags & AV_CPU_FLAG_MMX) {
2644 const int idct_algo= avctx->idct_algo;
2646 if(avctx->lowres==0){
2647 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2648 c->idct_put= ff_simple_idct_put_mmx;
2649 c->idct_add= ff_simple_idct_add_mmx;
2650 c->idct = ff_simple_idct_mmx;
2651 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2653 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2654 if(mm_flags & AV_CPU_FLAG_MMX2){
2655 c->idct_put= ff_libmpeg2mmx2_idct_put;
2656 c->idct_add= ff_libmpeg2mmx2_idct_add;
2657 c->idct = ff_mmxext_idct;
2659 c->idct_put= ff_libmpeg2mmx_idct_put;
2660 c->idct_add= ff_libmpeg2mmx_idct_add;
2661 c->idct = ff_mmx_idct;
2663 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2665 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2666 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2667 if(mm_flags & AV_CPU_FLAG_SSE2){
2668 c->idct_put= ff_vp3_idct_put_sse2;
2669 c->idct_add= ff_vp3_idct_add_sse2;
2670 c->idct = ff_vp3_idct_sse2;
2671 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2673 c->idct_put= ff_vp3_idct_put_mmx;
2674 c->idct_add= ff_vp3_idct_add_mmx;
2675 c->idct = ff_vp3_idct_mmx;
2676 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2678 }else if(idct_algo==FF_IDCT_CAVS){
2679 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2680 }else if(idct_algo==FF_IDCT_XVIDMMX){
2681 if(mm_flags & AV_CPU_FLAG_SSE2){
2682 c->idct_put= ff_idct_xvid_sse2_put;
2683 c->idct_add= ff_idct_xvid_sse2_add;
2684 c->idct = ff_idct_xvid_sse2;
2685 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2686 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2687 c->idct_put= ff_idct_xvid_mmx2_put;
2688 c->idct_add= ff_idct_xvid_mmx2_add;
2689 c->idct = ff_idct_xvid_mmx2;
2691 c->idct_put= ff_idct_xvid_mmx_put;
2692 c->idct_add= ff_idct_xvid_mmx_add;
2693 c->idct = ff_idct_xvid_mmx;
2698 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2699 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2700 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2701 c->clear_block = clear_block_mmx;
2702 c->clear_blocks = clear_blocks_mmx;
2703 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2704 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2705 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2706 c->clear_block = clear_block_sse;
2707 c->clear_blocks = clear_blocks_sse;
2710 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2711 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2712 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2713 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2714 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2716 SET_HPEL_FUNCS(put, 0, 16, mmx);
2717 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2718 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2719 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2720 SET_HPEL_FUNCS(put, 1, 8, mmx);
2721 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2722 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2723 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2725 #if ARCH_X86_32 || !HAVE_YASM
2728 #if ARCH_X86_32 && HAVE_YASM
2729 c->emulated_edge_mc = emulated_edge_mc_mmx;
2732 c->add_bytes= add_bytes_mmx;
2733 c->add_bytes_l2= add_bytes_l2_mmx;
2735 c->draw_edges = draw_edges_mmx;
2737 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2738 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2739 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2743 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2744 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2745 c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
2747 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2748 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2751 if (mm_flags & AV_CPU_FLAG_MMX2) {
2752 c->prefetch = prefetch_mmx2;
2754 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2755 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2757 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2758 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2759 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2761 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2762 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2764 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2765 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2766 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2768 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2769 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2770 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2771 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2772 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2773 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2774 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2776 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2777 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2778 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2781 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2782 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2785 if (CONFIG_VP3_DECODER
2786 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2787 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2788 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2791 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2792 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2793 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2794 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2795 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2796 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2797 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2798 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2799 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2800 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2801 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2802 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2803 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2804 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2805 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2806 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2807 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2809 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2810 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2811 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2812 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2813 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2814 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2816 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2817 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2818 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2819 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2820 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2821 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2823 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2824 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2825 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2826 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2829 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2830 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2832 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
2834 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2835 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2836 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2837 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2839 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2841 #if HAVE_7REGS && HAVE_TEN_OPERANDS
2842 if( mm_flags&AV_CPU_FLAG_3DNOW )
2843 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2846 if (CONFIG_VC1_DECODER)
2847 ff_vc1dsp_init_mmx(c, avctx);
2849 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2850 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2851 c->prefetch = prefetch_3dnow;
2853 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2854 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2856 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2857 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2858 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2860 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2861 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2863 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2864 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2865 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2867 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2868 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2869 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2870 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2871 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2872 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2873 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2876 if (CONFIG_VP3_DECODER
2877 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2878 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2879 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2882 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2883 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2884 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2885 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2886 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2887 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2889 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2890 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2891 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2892 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2893 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2894 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2896 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2897 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2898 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2899 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2902 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2903 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2905 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
2907 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2908 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2913 #define H264_QPEL_FUNCS(x, y, CPU)\
2914 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2915 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2916 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2917 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2918 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2919 // these functions are slower than mmx on AMD, but faster on Intel
2920 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2921 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2922 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2923 H264_QPEL_FUNCS(0, 0, sse2);
2925 if(mm_flags & AV_CPU_FLAG_SSE2){
2926 H264_QPEL_FUNCS(0, 1, sse2);
2927 H264_QPEL_FUNCS(0, 2, sse2);
2928 H264_QPEL_FUNCS(0, 3, sse2);
2929 H264_QPEL_FUNCS(1, 1, sse2);
2930 H264_QPEL_FUNCS(1, 2, sse2);
2931 H264_QPEL_FUNCS(1, 3, sse2);
2932 H264_QPEL_FUNCS(2, 1, sse2);
2933 H264_QPEL_FUNCS(2, 2, sse2);
2934 H264_QPEL_FUNCS(2, 3, sse2);
2935 H264_QPEL_FUNCS(3, 1, sse2);
2936 H264_QPEL_FUNCS(3, 2, sse2);
2937 H264_QPEL_FUNCS(3, 3, sse2);
2940 if(mm_flags & AV_CPU_FLAG_SSSE3){
2941 H264_QPEL_FUNCS(1, 0, ssse3);
2942 H264_QPEL_FUNCS(1, 1, ssse3);
2943 H264_QPEL_FUNCS(1, 2, ssse3);
2944 H264_QPEL_FUNCS(1, 3, ssse3);
2945 H264_QPEL_FUNCS(2, 0, ssse3);
2946 H264_QPEL_FUNCS(2, 1, ssse3);
2947 H264_QPEL_FUNCS(2, 2, ssse3);
2948 H264_QPEL_FUNCS(2, 3, ssse3);
2949 H264_QPEL_FUNCS(3, 0, ssse3);
2950 H264_QPEL_FUNCS(3, 1, ssse3);
2951 H264_QPEL_FUNCS(3, 2, ssse3);
2952 H264_QPEL_FUNCS(3, 3, ssse3);
2953 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2955 c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
2956 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
2957 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2958 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2959 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2960 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2961 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2962 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2963 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2968 if(mm_flags & AV_CPU_FLAG_3DNOW){
2969 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2970 c->vector_fmul = vector_fmul_3dnow;
2971 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2972 c->float_to_int16 = float_to_int16_3dnow;
2973 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
2976 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2977 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2979 c->vector_fmul_window = vector_fmul_window_3dnow2;
2981 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2982 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
2985 if(mm_flags & AV_CPU_FLAG_MMX2){
2987 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2988 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2991 if(mm_flags & AV_CPU_FLAG_SSE){
2992 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2993 c->ac3_downmix = ac3_downmix_sse;
2994 c->vector_fmul = vector_fmul_sse;
2995 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2996 c->vector_fmul_add = vector_fmul_add_sse;
2998 c->vector_fmul_window = vector_fmul_window_sse;
3000 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
3001 c->vector_clipf = vector_clipf_sse;
3002 c->float_to_int16 = float_to_int16_sse;
3003 c->float_to_int16_interleave = float_to_int16_interleave_sse;
3005 c->scalarproduct_float = ff_scalarproduct_float_sse;
3008 if(mm_flags & AV_CPU_FLAG_3DNOW)
3009 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
3010 if(mm_flags & AV_CPU_FLAG_SSE2){
3011 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
3012 c->float_to_int16 = float_to_int16_sse2;
3013 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
3015 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
3016 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
3018 c->emulated_edge_mc = emulated_edge_mc_sse;
3022 if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
3023 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
3026 if (CONFIG_ENCODERS)
3027 dsputilenc_init_mmx(c, avctx);
3030 // for speed testing
3031 get_pixels = just_return;
3032 put_pixels_clamped = just_return;
3033 add_pixels_clamped = just_return;
3035 pix_abs16x16 = just_return;
3036 pix_abs16x16_x2 = just_return;
3037 pix_abs16x16_y2 = just_return;
3038 pix_abs16x16_xy2 = just_return;
3040 put_pixels_tab[0] = just_return;
3041 put_pixels_tab[1] = just_return;
3042 put_pixels_tab[2] = just_return;
3043 put_pixels_tab[3] = just_return;
3045 put_no_rnd_pixels_tab[0] = just_return;
3046 put_no_rnd_pixels_tab[1] = just_return;
3047 put_no_rnd_pixels_tab[2] = just_return;
3048 put_no_rnd_pixels_tab[3] = just_return;
3050 avg_pixels_tab[0] = just_return;
3051 avg_pixels_tab[1] = just_return;
3052 avg_pixels_tab[2] = just_return;
3053 avg_pixels_tab[3] = just_return;
3055 avg_no_rnd_pixels_tab[0] = just_return;
3056 avg_no_rnd_pixels_tab[1] = just_return;
3057 avg_no_rnd_pixels_tab[2] = just_return;
3058 avg_no_rnd_pixels_tab[3] = just_return;
3060 //av_fdct = just_return;
3061 //ff_idct = just_return;