2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/* Byte 0x01 replicated eight times and word 0x0002 replicated four times;
 * these are the memory operands loaded by the MOVQ_BONE / MOVQ_WTWO macros
 * further down in this file. */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* 0x80000000 replicated in every 32-bit lane of a 128-bit value. */
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
/* ff_pw_N: the value N (decimal in the name) replicated in every 16-bit word.
 * The uint64_t variants hold four words, the xmm_reg variants hold eight.
 * NOTE(review): DECLARE_ALIGNED and xmm_reg are defined outside this excerpt;
 * the first macro argument is presumably the alignment in bytes. */
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
/* ff_pb_XX: the byte value XX (hexadecimal in the name, e.g. ff_pb_80 is
 * 0x80 and ff_pb_A1 is 0xA1) replicated in every byte. */
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
/* ff_pd_N: the double-precision value N stored twice (two packed doubles). */
81 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
82 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* JUMPALIGN emits a ".p2align 3" assembler directive (8-byte alignment).
 * MOVQ_ZERO clears register regd by xor-ing it with itself. */
84 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
85 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* MOVQ_BFE fills every byte of regd with 0xFE: pcmpeqd of a register with
 * itself sets all bits, and paddb then adds 0xFF + 0xFF per byte, which wraps
 * to 0xFE.
 * NOTE(review): the line opening the asm statement is not visible in this
 * excerpt. */
87 #define MOVQ_BFE(regd) \
89 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
90 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Variant 1 of MOVQ_BONE / MOVQ_WTWO: load the constant from memory
 * (ff_bone = 0x01 in every byte, ff_wtwo = 0x0002 in every word).
 * NOTE(review): two definitions of each macro are visible here; the
 * preprocessor conditional that selects between them is presumably on
 * lines missing from this excerpt. */
93 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
94 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
96 // for a shared library it is better to build the constants in registers
//    than to load them from memory
/* Variant 2 of MOVQ_BONE: all-ones, then each word shifted right by 15 gives
 * 0x0001 per word, and packuswb packs those words into bytes of 0x01. */
98 #define MOVQ_BONE(regd) \
100 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
101 "psrlw $15, %%" #regd " \n\t" \
102 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
/* Variant 2 of MOVQ_WTWO: all-ones, each word shifted right by 15 (0x0001)
 * and then left by 1, leaving 0x0002 in every word. */
104 #define MOVQ_WTWO(regd) \
106 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
107 "psrlw $15, %%" #regd " \n\t" \
108 "psllw $1, %%" #regd " \n\t"::)
112 // byte-wise average without rounding: regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
113 // regr is used as temporary and receives the result; the first argument is unmodified and the second is trashed
114 // regfe is supposed to contain 0xfefefefefefefefe (it clears the low bit of each byte so the 64-bit shift cannot leak across bytes)
115 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
116 "movq " #rega ", " #regr " \n\t"\
117 "pand " #regb ", " #regr " \n\t"\
118 "pxor " #rega ", " #regb " \n\t"\
119 "pand " #regfe "," #regb " \n\t"\
120 "psrlq $1, " #regb " \n\t"\
121 "paddb " #regb ", " #regr " \n\t"
/* Byte-wise average rounding up:
 *   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * Same register contract as PAVGB_MMX_NO_RND: rega is preserved, regb is
 * trashed, regr receives the result and regfe must hold 0xfe in every byte. */
123 #define PAVGB_MMX(rega, regb, regr, regfe) \
124 "movq " #rega ", " #regr " \n\t"\
125 "por " #regb ", " #regr " \n\t"\
126 "pxor " #rega ", " #regb " \n\t"\
127 "pand " #regfe "," #regb " \n\t"\
128 "psrlq $1, " #regb " \n\t"\
129 "psubb " #regb ", " #regr " \n\t"
/* Paired version of PAVGB_MMX_NO_RND: averages (rega, regb) into regr and
 * (regc, regd) into regp without rounding, interleaving the two computations.
 * rega and regc are preserved; regb and regd are trashed. */
131 // mm6 is supposed to contain 0xfefefefefefefefe
132 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
133 "movq " #rega ", " #regr " \n\t"\
134 "movq " #regc ", " #regp " \n\t"\
135 "pand " #regb ", " #regr " \n\t"\
136 "pand " #regd ", " #regp " \n\t"\
137 "pxor " #rega ", " #regb " \n\t"\
138 "pxor " #regc ", " #regd " \n\t"\
139 "pand %%mm6, " #regb " \n\t"\
140 "pand %%mm6, " #regd " \n\t"\
141 "psrlq $1, " #regb " \n\t"\
142 "psrlq $1, " #regd " \n\t"\
143 "paddb " #regb ", " #regr " \n\t"\
144 "paddb " #regd ", " #regp " \n\t"
/* Paired version of PAVGB_MMX (average rounding up):
 *   regr = (rega | regb) - (((rega ^ regb) & mm6) >> 1)
 *   regp = (regc | regd) - (((regc ^ regd) & mm6) >> 1)
 * mm6 must hold 0xfe in every byte, as for the no-rounding variant. */
146 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
147 "movq " #rega ", " #regr " \n\t"\
148 "movq " #regc ", " #regp " \n\t"\
149 "por " #regb ", " #regr " \n\t"\
150 "por " #regd ", " #regp " \n\t"\
151 "pxor " #rega ", " #regb " \n\t"\
152 "pxor " #regc ", " #regd " \n\t"\
153 "pand %%mm6, " #regb " \n\t"\
154 "pand %%mm6, " #regd " \n\t"\
155 "psrlq $1, " #regd " \n\t"\
156 "psrlq $1, " #regb " \n\t"\
157 "psubb " #regb ", " #regr " \n\t"\
158 "psubb " #regd ", " #regp " \n\t"
/* The sections below set a handful of parameter macros and then include a
 * template source file.
 * NOTE(review): the templates are not part of this excerpt; presumably they
 * expand DEF() to name the functions they generate. The #undef lines that
 * must separate the repeated definitions of DEF / SET_RND / PAVGB / PAVGBP
 * are also not visible here. */
160 /***********************************/
161 /* MMX no rounding */
/* DEF(x, y) yields names of the form x_no_rnd_y_mmx; the averaging helpers
 * map to the truncating PAVGB*_MMX_NO_RND macros, OP_AVG to the rounding one. */
162 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
163 #define SET_RND MOVQ_WONE
164 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
165 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
166 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
168 #include "dsputil_mmx_rnd_template.c"
174 /***********************************/
/* MMX with rounding: DEF(x, y) yields x_y_mmx and the averaging helpers map
 * to the rounding PAVGB*_MMX macros. */
177 #define DEF(x, y) x ## _ ## y ##_mmx
178 #define SET_RND MOVQ_WTWO
179 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
180 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
182 #include "dsputil_mmx_rnd_template.c"
190 /***********************************/
/* 3DNow! instantiation: names get the _3dnow suffix and PAVGB is the
 * "pavgusb" instruction mnemonic. */
193 #define DEF(x) x ## _3dnow
194 #define PAVGB "pavgusb"
197 #include "dsputil_mmx_avg_template.c"
203 /***********************************/
/* MMX2 instantiation: names get the _mmx2 suffix. */
206 #define DEF(x) x ## _mmx2
208 /* Introduced only in MMX2 set */
209 #define PAVGB "pavgb"
212 #include "dsputil_mmx_avg_template.c"
/* Name aliases: the no_rnd, mmx2 and 3dnow variants of the plain "put"
 * functions all resolve (directly or through another alias) to the
 * put_pixelsN_mmx functions defined below. */
218 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
219 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
220 #define put_pixels16_mmx2 put_pixels16_mmx
221 #define put_pixels8_mmx2 put_pixels8_mmx
222 #define put_pixels4_mmx2 put_pixels4_mmx
223 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
224 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
225 #define put_pixels16_3dnow put_pixels16_mmx
226 #define put_pixels8_3dnow put_pixels8_mmx
227 #define put_pixels4_3dnow put_pixels4_mmx
228 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
229 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
231 /***********************************/
231 /***********************************/
/**
 * Store coefficients from block into pixels as unsigned bytes, clamping via
 * packuswb (signed 16-bit words packed to bytes with unsigned saturation).
 *
 * Each of the two visible asm sequences reads 64 bytes of coefficients
 * (four rows of eight words) and writes four 8-byte rows at pix,
 * pix + line_size, pix + 2*line_size and pix + 3*line_size.
 *
 * NOTE(review): the declarations of pix and p, and the code that advances
 * them between the two sequences, are on lines missing from this excerpt.
 */
234 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
239 /* read the pixels */
244 "movq %3, %%mm0 \n\t"
245 "movq 8%3, %%mm1 \n\t"
246 "movq 16%3, %%mm2 \n\t"
247 "movq 24%3, %%mm3 \n\t"
248 "movq 32%3, %%mm4 \n\t"
249 "movq 40%3, %%mm5 \n\t"
250 "movq 48%3, %%mm6 \n\t"
251 "movq 56%3, %%mm7 \n\t"
252 "packuswb %%mm1, %%mm0 \n\t"
253 "packuswb %%mm3, %%mm2 \n\t"
254 "packuswb %%mm5, %%mm4 \n\t"
255 "packuswb %%mm7, %%mm6 \n\t"
256 "movq %%mm0, (%0) \n\t"
257 "movq %%mm2, (%0, %1) \n\t"
258 "movq %%mm4, (%0, %1, 2) \n\t"
259 "movq %%mm6, (%0, %2) \n\t"
260 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
265 // if this were an exact copy of the code above, the
266 // compiler would generate some very strange code
//     (hence this second sequence takes p in a register instead of as a memory operand)
269 "movq (%3), %%mm0 \n\t"
270 "movq 8(%3), %%mm1 \n\t"
271 "movq 16(%3), %%mm2 \n\t"
272 "movq 24(%3), %%mm3 \n\t"
273 "movq 32(%3), %%mm4 \n\t"
274 "movq 40(%3), %%mm5 \n\t"
275 "movq 48(%3), %%mm6 \n\t"
276 "movq 56(%3), %%mm7 \n\t"
277 "packuswb %%mm1, %%mm0 \n\t"
278 "packuswb %%mm3, %%mm2 \n\t"
279 "packuswb %%mm5, %%mm4 \n\t"
280 "packuswb %%mm7, %%mm6 \n\t"
281 "movq %%mm0, (%0) \n\t"
282 "movq %%mm2, (%0, %1) \n\t"
283 "movq %%mm4, (%0, %1, 2) \n\t"
284 "movq %%mm6, (%0, %2) \n\t"
285 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* Asm fragment that converts four rows of eight 16-bit coefficients, starting
 * at byte offset off from operand %2, into four rows of eight bytes.
 * packsswb packs each row with signed saturation and paddb then adds mm0 to
 * every byte (the caller below loads ff_pb_80 into mm0, i.e. a +128 bias).
 * Rows are stored at (%0), (%0 + %3), (%0 + 2*%3) and (%0 + %1); at the call
 * site %0 is pixels, %3 the line stride and %1 three times the stride.
 * Clobbers mm1-mm4. */
289 #define put_signed_pixels_clamped_mmx_half(off) \
290 "movq "#off"(%2), %%mm1 \n\t"\
291 "movq 16+"#off"(%2), %%mm2 \n\t"\
292 "movq 32+"#off"(%2), %%mm3 \n\t"\
293 "movq 48+"#off"(%2), %%mm4 \n\t"\
294 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
295 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
296 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
297 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
298 "paddb %%mm0, %%mm1 \n\t"\
299 "paddb %%mm0, %%mm2 \n\t"\
300 "paddb %%mm0, %%mm3 \n\t"\
301 "paddb %%mm0, %%mm4 \n\t"\
302 "movq %%mm1, (%0) \n\t"\
303 "movq %%mm2, (%0, %3) \n\t"\
304 "movq %%mm3, (%0, %3, 2) \n\t"\
305 "movq %%mm4, (%0, %1) \n\t"
/**
 * Write an 8x8 block of signed coefficients to pixels as bytes: each value is
 * saturated to a signed byte and then offset by 0x80 (ff_pb_80 in mm0).
 *
 * The asm computes 3*line_skip into operand %1, converts the first four rows
 * (coefficient offset 0), advances pixels by 4*line_skip and converts the
 * remaining four rows (coefficient offset 64).
 *
 * NOTE(review): the declaration of line_skip3 and the asm clobber list are on
 * lines missing from this excerpt.
 */
307 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
309 x86_reg line_skip = line_size;
313 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
314 "lea (%3, %3, 2), %1 \n\t"
315 put_signed_pixels_clamped_mmx_half(0)
316 "lea (%0, %3, 4), %0 \n\t"
317 put_signed_pixels_clamped_mmx_half(64)
318 :"+&r" (pixels), "=&r" (line_skip3)
319 :"r" (block), "r"(line_skip)
/**
 * Add coefficients from block to the existing pixels, clamping to bytes.
 *
 * The visible asm handles two pixel rows per execution: it loads 32 bytes of
 * coefficients from operand %2, widens the two 8-byte pixel rows (%0 and %1,
 * i.e. *pix and *(pix + line_size)) to 16-bit words by interleaving with mm7,
 * adds with signed saturation (paddsw), packs back with unsigned saturation
 * (packuswb) and stores the rows in place.
 *
 * NOTE(review): mm7 must be zero for the unpack to zero-extend; the code that
 * clears it, the enclosing loop and the input operand list are on lines
 * missing from this excerpt.
 */
323 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
329 /* read the pixels */
336 "movq (%2), %%mm0 \n\t"
337 "movq 8(%2), %%mm1 \n\t"
338 "movq 16(%2), %%mm2 \n\t"
339 "movq 24(%2), %%mm3 \n\t"
340 "movq %0, %%mm4 \n\t"
341 "movq %1, %%mm6 \n\t"
342 "movq %%mm4, %%mm5 \n\t"
343 "punpcklbw %%mm7, %%mm4 \n\t"
344 "punpckhbw %%mm7, %%mm5 \n\t"
345 "paddsw %%mm4, %%mm0 \n\t"
346 "paddsw %%mm5, %%mm1 \n\t"
347 "movq %%mm6, %%mm5 \n\t"
348 "punpcklbw %%mm7, %%mm6 \n\t"
349 "punpckhbw %%mm7, %%mm5 \n\t"
350 "paddsw %%mm6, %%mm2 \n\t"
351 "paddsw %%mm5, %%mm3 \n\t"
352 "packuswb %%mm1, %%mm0 \n\t"
353 "packuswb %%mm3, %%mm2 \n\t"
354 "movq %%mm0, %0 \n\t"
355 "movq %%mm2, %1 \n\t"
356 :"+m"(*pix), "+m"(*(pix+line_size))
/**
 * Copy rows of 4 bytes from pixels to block; both use line_size as stride.
 *
 * The visible asm body copies four rows: two rows, advance both pointers by
 * 2*line_size (held in REG_a), then two more rows and another advance.
 *
 * NOTE(review): the loop label and the instructions that count h down are on
 * lines missing from this excerpt; presumably h is reduced by 4 per pass.
 */
364 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
367 "lea (%3, %3), %%"REG_a" \n\t"
370 "movd (%1), %%mm0 \n\t"
371 "movd (%1, %3), %%mm1 \n\t"
372 "movd %%mm0, (%2) \n\t"
373 "movd %%mm1, (%2, %3) \n\t"
374 "add %%"REG_a", %1 \n\t"
375 "add %%"REG_a", %2 \n\t"
376 "movd (%1), %%mm0 \n\t"
377 "movd (%1, %3), %%mm1 \n\t"
378 "movd %%mm0, (%2) \n\t"
379 "movd %%mm1, (%2, %3) \n\t"
380 "add %%"REG_a", %1 \n\t"
381 "add %%"REG_a", %2 \n\t"
384 : "+g"(h), "+r" (pixels), "+r" (block)
385 : "r"((x86_reg)line_size)
/**
 * Copy rows of 8 bytes from pixels to block; both use line_size as stride.
 *
 * Same structure as put_pixels4_mmx but with 64-bit movq transfers: four rows
 * per visible pass, pointers advanced by 2*line_size (REG_a) after each pair.
 *
 * NOTE(review): the loop label and the h countdown are on lines missing from
 * this excerpt.
 */
390 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
393 "lea (%3, %3), %%"REG_a" \n\t"
396 "movq (%1), %%mm0 \n\t"
397 "movq (%1, %3), %%mm1 \n\t"
398 "movq %%mm0, (%2) \n\t"
399 "movq %%mm1, (%2, %3) \n\t"
400 "add %%"REG_a", %1 \n\t"
401 "add %%"REG_a", %2 \n\t"
402 "movq (%1), %%mm0 \n\t"
403 "movq (%1, %3), %%mm1 \n\t"
404 "movq %%mm0, (%2) \n\t"
405 "movq %%mm1, (%2, %3) \n\t"
406 "add %%"REG_a", %1 \n\t"
407 "add %%"REG_a", %2 \n\t"
410 : "+g"(h), "+r" (pixels), "+r" (block)
411 : "r"((x86_reg)line_size)
/**
 * Copy rows of 16 bytes from pixels to block; both use line_size as stride.
 *
 * Each row is moved as two 8-byte halves (offsets 0 and 8). The visible asm
 * body handles four rows, advancing both pointers by 2*line_size (REG_a)
 * after each pair of rows.
 *
 * NOTE(review): the loop label and the h countdown are on lines missing from
 * this excerpt.
 */
416 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
419 "lea (%3, %3), %%"REG_a" \n\t"
422 "movq (%1), %%mm0 \n\t"
423 "movq 8(%1), %%mm4 \n\t"
424 "movq (%1, %3), %%mm1 \n\t"
425 "movq 8(%1, %3), %%mm5 \n\t"
426 "movq %%mm0, (%2) \n\t"
427 "movq %%mm4, 8(%2) \n\t"
428 "movq %%mm1, (%2, %3) \n\t"
429 "movq %%mm5, 8(%2, %3) \n\t"
430 "add %%"REG_a", %1 \n\t"
431 "add %%"REG_a", %2 \n\t"
432 "movq (%1), %%mm0 \n\t"
433 "movq 8(%1), %%mm4 \n\t"
434 "movq (%1, %3), %%mm1 \n\t"
435 "movq 8(%1, %3), %%mm5 \n\t"
436 "movq %%mm0, (%2) \n\t"
437 "movq %%mm4, 8(%2) \n\t"
438 "movq %%mm1, (%2, %3) \n\t"
439 "movq %%mm5, 8(%2, %3) \n\t"
440 "add %%"REG_a", %1 \n\t"
441 "add %%"REG_a", %2 \n\t"
444 : "+g"(h), "+r" (pixels), "+r" (block)
445 : "r"((x86_reg)line_size)
/**
 * Copy rows of 16 bytes from pixels to block using SSE2, four rows per pass.
 *
 * Source rows are read with movdqu (no alignment requirement); destination
 * rows are written with movdqa, so every destination row address must be
 * 16-byte aligned. Operand %3 is line_size and %4 is 3*line_size; both
 * pointers advance by 4*line_size per pass.
 *
 * NOTE(review): the loop label and the h countdown are on lines missing from
 * this excerpt.
 */
450 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
454 "movdqu (%1), %%xmm0 \n\t"
455 "movdqu (%1,%3), %%xmm1 \n\t"
456 "movdqu (%1,%3,2), %%xmm2 \n\t"
457 "movdqu (%1,%4), %%xmm3 \n\t"
458 "movdqa %%xmm0, (%2) \n\t"
459 "movdqa %%xmm1, (%2,%3) \n\t"
460 "movdqa %%xmm2, (%2,%3,2) \n\t"
461 "movdqa %%xmm3, (%2,%4) \n\t"
463 "lea (%1,%3,4), %1 \n\t"
464 "lea (%2,%3,4), %2 \n\t"
466 : "+g"(h), "+r" (pixels), "+r" (block)
467 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/**
 * Average rows of 16 bytes from pixels into block using SSE2, four rows per
 * pass: block[row] = pavgb(pixels[row], block[row]) (byte average, rounding
 * up).
 *
 * Source rows are read with movdqu; the destination is used both as a pavgb
 * memory operand and as a movdqa store target, so every destination row
 * address must be 16-byte aligned. Operand %3 is line_size and %4 is
 * 3*line_size; both pointers advance by 4*line_size per pass.
 *
 * NOTE(review): the loop label and the h countdown are on lines missing from
 * this excerpt.
 */
472 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
476 "movdqu (%1), %%xmm0 \n\t"
477 "movdqu (%1,%3), %%xmm1 \n\t"
478 "movdqu (%1,%3,2), %%xmm2 \n\t"
479 "movdqu (%1,%4), %%xmm3 \n\t"
480 "pavgb (%2), %%xmm0 \n\t"
481 "pavgb (%2,%3), %%xmm1 \n\t"
482 "pavgb (%2,%3,2), %%xmm2 \n\t"
483 "pavgb (%2,%4), %%xmm3 \n\t"
484 "movdqa %%xmm0, (%2) \n\t"
485 "movdqa %%xmm1, (%2,%3) \n\t"
486 "movdqa %%xmm2, (%2,%3,2) \n\t"
487 "movdqa %%xmm3, (%2,%4) \n\t"
489 "lea (%1,%3,4), %1 \n\t"
490 "lea (%2,%3,4), %2 \n\t"
492 : "+g"(h), "+r" (pixels), "+r" (block)
493 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* CLEAR_BLOCKS(name, n) defines a function name(blocks) that writes zeros
 * (mm7 after pxor) 32 bytes at a time. Operand %0 points 128*n bytes past the
 * start of blocks, and REG_a is loaded from operand %1 and stepped by 32 as
 * the index.
 * NOTE(review): operand %1 and the loop branch are on lines missing from this
 * excerpt; presumably %1 is -128*n so that the index runs up to zero, which
 * would clear n blocks of 128 bytes each. */
498 #define CLEAR_BLOCKS(name,n) \
499 static void name(DCTELEM *blocks)\
502 "pxor %%mm7, %%mm7 \n\t"\
503 "mov %1, %%"REG_a" \n\t"\
505 "movq %%mm7, (%0, %%"REG_a") \n\t"\
506 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
507 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
508 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
509 "add $32, %%"REG_a" \n\t"\
511 : : "r" (((uint8_t *)blocks)+128*n),\
516 CLEAR_BLOCKS(clear_blocks_mmx, 6)
517 CLEAR_BLOCKS(clear_block_mmx, 1)
/**
 * Zero 128 bytes starting at operand %0 with eight aligned 16-byte SSE
 * stores (movaps), so the target must be 16-byte aligned.
 *
 * NOTE(review): the operand list is on lines missing from this excerpt;
 * %0 is presumably block.
 */
519 static void clear_block_sse(DCTELEM *block)
522 "xorps %%xmm0, %%xmm0 \n"
523 "movaps %%xmm0, (%0) \n"
524 "movaps %%xmm0, 16(%0) \n"
525 "movaps %%xmm0, 32(%0) \n"
526 "movaps %%xmm0, 48(%0) \n"
527 "movaps %%xmm0, 64(%0) \n"
528 "movaps %%xmm0, 80(%0) \n"
529 "movaps %%xmm0, 96(%0) \n"
530 "movaps %%xmm0, 112(%0) \n"
/**
 * Zero consecutive blocks with aligned SSE stores, 128 bytes per pass.
 *
 * Operand %0 points 128*6 bytes past the start of blocks; REG_a is loaded
 * from operand %1 and used as an index that grows by 128 per pass.
 *
 * NOTE(review): operand %1 and the loop branch are on lines missing from this
 * excerpt; presumably %1 is -128*6 so that six 128-byte blocks are cleared.
 */
536 static void clear_blocks_sse(DCTELEM *blocks)
539 "xorps %%xmm0, %%xmm0 \n"
540 "mov %1, %%"REG_a" \n"
542 "movaps %%xmm0, (%0, %%"REG_a") \n"
543 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
544 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
550 "add $128, %%"REG_a" \n"
552 : : "r" (((uint8_t *)blocks)+128*6),
/**
 * Byte-wise wrapping addition in place: dst[i] += src[i] for w bytes.
 *
 * The asm body processes 16 bytes per pass (two paddb on 8-byte groups) with
 * %0 as the running index, %1 = src, %2 = dst and %3 = w-15 as the bound; the
 * C statement after it handles the remaining bytes one at a time.
 *
 * NOTE(review): the index declaration, the asm loop control and the scalar
 * loop header are on lines missing from this excerpt.
 */
558 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
563 "movq (%1, %0), %%mm0 \n\t"
564 "movq (%2, %0), %%mm1 \n\t"
565 "paddb %%mm0, %%mm1 \n\t"
566 "movq %%mm1, (%2, %0) \n\t"
567 "movq 8(%1, %0), %%mm0 \n\t"
568 "movq 8(%2, %0), %%mm1 \n\t"
569 "paddb %%mm0, %%mm1 \n\t"
570 "movq %%mm1, 8(%2, %0) \n\t"
576 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
579 dst[i+0] += src[i+0];
/**
 * Byte-wise wrapping addition of two sources: dst[i] = src1[i] + src2[i]
 * for w bytes.
 *
 * The asm body processes 16 bytes per pass with %0 as the running index,
 * %1 = dst, %2 = src1, %3 = src2 and %4 = w-15 as the bound; the C statement
 * after it handles the remaining bytes one at a time.
 *
 * NOTE(review): the index declaration, the asm loop control and the scalar
 * loop header are on lines missing from this excerpt.
 */
582 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
587 "movq (%2, %0), %%mm0 \n\t"
588 "movq 8(%2, %0), %%mm1 \n\t"
589 "paddb (%3, %0), %%mm0 \n\t"
590 "paddb 8(%3, %0), %%mm1 \n\t"
591 "movq %%mm0, (%1, %0) \n\t"
592 "movq %%mm1, 8(%1, %0) \n\t"
598 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
601 dst[i] = src1[i] + src2[i];
604 #if HAVE_7REGS && HAVE_TEN_OPERANDS
/**
 * Scalar asm routine working on dst, top and diff, all addressed relative to
 * their end (base + w) through index operand %4 (w2).
 *
 * l and tl start as the low bytes of *left and *left_top. In the visible
 * instructions a byte of top is loaded into operand %2, a byte of diff is
 * added to the low byte of operand %0 (l), and that byte is stored to dst.
 *
 * NOTE(review): the function name suggests a median predictor built with
 * cmov, but the instructions that compute it, the loop control and the
 * write-back of *left / *left_top are on lines missing from this excerpt.
 */
605 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
608 int l = *left & 0xff;
609 int tl = *left_top & 0xff;
614 "movzbl (%3,%4), %2 \n"
627 "add (%6,%4), %b0 \n"
628 "mov %b0, (%5,%4) \n"
631 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
632 :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* Asm fragment shared by the two H.263 loop filter functions below.
 *
 * Inputs (as bound at the call sites): %0..%3 are four consecutive 8-byte
 * rows across the edge, %4 is 2*strength and %5 is ff_pb_FC.
 * Outputs are left in registers for the caller to store: mm5 = new row %0,
 * mm3 = new row %1, mm4 = new row %2, mm6 = new row %3.
 *
 * The fragment starts from ((row2 - row1) << 2) + (row0 - row3) computed on
 * 16-bit words, takes its magnitude >> 3, shapes it against the replicated
 * 2*strength byte, and applies the result with opposite signs to rows %1 and
 * %2; a second, smaller correction derived from row0 - row3 is applied to
 * rows %0 and %3. All of mm0-mm7 are clobbered. */
639 #define H263_LOOP_FILTER \
640 "pxor %%mm7, %%mm7 \n\t"\
641 "movq %0, %%mm0 \n\t"\
642 "movq %0, %%mm1 \n\t"\
643 "movq %3, %%mm2 \n\t"\
644 "movq %3, %%mm3 \n\t"\
645 "punpcklbw %%mm7, %%mm0 \n\t"\
646 "punpckhbw %%mm7, %%mm1 \n\t"\
647 "punpcklbw %%mm7, %%mm2 \n\t"\
648 "punpckhbw %%mm7, %%mm3 \n\t"\
649 "psubw %%mm2, %%mm0 \n\t"\
650 "psubw %%mm3, %%mm1 \n\t"\
651 "movq %1, %%mm2 \n\t"\
652 "movq %1, %%mm3 \n\t"\
653 "movq %2, %%mm4 \n\t"\
654 "movq %2, %%mm5 \n\t"\
655 "punpcklbw %%mm7, %%mm2 \n\t"\
656 "punpckhbw %%mm7, %%mm3 \n\t"\
657 "punpcklbw %%mm7, %%mm4 \n\t"\
658 "punpckhbw %%mm7, %%mm5 \n\t"\
659 "psubw %%mm2, %%mm4 \n\t"\
660 "psubw %%mm3, %%mm5 \n\t"\
661 "psllw $2, %%mm4 \n\t"\
662 "psllw $2, %%mm5 \n\t"\
663 "paddw %%mm0, %%mm4 \n\t"\
664 "paddw %%mm1, %%mm5 \n\t"\
665 "pxor %%mm6, %%mm6 \n\t"\
666 "pcmpgtw %%mm4, %%mm6 \n\t"\
667 "pcmpgtw %%mm5, %%mm7 \n\t"\
668 "pxor %%mm6, %%mm4 \n\t"\
669 "pxor %%mm7, %%mm5 \n\t"\
670 "psubw %%mm6, %%mm4 \n\t"\
671 "psubw %%mm7, %%mm5 \n\t"\
672 "psrlw $3, %%mm4 \n\t"\
673 "psrlw $3, %%mm5 \n\t"\
674 "packuswb %%mm5, %%mm4 \n\t"\
675 "packsswb %%mm7, %%mm6 \n\t"\
676 "pxor %%mm7, %%mm7 \n\t"\
677 "movd %4, %%mm2 \n\t"\
678 "punpcklbw %%mm2, %%mm2 \n\t"\
679 "punpcklbw %%mm2, %%mm2 \n\t"\
680 "punpcklbw %%mm2, %%mm2 \n\t"\
681 "psubusb %%mm4, %%mm2 \n\t"\
682 "movq %%mm2, %%mm3 \n\t"\
683 "psubusb %%mm4, %%mm3 \n\t"\
684 "psubb %%mm3, %%mm2 \n\t"\
685 "movq %1, %%mm3 \n\t"\
686 "movq %2, %%mm4 \n\t"\
687 "pxor %%mm6, %%mm3 \n\t"\
688 "pxor %%mm6, %%mm4 \n\t"\
689 "paddusb %%mm2, %%mm3 \n\t"\
690 "psubusb %%mm2, %%mm4 \n\t"\
691 "pxor %%mm6, %%mm3 \n\t"\
692 "pxor %%mm6, %%mm4 \n\t"\
693 "paddusb %%mm2, %%mm2 \n\t"\
694 "packsswb %%mm1, %%mm0 \n\t"\
695 "pcmpgtb %%mm0, %%mm7 \n\t"\
696 "pxor %%mm7, %%mm0 \n\t"\
697 "psubb %%mm7, %%mm0 \n\t"\
698 "movq %%mm0, %%mm1 \n\t"\
699 "psubusb %%mm2, %%mm0 \n\t"\
700 "psubb %%mm0, %%mm1 \n\t"\
701 "pand %5, %%mm1 \n\t"\
702 "psrlw $2, %%mm1 \n\t"\
703 "pxor %%mm7, %%mm1 \n\t"\
704 "psubb %%mm7, %%mm1 \n\t"\
705 "movq %0, %%mm5 \n\t"\
706 "movq %3, %%mm6 \n\t"\
707 "psubb %%mm1, %%mm5 \n\t"\
708 "paddb %%mm1, %%mm6 \n\t"
/**
 * H.263 loop filter across a horizontal edge, 8 pixels wide.
 *
 * The four rows src - 2*stride, src - stride, src and src + stride are bound
 * as read/write memory operands %0..%3; after the filter fragment the results
 * are written back from mm5 (%0), mm3 (%1), mm4 (%2) and mm6 (%3).
 * The strength is looked up from ff_h263_loop_filter_strength[qscale] and
 * passed doubled as operand %4; operand %5 is ff_pb_FC.
 * The body is only active when an H.263 decoder or encoder is configured.
 *
 * NOTE(review): the line that expands H263_LOOP_FILTER inside the asm
 * statement is missing from this excerpt.
 */
710 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
711 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
712 const int strength= ff_h263_loop_filter_strength[qscale];
718 "movq %%mm3, %1 \n\t"
719 "movq %%mm4, %2 \n\t"
720 "movq %%mm5, %0 \n\t"
721 "movq %%mm6, %3 \n\t"
722 : "+m" (*(uint64_t*)(src - 2*stride)),
723 "+m" (*(uint64_t*)(src - 1*stride)),
724 "+m" (*(uint64_t*)(src + 0*stride)),
725 "+m" (*(uint64_t*)(src + 1*stride))
726 : "g" (2*strength), "m"(ff_pb_FC)
/**
 * H.263 loop filter across a vertical edge, 8 rows high.
 *
 * Two transpose4x4 calls gather rows from src and src + 4*stride into the
 * 32-byte scratch buffer temp (destination stride 8), the shared filter
 * fragment runs on it, and the second asm statement interleaves the four
 * result registers (mm5, mm3, mm4, mm6) back into column order and stores
 * 4 bytes per row: four rows through operand %0 and four rows through
 * operand %1 (src + 4*stride), with %2 = stride and %3 = 3*stride.
 *
 * NOTE(review): the source offset passed to transpose4x4 and the operand %0
 * of the store sequence are on lines missing or garbled in this excerpt.
 */
731 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
732 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
733 const int strength= ff_h263_loop_filter_strength[qscale];
734 DECLARE_ALIGNED(8, uint64_t, temp)[4];
735 uint8_t *btemp= (uint8_t*)temp;
739 transpose4x4(btemp , src , 8, stride);
740 transpose4x4(btemp+4, src + 4*stride, 8, stride);
742 H263_LOOP_FILTER // 5 3 4 6
748 : "g" (2*strength), "m"(ff_pb_FC)
752 "movq %%mm5, %%mm1 \n\t"
753 "movq %%mm4, %%mm0 \n\t"
754 "punpcklbw %%mm3, %%mm5 \n\t"
755 "punpcklbw %%mm6, %%mm4 \n\t"
756 "punpckhbw %%mm3, %%mm1 \n\t"
757 "punpckhbw %%mm6, %%mm0 \n\t"
758 "movq %%mm5, %%mm3 \n\t"
759 "movq %%mm1, %%mm6 \n\t"
760 "punpcklwd %%mm4, %%mm5 \n\t"
761 "punpcklwd %%mm0, %%mm1 \n\t"
762 "punpckhwd %%mm4, %%mm3 \n\t"
763 "punpckhwd %%mm0, %%mm6 \n\t"
764 "movd %%mm5, (%0) \n\t"
765 "punpckhdq %%mm5, %%mm5 \n\t"
766 "movd %%mm5, (%0,%2) \n\t"
767 "movd %%mm3, (%0,%2,2) \n\t"
768 "punpckhdq %%mm3, %%mm3 \n\t"
769 "movd %%mm3, (%0,%3) \n\t"
770 "movd %%mm1, (%1) \n\t"
771 "punpckhdq %%mm1, %%mm1 \n\t"
772 "movd %%mm1, (%1,%2) \n\t"
773 "movd %%mm6, (%1,%2,2) \n\t"
774 "punpckhdq %%mm6, %%mm6 \n\t"
775 "movd %%mm6, (%1,%3) \n\t"
777 "r" (src + 4*stride),
778 "r" ((x86_reg) stride ),
779 "r" ((x86_reg)(3*stride))
784 /* draw the edges of width 'w' of an image of size width, height
785 this mmx version can only handle w==8 || w==16

   Left/right edges: for each row, the first pixel is replicated into the
   8 (or 16) bytes before the row and the last pixel into the 8 (or 16)
   bytes after it. In those asm statements %1 is wrap (the row stride),
   %2 is width and %3 is ptr + wrap*height (one past the last row).

   Top/bottom edges: 8-byte groups of the first row (resp. last_line) are
   copied into four rows at a time above (resp. below) the image, starting
   w bytes to the left so the corner areas are covered too.

   NOTE(review): the loop over i, the asm loop control and the w==8 / w==16
   branch are on lines missing from this excerpt. */
786 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
788 uint8_t *ptr, *last_line;
791 last_line = buf + (height - 1) * wrap;
798 "movd (%0), %%mm0 \n\t"
799 "punpcklbw %%mm0, %%mm0 \n\t"
800 "punpcklwd %%mm0, %%mm0 \n\t"
801 "punpckldq %%mm0, %%mm0 \n\t"
802 "movq %%mm0, -8(%0) \n\t"
803 "movq -8(%0, %2), %%mm1 \n\t"
804 "punpckhbw %%mm1, %%mm1 \n\t"
805 "punpckhwd %%mm1, %%mm1 \n\t"
806 "punpckhdq %%mm1, %%mm1 \n\t"
807 "movq %%mm1, (%0, %2) \n\t"
812 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
819 "movd (%0), %%mm0 \n\t"
820 "punpcklbw %%mm0, %%mm0 \n\t"
821 "punpcklwd %%mm0, %%mm0 \n\t"
822 "punpckldq %%mm0, %%mm0 \n\t"
823 "movq %%mm0, -8(%0) \n\t"
824 "movq %%mm0, -16(%0) \n\t"
825 "movq -8(%0, %2), %%mm1 \n\t"
826 "punpckhbw %%mm1, %%mm1 \n\t"
827 "punpckhwd %%mm1, %%mm1 \n\t"
828 "punpckhdq %%mm1, %%mm1 \n\t"
829 "movq %%mm1, (%0, %2) \n\t"
830 "movq %%mm1, 8(%0, %2) \n\t"
835 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
840 /* top and bottom (and hopefully also the corners) */
841 ptr= buf - (i + 1) * wrap - w;
844 "movq (%1, %0), %%mm0 \n\t"
845 "movq %%mm0, (%0) \n\t"
846 "movq %%mm0, (%0, %2) \n\t"
847 "movq %%mm0, (%0, %2, 2) \n\t"
848 "movq %%mm0, (%0, %3) \n\t"
853 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
855 ptr= last_line + (i + 1) * wrap - w;
858 "movq (%1, %0), %%mm0 \n\t"
859 "movq %%mm0, (%0) \n\t"
860 "movq %%mm0, (%0, %2) \n\t"
861 "movq %%mm0, (%0, %2, 2) \n\t"
862 "movq %%mm0, (%0, %3) \n\t"
867 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* PAETH(cpu, abs3) defines add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp).
 *
 * Asm operands: %0 is the running byte index, %1 = dst, %2 = top, %3 = src,
 * %4 = bpp and %5 = end. Each step handles 4 bytes (movd), widened to 16-bit
 * words against mm7 = 0: it forms two differences from the neighbouring
 * samples and their sum, uses compare masks on those to select one of three
 * candidate predictors, adds the bytes loaded from src and stores the packed
 * result to dst.
 *
 * abs3 is an asm fragment replacing mm3, mm4 and mm5 by their absolute
 * values. Two fragments follow the macro: one built from psubw/pmaxsw
 * (max(x, -x)), which re-zeroes mm7 afterwards, and one using pabsw.
 *
 * NOTE(review): the declarations of the index and end, the abs3 expansion
 * point, the loop control, the load of mm5 before the final pand, and the
 * #define lines naming the two ABS3 fragments are on lines missing from this
 * excerpt. */
872 #define PAETH(cpu, abs3)\
873 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
878 "pxor %%mm7, %%mm7 \n"\
879 "movd (%1,%0), %%mm0 \n"\
880 "movd (%2,%0), %%mm1 \n"\
881 "punpcklbw %%mm7, %%mm0 \n"\
882 "punpcklbw %%mm7, %%mm1 \n"\
885 "movq %%mm1, %%mm2 \n"\
886 "movd (%2,%0), %%mm1 \n"\
887 "movq %%mm2, %%mm3 \n"\
888 "punpcklbw %%mm7, %%mm1 \n"\
889 "movq %%mm2, %%mm4 \n"\
890 "psubw %%mm1, %%mm3 \n"\
891 "psubw %%mm0, %%mm4 \n"\
892 "movq %%mm3, %%mm5 \n"\
893 "paddw %%mm4, %%mm5 \n"\
895 "movq %%mm4, %%mm6 \n"\
896 "pminsw %%mm5, %%mm6 \n"\
897 "pcmpgtw %%mm6, %%mm3 \n"\
898 "pcmpgtw %%mm5, %%mm4 \n"\
899 "movq %%mm4, %%mm6 \n"\
900 "pand %%mm3, %%mm4 \n"\
901 "pandn %%mm3, %%mm6 \n"\
902 "pandn %%mm0, %%mm3 \n"\
903 "movd (%3,%0), %%mm0 \n"\
904 "pand %%mm1, %%mm6 \n"\
905 "pand %%mm4, %%mm2 \n"\
906 "punpcklbw %%mm7, %%mm0 \n"\
908 "paddw %%mm6, %%mm0 \n"\
909 "paddw %%mm2, %%mm3 \n"\
910 "paddw %%mm3, %%mm0 \n"\
911 "pand %%mm5, %%mm0 \n"\
912 "movq %%mm0, %%mm3 \n"\
913 "packuswb %%mm3, %%mm3 \n"\
914 "movd %%mm3, (%1,%0) \n"\
919 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
926 "psubw %%mm5, %%mm7 \n"\
927 "pmaxsw %%mm7, %%mm5 \n"\
928 "pxor %%mm6, %%mm6 \n"\
929 "pxor %%mm7, %%mm7 \n"\
930 "psubw %%mm3, %%mm6 \n"\
931 "psubw %%mm4, %%mm7 \n"\
932 "pmaxsw %%mm6, %%mm3 \n"\
933 "pmaxsw %%mm7, %%mm4 \n"\
934 "pxor %%mm7, %%mm7 \n"
937 "pabsw %%mm3, %%mm3 \n"\
938 "pabsw %%mm4, %%mm4 \n"\
939 "pabsw %%mm5, %%mm5 \n"
941 PAETH(mmx2, ABS3_MMX2)
943 PAETH(ssse3, ABS3_SSSE3)
/* One output row of the vertical quarter-pel lowpass filter, as an asm
 * fragment:
 *
 *   out = OP(pack((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5))
 *
 * with x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6 and x4 = in0 + in7.
 *
 * m3..m6 are registers holding already-unpacked 16-bit rows; in0, in1, in2
 * and in7 are operands for further rows; rnd is the rounding constant.
 * Side effects: m3 is overwritten with in7, and mm4, mm5 and mm6 are used as
 * scratch. The final OP invocation receives mm5, out, mm7 and the size
 * suffix d.
 * The pw_20 and pw_3 parameters are not referenced in the body; the constants
 * are read through MANGLE(ff_pw_20) and MANGLE(ff_pw_3) instead. */
946 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
947 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
948 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
949 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
950 "movq "#in7", " #m3 " \n\t" /* d */\
951 "movq "#in0", %%mm5 \n\t" /* D */\
952 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
953 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
954 "movq "#in1", %%mm5 \n\t" /* C */\
955 "movq "#in2", %%mm6 \n\t" /* B */\
956 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
957 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
958 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
959 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
960 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
961 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
962 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
963 "psraw $5, %%mm5 \n\t"\
964 "packuswb %%mm5, %%mm5 \n\t"\
965 OP(%%mm5, out, %%mm7, d)
967 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
968 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
972 "pxor %%mm7, %%mm7 \n\t"\
974 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
975 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
976 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
977 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
978 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
979 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
980 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
981 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
982 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
983 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
984 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
985 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
986 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
987 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
988 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
989 "paddw %%mm3, %%mm5 \n\t" /* b */\
990 "paddw %%mm2, %%mm6 \n\t" /* c */\
991 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
992 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
993 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
994 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
995 "paddw %%mm4, %%mm0 \n\t" /* a */\
996 "paddw %%mm1, %%mm5 \n\t" /* d */\
997 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
998 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
999 "paddw %6, %%mm6 \n\t"\
1000 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1001 "psraw $5, %%mm0 \n\t"\
1002 "movq %%mm0, %5 \n\t"\
1003 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1005 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1006 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1007 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1008 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1009 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1010 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1011 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1012 "paddw %%mm0, %%mm2 \n\t" /* b */\
1013 "paddw %%mm5, %%mm3 \n\t" /* c */\
1014 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1015 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1016 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1017 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1018 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1019 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1020 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1021 "paddw %%mm2, %%mm1 \n\t" /* a */\
1022 "paddw %%mm6, %%mm4 \n\t" /* d */\
1023 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1024 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1025 "paddw %6, %%mm1 \n\t"\
1026 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1027 "psraw $5, %%mm3 \n\t"\
1028 "movq %5, %%mm1 \n\t"\
1029 "packuswb %%mm3, %%mm1 \n\t"\
1030 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1031 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1033 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1034 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1035 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1036 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1037 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1038 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1039 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1040 "paddw %%mm1, %%mm5 \n\t" /* b */\
1041 "paddw %%mm4, %%mm0 \n\t" /* c */\
1042 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1043 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1044 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1045 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1046 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1047 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1048 "paddw %%mm3, %%mm2 \n\t" /* d */\
1049 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1050 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1051 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1052 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1053 "paddw %%mm2, %%mm6 \n\t" /* a */\
1054 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1055 "paddw %6, %%mm0 \n\t"\
1056 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1057 "psraw $5, %%mm0 \n\t"\
1058 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1060 "paddw %%mm5, %%mm3 \n\t" /* a */\
1061 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1062 "paddw %%mm4, %%mm6 \n\t" /* b */\
1063 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1064 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1065 "paddw %%mm1, %%mm4 \n\t" /* c */\
1066 "paddw %%mm2, %%mm5 \n\t" /* d */\
1067 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
1068 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1069 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1070 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1071 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1072 "paddw %6, %%mm4 \n\t"\
1073 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1074 "psraw $5, %%mm4 \n\t"\
1075 "packuswb %%mm4, %%mm0 \n\t"\
1076 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1082 : "+a"(src), "+c"(dst), "+D"(h)\
1083 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1088 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1091 /* quick HACK, XXX FIXME MUST be optimized */\
1094 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1095 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1096 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1097 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1098 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1099 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1100 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1101 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1102 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1103 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1104 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1105 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1106 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1107 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1108 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1109 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1111 "movq (%0), %%mm0 \n\t"\
1112 "movq 8(%0), %%mm1 \n\t"\
1113 "paddw %2, %%mm0 \n\t"\
1114 "paddw %2, %%mm1 \n\t"\
1115 "psraw $5, %%mm0 \n\t"\
1116 "psraw $5, %%mm1 \n\t"\
1117 "packuswb %%mm1, %%mm0 \n\t"\
1118 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1119 "movq 16(%0), %%mm0 \n\t"\
1120 "movq 24(%0), %%mm1 \n\t"\
1121 "paddw %2, %%mm0 \n\t"\
1122 "paddw %2, %%mm1 \n\t"\
1123 "psraw $5, %%mm0 \n\t"\
1124 "psraw $5, %%mm1 \n\t"\
1125 "packuswb %%mm1, %%mm0 \n\t"\
1126 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1127 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1135 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1137 "pxor %%mm7, %%mm7 \n\t"\
1139 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1140 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1141 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1142 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1143 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1144 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1145 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1146 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1147 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1148 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1149 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1150 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1151 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1152 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1153 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1154 "paddw %%mm3, %%mm5 \n\t" /* b */\
1155 "paddw %%mm2, %%mm6 \n\t" /* c */\
1156 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1157 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1158 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1159 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1160 "paddw %%mm4, %%mm0 \n\t" /* a */\
1161 "paddw %%mm1, %%mm5 \n\t" /* d */\
1162 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1163 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1164 "paddw %5, %%mm6 \n\t"\
1165 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1166 "psraw $5, %%mm0 \n\t"\
1167 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1169 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1170 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1171 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1172 "paddw %%mm5, %%mm1 \n\t" /* a */\
1173 "paddw %%mm6, %%mm2 \n\t" /* b */\
1174 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1175 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1176 "paddw %%mm6, %%mm3 \n\t" /* c */\
1177 "paddw %%mm5, %%mm4 \n\t" /* d */\
1178 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1179 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1180 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1181 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1182 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1183 "paddw %5, %%mm1 \n\t"\
1184 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1185 "psraw $5, %%mm3 \n\t"\
1186 "packuswb %%mm3, %%mm0 \n\t"\
1187 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1193 : "+a"(src), "+c"(dst), "+d"(h)\
1194 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1199 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1202 /* quick HACK, XXX FIXME MUST be optimized */\
1205 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1206 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1207 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1208 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1209 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1210 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1211 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1212 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1214 "movq (%0), %%mm0 \n\t"\
1215 "movq 8(%0), %%mm1 \n\t"\
1216 "paddw %2, %%mm0 \n\t"\
1217 "paddw %2, %%mm1 \n\t"\
1218 "psraw $5, %%mm0 \n\t"\
1219 "psraw $5, %%mm1 \n\t"\
1220 "packuswb %%mm1, %%mm0 \n\t"\
1221 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1222 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/* QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX): expands to the vertical MPEG-4 qpel
 * low-pass filters and the qpel8 / qpel16 mcXY entry points for one
 * instruction-set suffix.
 *   OPNAME  - prefix pasted onto every generated name
 *   ROUNDER - constant handed to the asm blocks as an "m" operand
 *   RND     - pasted into "put ## RND ## ..." to pick the helper that fills
 *             intermediate buffers
 *   OP      - store macro forwarded to QPEL_V_LOW
 *   MMX     - suffix pasted onto the generated and called names
 * NOTE(review): this listing elides short lines (braces, a few declarations
 * such as the temp/half buffers of some functions, loop control in the asm),
 * so not every statement of the expansion is visible here. */
1230 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
/* 16-wide vertical low-pass: the first asm block widens source bytes to */\
/* 16-bit words in temp[], the second runs QPEL_V_LOW over temp[] and */\
/* stores through OP into dst. */\
1232 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1233 uint64_t temp[17*4];\
1234 uint64_t *temp_ptr= temp;\
/* mm7 = 0: supplies the zero bytes when unpacking to words */\
1239 "pxor %%mm7, %%mm7 \n\t"\
1241 "movq (%0), %%mm0 \n\t"\
1242 "movq (%0), %%mm1 \n\t"\
1243 "movq 8(%0), %%mm2 \n\t"\
1244 "movq 8(%0), %%mm3 \n\t"\
1245 "punpcklbw %%mm7, %%mm0 \n\t"\
1246 "punpckhbw %%mm7, %%mm1 \n\t"\
1247 "punpcklbw %%mm7, %%mm2 \n\t"\
1248 "punpckhbw %%mm7, %%mm3 \n\t"\
/* the four 4-pixel column groups are stored 17*8 bytes apart in temp[] */\
1249 "movq %%mm0, (%1) \n\t"\
1250 "movq %%mm1, 17*8(%1) \n\t"\
1251 "movq %%mm2, 2*17*8(%1) \n\t"\
1252 "movq %%mm3, 3*17*8(%1) \n\t"\
1257 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1258 : "r" ((x86_reg)srcStride)\
1265 /*FIXME reorder for speed */\
1267 /*"pxor %%mm7, %%mm7 \n\t"*/\
1269 "movq (%0), %%mm0 \n\t"\
1270 "movq 8(%0), %%mm1 \n\t"\
1271 "movq 16(%0), %%mm2 \n\t"\
1272 "movq 24(%0), %%mm3 \n\t"\
/* one QPEL_V_LOW per output row; destinations alternate (%1) and (%1, %3) */\
1273 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1274 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1276 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1278 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1280 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1281 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1283 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1284 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1286 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1287 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1289 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1290 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1292 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1294 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1296 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1297 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
/* 136 = 17*8, the spacing between column groups used when temp[] was filled */\
1299 "add $136, %0 \n\t"\
1304 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1305 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
/* 8-wide vertical low-pass: same two-stage scheme as the 16-wide version, */\
/* with two column groups stored 9*8 bytes apart in temp[]. */\
1310 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1311 uint64_t temp[9*2];\
1312 uint64_t *temp_ptr= temp;\
/* mm7 = 0: supplies the zero bytes when unpacking to words */\
1317 "pxor %%mm7, %%mm7 \n\t"\
1319 "movq (%0), %%mm0 \n\t"\
1320 "movq (%0), %%mm1 \n\t"\
1321 "punpcklbw %%mm7, %%mm0 \n\t"\
1322 "punpckhbw %%mm7, %%mm1 \n\t"\
1323 "movq %%mm0, (%1) \n\t"\
1324 "movq %%mm1, 9*8(%1) \n\t"\
1329 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1330 : "r" ((x86_reg)srcStride)\
1337 /*FIXME reorder for speed */\
1339 /*"pxor %%mm7, %%mm7 \n\t"*/\
1341 "movq (%0), %%mm0 \n\t"\
1342 "movq 8(%0), %%mm1 \n\t"\
1343 "movq 16(%0), %%mm2 \n\t"\
1344 "movq 24(%0), %%mm3 \n\t"\
/* one QPEL_V_LOW per output row; destinations alternate (%1) and (%1, %3) */\
1345 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1346 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1348 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1350 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1352 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1354 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1356 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1357 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1364 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1365 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
/* ---- 8x8 entry points ---- */\
/* mc00: forwards to OPNAME pixels8 with height 8 */\
1370 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1371 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
/* mc10: horizontal low-pass into a scratch block (stride 8), then */\
/* OPNAME pixels8_l2 of src and that block */\
1374 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1376 uint8_t * const half= (uint8_t*)temp;\
1377 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1378 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
/* mc20: horizontal low-pass written straight to dst */\
1381 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1382 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
/* mc30: like mc10, but the l2 step reads src+1 */\
1385 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1387 uint8_t * const half= (uint8_t*)temp;\
1388 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1389 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
/* mc01: vertical low-pass into a scratch block, then l2 with src */\
1392 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1394 uint8_t * const half= (uint8_t*)temp;\
1395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1396 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
/* mc02: vertical low-pass written straight to dst */\
1399 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1400 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
/* mc03: like mc01, but the l2 step reads src+stride */\
1403 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1405 uint8_t * const half= (uint8_t*)temp;\
1406 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1407 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
/* mc11: halfH (9 rows, stride 8, stored 64 bytes into half[]) is the */\
/* horizontal low-pass combined with src by put pixels8_l2; halfHV is the */\
/* vertical low-pass of halfH; dst = OPNAME pixels8_l2(halfH, halfHV) */\
1409 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1410 uint64_t half[8 + 9];\
1411 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1412 uint8_t * const halfHV= ((uint8_t*)half);\
1413 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1414 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1415 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1416 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc31: like mc11, but halfH is combined with src+1 */\
1418 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1419 uint64_t half[8 + 9];\
1420 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1421 uint8_t * const halfHV= ((uint8_t*)half);\
1422 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1423 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1424 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1425 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc13: like mc11, but the final l2 reads halfH+8 (one 8-byte row down) */\
1427 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1428 uint64_t half[8 + 9];\
1429 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1430 uint8_t * const halfHV= ((uint8_t*)half);\
1431 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1432 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1433 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1434 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc33: combines halfH with src+1 and the final l2 reads halfH+8 */\
1436 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1437 uint64_t half[8 + 9];\
1438 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1439 uint8_t * const halfHV= ((uint8_t*)half);\
1440 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1441 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1442 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1443 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc21: halfH is the plain horizontal low-pass (no l2 with src); */\
/* dst = OPNAME pixels8_l2(halfH, vertical low-pass of halfH) */\
1445 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1446 uint64_t half[8 + 9];\
1447 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1448 uint8_t * const halfHV= ((uint8_t*)half);\
1449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1450 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1451 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc23: like mc21, but the final l2 reads halfH+8 */\
1453 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1454 uint64_t half[8 + 9];\
1455 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1456 uint8_t * const halfHV= ((uint8_t*)half);\
1457 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1458 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1459 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc12: halfH = horizontal low-pass combined with src by put pixels8_l2; */\
/* the OPNAME vertical low-pass of halfH goes straight to dst */\
1461 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1462 uint64_t half[8 + 9];\
1463 uint8_t * const halfH= ((uint8_t*)half);\
1464 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1465 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1466 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* mc32: like mc12, but halfH is combined with src+1 */\
1468 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1469 uint64_t half[8 + 9];\
1470 uint8_t * const halfH= ((uint8_t*)half);\
1471 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1472 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1473 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* mc22: horizontal low-pass into halfH, then vertical low-pass to dst */\
1475 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1477 uint8_t * const halfH= ((uint8_t*)half);\
1478 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1479 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* ---- 16x16 entry points: same structure as the 8x8 ones, with */\
/* 16-byte rows, 17 intermediate rows and halfH stored 256 bytes into half[] ---- */\
/* mc00: forwards to OPNAME pixels16 with height 16 */\
1481 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1482 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
/* mc10: horizontal low-pass into a scratch block (stride 16), then l2 with src */\
1485 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1487 uint8_t * const half= (uint8_t*)temp;\
1488 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1489 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
/* mc20: horizontal low-pass written straight to dst */\
1492 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1493 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
/* mc30: like mc10, but the l2 step reads src+1 */\
1496 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1498 uint8_t * const half= (uint8_t*)temp;\
1499 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1500 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
/* mc01: vertical low-pass into a scratch block, then l2 with src */\
1503 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1505 uint8_t * const half= (uint8_t*)temp;\
1506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1507 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
/* mc02: vertical low-pass written straight to dst */\
1510 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1511 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
/* mc03: like mc01, but the l2 step reads src+stride */\
1514 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1516 uint8_t * const half= (uint8_t*)temp;\
1517 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1518 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
/* mc11: halfH = horizontal low-pass combined with src; halfHV = vertical */\
/* low-pass of halfH; dst = OPNAME pixels16_l2(halfH, halfHV) */\
1520 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1521 uint64_t half[16*2 + 17*2];\
1522 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1523 uint8_t * const halfHV= ((uint8_t*)half);\
1524 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1525 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1526 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1527 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc31: like mc11, but halfH is combined with src+1 */\
1529 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1530 uint64_t half[16*2 + 17*2];\
1531 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1532 uint8_t * const halfHV= ((uint8_t*)half);\
1533 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1534 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1535 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1536 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc13: like mc11, but the final l2 reads halfH+16 (one 16-byte row down) */\
1538 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1539 uint64_t half[16*2 + 17*2];\
1540 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1541 uint8_t * const halfHV= ((uint8_t*)half);\
1542 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1543 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1544 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1545 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc33: combines halfH with src+1 and the final l2 reads halfH+16 */\
1547 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1548 uint64_t half[16*2 + 17*2];\
1549 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1550 uint8_t * const halfHV= ((uint8_t*)half);\
1551 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1552 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1553 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1554 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc21: halfH is the plain horizontal low-pass (no l2 with src) */\
1556 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1557 uint64_t half[16*2 + 17*2];\
1558 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1559 uint8_t * const halfHV= ((uint8_t*)half);\
1560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1561 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1562 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc23: like mc21, but the final l2 reads halfH+16 */\
1564 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1565 uint64_t half[16*2 + 17*2];\
1566 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1567 uint8_t * const halfHV= ((uint8_t*)half);\
1568 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1569 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1570 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc12: halfH = horizontal low-pass combined with src; the OPNAME */\
/* vertical low-pass of halfH goes straight to dst */\
1572 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1573 uint64_t half[17*2];\
1574 uint8_t * const halfH= ((uint8_t*)half);\
1575 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1576 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1577 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* mc32: like mc12, but halfH is combined with src+1 */\
1579 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1580 uint64_t half[17*2];\
1581 uint8_t * const halfH= ((uint8_t*)half);\
1582 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1583 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1584 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* mc22: horizontal low-pass into halfH, then vertical low-pass to dst */\
1586 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1587 uint64_t half[17*2];\
1588 uint8_t * const halfH= ((uint8_t*)half);\
1589 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1590 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store op for the "put" variants: emits a single "mov<sfx> val, mem"
 * instruction string. The scratch argument is accepted but unused so that
 * all OP macros share one signature. */
#define PUT_OP(val, mem, scratch, sfx) \
    "mov" #sfx " " #val ", " #mem " \n\t"
/* Store op for the 3DNow! "avg" variants: loads mem into scratch, applies
 * pavgusb of scratch into val, then stores val back to mem. */
#define AVG_3DNOW_OP(val, mem, scratch, sfx) \
    "mov" #sfx " " #mem ", " #scratch " \n\t" \
    "pavgusb " #scratch ", " #val " \n\t" \
    "mov" #sfx " " #val ", " #mem " \n\t"
/* Store op for the MMX2 "avg" variants: loads mem into scratch, applies
 * pavgb of scratch into val, then stores val back to mem. */
#define AVG_MMX2_OP(val, mem, scratch, sfx) \
    "mov" #sfx " " #mem ", " #scratch " \n\t" \
    "pavgb " #scratch ", " #val " \n\t" \
    "mov" #sfx " " #val ", " #mem " \n\t"
/* Instantiate the qpel code. QPEL_BASE is expanded once each for put_, avg_
 * and put_no_rnd_; QPEL_OP is expanded for each of those three with the
 * 3dnow and the mmx2 suffix. The put_no_rnd_ variants pass ff_pw_15 as
 * ROUNDER where the others pass ff_pw_16, and the avg_ variants use the
 * AVG_*_OP store macros instead of PUT_OP. */
1603 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1604 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1605 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1606 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1607 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1608 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1609 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1610 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1611 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1613 /***********************************/
1614 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
/* QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL): defines
 * OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX>() as a direct call to
 * OPNAME pixels<SIZE><HPEL>(dst, src, stride, SIZE).
 * NOTE(review): the closing line of the macro is elided in this listing. */
1616 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1617 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1618 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
/* QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2): defines
 * OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX>() as a call to
 * OPNAME 2tap_qpel<SIZE>_l3_<MMX>() on src+S0, passing S1 and S2 as the last
 * two arguments. S0..S2 may mention `stride`, which is the generated
 * function's own parameter.
 * NOTE(review): the closing line of the macro is elided in this listing. */
1620 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1621 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1622 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
/* QPEL_2TAP(OPNAME, SIZE, MMX): generates the full set of
 * OPNAME 2tap_qpel<SIZE>_mcXY_<MMX> entries for one block size and
 * instruction-set suffix.
 * NOTE(review): closing-brace lines inside the macro are elided in this
 * listing. */
1625 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
/* mc20 / mc02 / mc22 map straight onto the _x2 / _y2 / _xy2 pixel routines; */\
/* mc22 always uses the _xy2_mmx routine, whatever MMX is */\
1626 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1627 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1628 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
/* aliases: mc00 -> regular qpel mc00, mc21 -> 2tap mc20, mc12 -> 2tap mc02 */\
1629 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1630 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1631 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1632 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1633 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1634 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
/* mc32: the _y2 routine applied one pixel to the right (src+1) */\
1635 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1636 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
/* mc23: the _x2 routine applied one row down (src+stride) */\
1638 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1639 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
/* remaining positions go through the _l3_ helper; arguments are (XY, S0, S1, S2) */\
1641 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1642 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1643 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1644 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1645 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1646 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1647 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1648 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the 2-tap qpel functions: put_ and avg_, for block sizes 16
 * and 8, with the mmx2 and the 3dnow suffix. */
1650 QPEL_2TAP(put_, 16, mmx2)
1651 QPEL_2TAP(avg_, 16, mmx2)
1652 QPEL_2TAP(put_, 8, mmx2)
1653 QPEL_2TAP(avg_, 8, mmx2)
1654 QPEL_2TAP(put_, 16, 3dnow)
1655 QPEL_2TAP(avg_, 16, 3dnow)
1656 QPEL_2TAP(put_, 8, 3dnow)
1657 QPEL_2TAP(avg_, 8, 3dnow)
/* Deliberately empty function: takes no arguments, has no effect. */
static void just_return(void)
{
}
/* Signature of the edge-emulation core routines. emulated_edge_mc() below
 * calls them with (buf, src, linesize, start_y, end_y, block_h, start_x,
 * end_x, block_w).
 * NOTE(review): the line holding the last parameter and the closing
 * parenthesis is elided in this listing. */
1665 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1666 x86_reg linesize, x86_reg start_y,
1667 x86_reg end_y, x86_reg block_h,
1668 x86_reg start_x, x86_reg end_x,
/* Core routines defined outside this file (presumably in assembly, given
 * the surrounding HAVE_YASM section -- confirm). */
1670 extern emu_edge_core_func ff_emu_edge_core_mmx;
1671 extern emu_edge_core_func ff_emu_edge_core_sse;
/* Common front end for the emulated-edge helpers.
 * Adjusts src / src_x / src_y for blocks that lie outside the w x h picture,
 * computes which part of the block_w x block_h block overlaps the picture
 * ([start_x, end_x) x [start_y, end_y)), and passes the job to core_fn,
 * which writes into buf.
 * NOTE(review): several lines (the opening conditions of both if-chains,
 * the src_x / src_y reassignments, braces) are elided in this listing. */
1673 static av_always_inline
1674 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1675 int block_w, int block_h,
1676 int src_x, int src_y, int w, int h,
1677 emu_edge_core_func *core_fn)
1679 int start_y, start_x, end_y, end_x, src_y_add=0;
/* vertical case: the row correction is only recorded in src_y_add here and
 * applied to src further down, together with start_y */
1682 src_y_add = h-1-src_y;
1684 }else if(src_y<=-block_h){
1685 src_y_add = 1-block_h-src_y;
/* horizontal case: src itself is moved */
1691 }else if(src_x<=-block_w){
1692 src+= (1-block_w-src_x);
/* overlap of the block with the picture, in block coordinates */
1696 start_y= FFMAX(0, -src_y);
1697 start_x= FFMAX(0, -src_x);
1698 end_y= FFMIN(block_h, h-src_y);
1699 end_x= FFMIN(block_w, w-src_x);
1700 assert(start_x < end_x && block_w > 0);
1701 assert(start_y < end_y && block_h > 0);
1703 // fill in the to-be-copied part plus all above/below
/* point src at the first pixel of the overlapping region */
1704 src += (src_y_add+start_y)*linesize + start_x;
1706 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
/* MMX flavour of the emulated-edge helper: forwards every argument to
 * emulated_edge_mc() with ff_emu_edge_core_mmx as the core routine. */
1711 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1712 int block_w, int block_h,
1713 int src_x, int src_y, int w, int h)
1715 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1716 w, h, &ff_emu_edge_core_mmx);
/* SSE flavour of the emulated-edge helper: forwards every argument to
 * emulated_edge_mc() with ff_emu_edge_core_sse as the core routine. */
1720 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1721 int block_w, int block_h,
1722 int src_x, int src_y, int w, int h)
1724 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1725 w, h, &ff_emu_edge_core_sse);
1727 #endif /* HAVE_YASM */
/* Signature of the emulated-edge helper that gmc() receives as its
 * emu_edge_fn argument. */
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, int linesize,
                                   int block_w, int block_h, int src_x, int src_y,
                                   int w, int h);
/* Global motion compensation of one block using MMX.
 *
 * (ox, oy) is the fixed-point source position and dxx/dxy/dyx/dyy its
 * per-pixel increments; `shift` sets the fixed-point scale and `r` is added
 * before the final right shift by 2*shift. When the integer source offset is
 * not constant across the block, or the increments have any of their low
 * 4 bits set, the work is handed to ff_gmc_c() instead. When the block
 * touches the picture border, emu_edge_fn is used to build a padded copy in
 * edge_buf.
 *
 * NOTE(review): this listing elides several lines (the declaration of `w`,
 * loop headers, the asm statement openers/closers and the input operand of
 * the first asm block), so the comments below only describe what is visible.
 * NOTE(review): edge_buf is a variable-length stack array of (h+1)*stride
 * bytes; confirm that callers bound h and stride. */
1733 static av_always_inline
1734 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1735 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1736 emulated_edge_mc_func *emu_edge_fn)
/* integer part of the source position */
1739 const int ix = ox>>(16+shift);
1740 const int iy = oy>>(16+shift);
/* position and increments with their low 4 bits dropped, for 16-bit lanes */
1741 const int oxs = ox>>4;
1742 const int oys = oy>>4;
1743 const int dxxs = dxx>>4;
1744 const int dxys = dxy>>4;
1745 const int dyxs = dyx>>4;
1746 const int dyys = dyy>>4;
/* 4-lane broadcasts used as memory operands by the asm below */
1747 const uint16_t r4[4] = {r,r,r,r};
1748 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1749 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1750 const uint64_t shift2 = 2*shift;
1751 uint8_t edge_buf[(h+1)*stride];
/* drift of the source position, relative to a pure translation, across the
 * width (w-1) and the height (h-1) of the block */
1754 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1755 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1756 const int dxh = dxy*(h-1);
1757 const int dyw = dyx*(w-1);
1758 if( // non-constant fullpel offset (3% of blocks)
1759 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1760 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1761 // uses more than 16 bits of subpel mv (only at huge resolution)
1762 || (dxx|dxy|dyx|dyy)&15 )
1764 //FIXME could still use mmx for some of the rows
1765 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1769 src += ix + iy*stride;
/* the unsigned compare also catches negative ix / iy */
1770 if( (unsigned)ix >= width-w ||
1771 (unsigned)iy >= height-h )
1773 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* mm6 = operand %0 broadcast to four words (called "s" in the comments
 * below), mm7 = 0 */
1778 "movd %0, %%mm6 \n\t"
1779 "pxor %%mm7, %%mm7 \n\t"
1780 "punpcklwd %%mm6, %%mm6 \n\t"
1781 "punpcklwd %%mm6, %%mm6 \n\t"
/* four output columns per iteration */
1785 for(x=0; x<w; x+=4){
/* start values one row step (dxys / dyys) early: the asm adds the row step
 * before using them */
1786 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1787 oxs - dxys + dxxs*(x+1),
1788 oxs - dxys + dxxs*(x+2),
1789 oxs - dxys + dxxs*(x+3) };
1790 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1791 oys - dyys + dyxs*(x+1),
1792 oys - dyys + dyxs*(x+2),
1793 oys - dyys + dyxs*(x+3) };
/* advance dx4 / dy4 by one row step, write them back, and keep the top
 * 4 bits of each lane in mm4 (dx) and mm5 (dy) */
1797 "movq %0, %%mm4 \n\t"
1798 "movq %1, %%mm5 \n\t"
1799 "paddw %2, %%mm4 \n\t"
1800 "paddw %3, %%mm5 \n\t"
1801 "movq %%mm4, %0 \n\t"
1802 "movq %%mm5, %1 \n\t"
1803 "psrlw $12, %%mm4 \n\t"
1804 "psrlw $12, %%mm5 \n\t"
1805 : "+m"(*dx4), "+m"(*dy4)
1806 : "m"(*dxy4), "m"(*dyy4)
/* build the four weights from dx, dy and s, multiply them with the four
 * neighbouring source pixels, add r4, shift right by shift2 and store four
 * packed bytes to dst */
1810 "movq %%mm6, %%mm2 \n\t"
1811 "movq %%mm6, %%mm1 \n\t"
1812 "psubw %%mm4, %%mm2 \n\t"
1813 "psubw %%mm5, %%mm1 \n\t"
1814 "movq %%mm2, %%mm0 \n\t"
1815 "movq %%mm4, %%mm3 \n\t"
1816 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1817 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1818 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1819 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1821 "movd %4, %%mm5 \n\t"
1822 "movd %3, %%mm4 \n\t"
1823 "punpcklbw %%mm7, %%mm5 \n\t"
1824 "punpcklbw %%mm7, %%mm4 \n\t"
1825 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1826 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1828 "movd %2, %%mm5 \n\t"
1829 "movd %1, %%mm4 \n\t"
1830 "punpcklbw %%mm7, %%mm5 \n\t"
1831 "punpcklbw %%mm7, %%mm4 \n\t"
1832 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1833 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1834 "paddw %5, %%mm1 \n\t"
1835 "paddw %%mm3, %%mm2 \n\t"
1836 "paddw %%mm1, %%mm0 \n\t"
1837 "paddw %%mm2, %%mm0 \n\t"
1839 "psrlw %6, %%mm0 \n\t"
1840 "packuswb %%mm0, %%mm0 \n\t"
1841 "movd %%mm0, %0 \n\t"
1843 : "=m"(dst[x+y*stride])
1844 : "m"(src[0]), "m"(src[1]),
1845 "m"(src[stride]), "m"(src[stride+1]),
1846 "m"(*r4), "m"(shift2)
/* gmc() specialised with emulated_edge_mc_mmx as the edge-emulation helper. */
1856 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1857 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1859 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1860 width, height, &emulated_edge_mc_mmx);
/* gmc() specialised with emulated_edge_mc_sse as the edge-emulation helper. */
1863 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1864 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1866 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1867 width, height, &emulated_edge_mc_sse);
/* gmc() specialised with the generic ff_emulated_edge_mc helper.
 * NOTE(review): this is a second definition of gmc_mmx; presumably the two
 * are selected by preprocessor conditionals on lines elided from this
 * listing -- confirm. */
1870 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1871 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1873 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1874 width, height, &ff_emulated_edge_mc);
/* PREFETCH(name, op): defines name(mem, stride, h), which issues the
 * instruction `op` with *p (p starting at mem) as its memory operand.
 * NOTE(review): the lines that use stride and h (presumably a loop over h
 * rows stepping p by stride) are elided in this listing -- confirm. */
1878 #define PREFETCH(name, op) \
1879 static void name(void *mem, int stride, int h){\
1880 const uint8_t *p= mem;\
1882 __asm__ volatile(#op" %0" :: "m"(*p));\
/* prefetch_mmx2 uses the prefetcht0 instruction, prefetch_3dnow uses prefetch. */
1886 PREFETCH(prefetch_mmx2, prefetcht0)
1887 PREFETCH(prefetch_3dnow, prefetch)
1890 #include "h264_qpel_mmx.c"
/* Prototypes of the chroma motion-compensation routines (H.264 and RV40
 * names; block widths 8, 4 and 2; mmx, mmx2, 3dnow and ssse3 suffixes).
 * They are only declared here, all with the same
 * (dst, src, stride, h, x, y) signature; the definitions live elsewhere
 * (presumably in assembly -- confirm). */
1892 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1893 int stride, int h, int x, int y);
1894 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1895 int stride, int h, int x, int y);
1896 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1897 int stride, int h, int x, int y);
1898 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1899 int stride, int h, int x, int y);
1900 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1901 int stride, int h, int x, int y);
1902 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1903 int stride, int h, int x, int y);
1905 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1906 int stride, int h, int x, int y);
1907 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1908 int stride, int h, int x, int y);
1909 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1910 int stride, int h, int x, int y);
1911 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1912 int stride, int h, int x, int y);
1913 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1914 int stride, int h, int x, int y);
1915 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1916 int stride, int h, int x, int y);
1918 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1919 int stride, int h, int x, int y);
1920 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1921 int stride, int h, int x, int y);
1923 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1924 int stride, int h, int x, int y);
1925 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1926 int stride, int h, int x, int y);
1928 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1929 int stride, int h, int x, int y);
1930 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1931 int stride, int h, int x, int y);
/* CAVS qpel8 mc00 "put": forwards to put_pixels8_mmx(), passing 8 as the
 * last argument (presumably the row count -- confirm against the helper). */
1935 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1936 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel8 mc00 "avg": forwards to avg_pixels8_mmx() with 8 as the last
 * argument.
 * NOTE(review): despite the _mmx2 suffix this calls the plain MMX helper,
 * not avg_pixels8_mmx2 -- confirm this is intended. */
1938 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1939 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel16 mc00 "put": forwards to put_pixels16_mmx(), passing 16 as the
 * last argument (presumably the row count -- confirm against the helper). */
1941 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1942 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS qpel16 mc00 "avg": forwards to avg_pixels16_mmx() with 16 as the last
 * argument.
 * NOTE(review): despite the _mmx2 suffix this calls the plain MMX helper,
 * not avg_pixels16_mmx2 -- confirm this is intended. */
1944 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1945 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 mspel mc00 "put": forwards to put_pixels8_mmx() with 8 as the last
 * argument.  The rnd parameter is not used by the call below. */
1949 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1950 put_pixels8_mmx(dst, src, stride, 8);
/* VC-1 mspel mc00 "avg": forwards to avg_pixels8_mmx2() with 8 as the last
 * argument.  The rnd parameter is not used by the call below. */
1952 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1953 avg_pixels8_mmx2(dst, src, stride, 8);
1956 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */
/* IDCT-and-store wrapper: runs ff_mmx_idct() on block, then passes the same
 * block to ff_put_pixels_clamped_mmx() to write it to dest using line_size.
 * Installed as c->idct_put for FF_IDCT_LIBMPEG2MMX when MMX2 is absent. */
1959 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1961 ff_mmx_idct (block);
1962 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* IDCT-and-add wrapper: runs ff_mmx_idct() on block, then passes the same
 * block to ff_add_pixels_clamped_mmx() to combine it with dest using
 * line_size.  Installed as c->idct_add for FF_IDCT_LIBMPEG2MMX when MMX2 is
 * absent. */
1964 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1966 ff_mmx_idct (block);
1967 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* IDCT-and-store wrapper: runs ff_mmxext_idct() on block, then passes the
 * same block to ff_put_pixels_clamped_mmx() to write it to dest.
 * Installed as c->idct_put for FF_IDCT_LIBMPEG2MMX when MMX2 is available. */
1969 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1971 ff_mmxext_idct (block);
1972 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* IDCT-and-add wrapper: runs ff_mmxext_idct() on block, then passes the same
 * block to ff_add_pixels_clamped_mmx() to combine it with dest.
 * Installed as c->idct_add for FF_IDCT_LIBMPEG2MMX when MMX2 is available. */
1974 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1976 ff_mmxext_idct (block);
1977 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT-and-store wrapper (MMX): runs ff_idct_xvid_mmx() on block, then
 * passes the same block to ff_put_pixels_clamped_mmx() to write it to dest.
 * Used as c->idct_put for FF_IDCT_XVIDMMX when neither SSE2 nor MMX2 is
 * available. */
1980 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1982 ff_idct_xvid_mmx (block);
1983 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT-and-add wrapper (MMX): runs ff_idct_xvid_mmx() on block, then
 * passes the same block to ff_add_pixels_clamped_mmx() to combine it with
 * dest.  Used as c->idct_add for FF_IDCT_XVIDMMX when neither SSE2 nor MMX2
 * is available. */
1985 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1987 ff_idct_xvid_mmx (block);
1988 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT-and-store wrapper (MMX2): runs ff_idct_xvid_mmx2() on block,
 * then passes the same block to ff_put_pixels_clamped_mmx() to write it to
 * dest.  Used as c->idct_put for FF_IDCT_XVIDMMX on MMX2 CPUs without SSE2. */
1990 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1992 ff_idct_xvid_mmx2 (block);
1993 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT-and-add wrapper (MMX2): runs ff_idct_xvid_mmx2() on block, then
 * passes the same block to ff_add_pixels_clamped_mmx() to combine it with
 * dest.  Used as c->idct_add for FF_IDCT_XVIDMMX on MMX2 CPUs without SSE2. */
1995 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
1997 ff_idct_xvid_mmx2 (block);
1998 ff_add_pixels_clamped_mmx(block, dest, line_size);
/*
 * 3DNow! implementation of Vorbis inverse channel coupling.
 *
 * mag[] and ang[] are both read and rewritten in place ("+m" operands).
 * Each loop iteration handles two consecutive floats (one 64-bit MMX
 * register per array), so blocksize is presumably expected to be a multiple
 * of 2 -- confirm with the caller.
 */
2001 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
/* mm7 = 0; it is the operand compared against in the loop below */
2004 __asm__ volatile("pxor %%mm7, %%mm7":);
2005 for(i=0; i<blocksize; i+=2) {
2007 "movq %0, %%mm0 \n\t"
2008 "movq %1, %%mm1 \n\t"
2009 "movq %%mm0, %%mm2 \n\t"
2010 "movq %%mm1, %%mm3 \n\t"
2011 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2012 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2013 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2014 "pxor %%mm2, %%mm1 \n\t"
2015 "movq %%mm3, %%mm4 \n\t"
2016 "pand %%mm1, %%mm3 \n\t"
2017 "pandn %%mm1, %%mm4 \n\t"
2018 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2019 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2020 "movq %%mm3, %1 \n\t"
2021 "movq %%mm0, %0 \n\t"
2022 :"+m"(mag[i]), "+m"(ang[i])
/* femms: leave MMX/3DNow! state so that x87 code can follow */
2026 __asm__ volatile("femms");
/*
 * SSE implementation of Vorbis inverse channel coupling.
 *
 * mag[] and ang[] are both read and rewritten in place ("+m" operands).
 * Each loop iteration handles four consecutive floats.  All loads and stores
 * use movaps, so mag and ang must be 16-byte aligned; blocksize is
 * presumably a multiple of 4 -- confirm with the caller.
 */
2028 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
/* xmm5 = ff_pdw_80000000, i.e. the sign-bit mask 0x80000000 in every lane */
2033 "movaps %0, %%xmm5 \n\t"
2034 ::"m"(ff_pdw_80000000[0])
2036 for(i=0; i<blocksize; i+=4) {
2038 "movaps %0, %%xmm0 \n\t"
2039 "movaps %1, %%xmm1 \n\t"
2040 "xorps %%xmm2, %%xmm2 \n\t"
2041 "xorps %%xmm3, %%xmm3 \n\t"
2042 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2043 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2044 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2045 "xorps %%xmm2, %%xmm1 \n\t"
2046 "movaps %%xmm3, %%xmm4 \n\t"
2047 "andps %%xmm1, %%xmm3 \n\t"
2048 "andnps %%xmm1, %%xmm4 \n\t"
2049 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2050 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2051 "movaps %%xmm3, %1 \n\t"
2052 "movaps %%xmm0, %0 \n\t"
2053 :"+m"(mag[i]), "+m"(ang[i])
/*
 * MIX5(mono, stereo): inline-asm body for the five-input-channel downmix
 * fast paths.
 *
 * `mono` and `stereo` are macro arguments wrapped around single
 * instructions; presumably one expands to its argument and the other to
 * nothing, selecting the mono or stereo variant -- confirm at the use sites.
 *
 * Three coefficients are read from `matrix` at byte offsets 0, 8 and 24 and
 * broadcast to all four lanes of xmm5, xmm6 and xmm7.  The five input
 * channels are read 0x400 bytes apart (256 floats, matching
 * float (*samples)[256]) and scaled by xmm5, xmm6, xmm5, xmm7, xmm7
 * respectively.  The result is written back over channel 0, and in the
 * stereo case also over channel 1 (offset 0x400).
 */
2062 #define MIX5(mono,stereo)\
2064 "movss 0(%2), %%xmm5 \n"\
2065 "movss 8(%2), %%xmm6 \n"\
2066 "movss 24(%2), %%xmm7 \n"\
2067 "shufps $0, %%xmm5, %%xmm5 \n"\
2068 "shufps $0, %%xmm6, %%xmm6 \n"\
2069 "shufps $0, %%xmm7, %%xmm7 \n"\
2071 "movaps (%0,%1), %%xmm0 \n"\
2072 "movaps 0x400(%0,%1), %%xmm1 \n"\
2073 "movaps 0x800(%0,%1), %%xmm2 \n"\
2074 "movaps 0xc00(%0,%1), %%xmm3 \n"\
2075 "movaps 0x1000(%0,%1), %%xmm4 \n"\
2076 "mulps %%xmm5, %%xmm0 \n"\
2077 "mulps %%xmm6, %%xmm1 \n"\
2078 "mulps %%xmm5, %%xmm2 \n"\
2079 "mulps %%xmm7, %%xmm3 \n"\
2080 "mulps %%xmm7, %%xmm4 \n"\
2081 stereo("addps %%xmm1, %%xmm0 \n")\
2082 "addps %%xmm1, %%xmm2 \n"\
2083 "addps %%xmm3, %%xmm0 \n"\
2084 "addps %%xmm4, %%xmm2 \n"\
2085 mono("addps %%xmm2, %%xmm0 \n")\
2086 "movaps %%xmm0, (%0,%1) \n"\
2087 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2091 :"r"(samples[0]+len), "r"(matrix)\
2092 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2093 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/*
 * MIX_MISC(stereo): inline-asm body for the generic downmix path, used with
 * the pre-broadcast coefficient table matrix_simd.
 *
 * The `stereo` macro argument wraps the instructions that maintain a second
 * accumulator (xmm1) for a second output channel; presumably it expands to
 * nothing for mono output -- confirm at the use sites.
 *
 * Channel 0 is loaded and scaled by xmm4 (and xmm5 for stereo).  Further
 * channels are addressed starting 1024 bytes after channel 0, multiplied by
 * vectors read from (%4,%2) and 16(%4,%2), and accumulated into xmm0/xmm1.
 * The sums are stored back over channel 0 and, for stereo, channel 1
 * (offset 1024).
 */
2097 #define MIX_MISC(stereo)\
2100 "movaps (%3,%0), %%xmm0 \n"\
2101 stereo("movaps %%xmm0, %%xmm1 \n")\
2102 "mulps %%xmm4, %%xmm0 \n"\
2103 stereo("mulps %%xmm5, %%xmm1 \n")\
2104 "lea 1024(%3,%0), %1 \n"\
2107 "movaps (%1), %%xmm2 \n"\
2108 stereo("movaps %%xmm2, %%xmm3 \n")\
2109 "mulps (%4,%2), %%xmm2 \n"\
2110 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2111 "addps %%xmm2, %%xmm0 \n"\
2112 stereo("addps %%xmm3, %%xmm1 \n")\
2116 "movaps %%xmm0, (%3,%0) \n"\
2117 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2120 :"+&r"(i), "=&r"(j), "=&r"(k)\
2121 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/*
 * SSE downmix of in_ch input channels in samples[] to out_ch output
 * channels, using the coefficients in matrix[in_ch][2].  Installed as
 * c->ac3_downmix when SSE is available.
 *
 * Two fast paths are selected for in_ch == 5 when the matrix has a
 * particular zero/equality pattern; every other case goes through the
 * generic path, which first expands the matrix into matrix_simd.
 */
2125 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
/* Integer view of the float coefficients so they can be tested and compared
 * bit-for-bit.
 * NOTE(review): this reads float objects through an int lvalue, and a
 * coefficient of -0.0f would count as non-zero here -- confirm acceptable. */
2127 int (*matrix_cmp)[2] = (int(*)[2])matrix;
/* i = -len scaled to bytes (offset relative to samples[0]+len) */
2130 i = -len*sizeof(float);
/* 5 -> 2 fast path: matrix[0][1], matrix[2][0], matrix[3][1] and
 * matrix[4][0] are all-zero bits, matrix[1][0] == matrix[1][1] and
 * matrix[0][0] == matrix[2][1] (bitwise) */
2131 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
/* 5 -> 1 fast path: matrix[0][0] == matrix[2][0] and
 * matrix[3][0] == matrix[4][0] (bitwise) */
2133 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
/* Generic path: per input channel, the two coefficients are each broadcast
 * into a 4-float vector (matrix_simd[ch][0] and matrix_simd[ch][1]) */
2136 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
/* j = byte size of the in_ch coefficient pairs in matrix */
2137 j = 2*in_ch*sizeof(float);
2141 "movss (%2,%0), %%xmm4 \n"
2142 "movss 4(%2,%0), %%xmm5 \n"
2143 "shufps $0, %%xmm4, %%xmm4 \n"
2144 "shufps $0, %%xmm5, %%xmm5 \n"
2145 "movaps %%xmm4, (%1,%0,4) \n"
2146 "movaps %%xmm5, 16(%1,%0,4) \n"
2149 :"r"(matrix_simd), "r"(matrix)
/*
 * 3DNow! element-wise product: the asm loads from src0, multiplies by src1
 * and stores to dst at the same byte offset (dst[k] = src0[k] * src1[k]).
 *
 * Four floats (two 64-bit registers) are handled per pass.  i starts at the
 * byte offset of the last group of four floats, so len is presumably a
 * multiple of 4 and at least 4 -- confirm with the caller.
 */
2160 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2161 x86_reg i = (len-4)*4;
2164 "movq (%2,%0), %%mm0 \n\t"
2165 "movq 8(%2,%0), %%mm1 \n\t"
2166 "pfmul (%3,%0), %%mm0 \n\t"
2167 "pfmul 8(%3,%0), %%mm1 \n\t"
2168 "movq %%mm0, (%1,%0) \n\t"
2169 "movq %%mm1, 8(%1,%0) \n\t"
2174 :"r"(dst), "r"(src0), "r"(src1)
/*
 * SSE element-wise product: the asm loads from src0, multiplies by src1 and
 * stores to dst at the same byte offset (dst[k] = src0[k] * src1[k]).
 *
 * Eight floats (two 16-byte registers) are handled per pass.  i starts at
 * the byte offset of the last group of eight floats, so len is presumably a
 * multiple of 8 and at least 8 -- confirm with the caller.  movaps and
 * memory-operand mulps require dst, src0 and src1 to be 16-byte aligned.
 */
2178 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2179 x86_reg i = (len-8)*4;
2182 "movaps (%2,%0), %%xmm0 \n\t"
2183 "movaps 16(%2,%0), %%xmm1 \n\t"
2184 "mulps (%3,%0), %%xmm0 \n\t"
2185 "mulps 16(%3,%0), %%xmm1 \n\t"
2186 "movaps %%xmm0, (%1,%0) \n\t"
2187 "movaps %%xmm1, 16(%1,%0) \n\t"
2191 :"r"(dst), "r"(src0), "r"(src1)
/*
 * Extended-3DNow! product of src0 with src1 read in reverse order.
 *
 * Operands: %0 = i (byte offset into dst/src0), %1 = src1 (read/write
 * operand), %2 = dst, %3 = src0.  Each pass loads four floats from the
 * current src1 position, reverses them (pswapd swaps the two floats inside
 * each 64-bit half, and the halves are loaded in swapped order), multiplies
 * by four floats of src0 at offset i and stores the products to dst at
 * offset i.  i starts at the last group of four floats (len*4-16 bytes).
 */
2196 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2197 x86_reg i = len*4-16;
2200 "pswapd 8(%1), %%mm0 \n\t"
2201 "pswapd (%1), %%mm1 \n\t"
2202 "pfmul (%3,%0), %%mm0 \n\t"
2203 "pfmul 8(%3,%0), %%mm1 \n\t"
2204 "movq %%mm0, (%2,%0) \n\t"
2205 "movq %%mm1, 8(%2,%0) \n\t"
2209 :"+r"(i), "+r"(src1)
2210 :"r"(dst), "r"(src0)
/* femms: leave MMX/3DNow! state so that x87 code can follow */
2212 __asm__ volatile("femms");
/*
 * SSE product of src0 with src1 read in reverse order.
 *
 * Operands: %0 = i (byte offset into dst/src0), %1 = src1 (read/write
 * operand), %2 = dst, %3 = src0.  Each pass loads eight floats from the
 * current src1 position, reverses them (shufps $0x1b reverses the four
 * lanes of a register, and the two registers are loaded in swapped order),
 * multiplies by eight floats of src0 at offset i and stores the products to
 * dst at offset i.  i starts at the last group of eight floats
 * (len*4-32 bytes).  movaps requires 16-byte aligned pointers.
 */
2214 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2215 x86_reg i = len*4-32;
2218 "movaps 16(%1), %%xmm0 \n\t"
2219 "movaps (%1), %%xmm1 \n\t"
2220 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2221 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2222 "mulps (%3,%0), %%xmm0 \n\t"
2223 "mulps 16(%3,%0), %%xmm1 \n\t"
2224 "movaps %%xmm0, (%2,%0) \n\t"
2225 "movaps %%xmm1, 16(%2,%0) \n\t"
2229 :"+r"(i), "+r"(src1)
2230 :"r"(dst), "r"(src0)
/*
 * 3DNow! fused multiply-and-add over arrays: the asm loads from src0,
 * multiplies by src1, adds src2 and stores to dst at the same byte offset
 * (dst[k] = src0[k] * src1[k] + src2[k]).
 *
 * Four floats are handled per pass; i starts at the byte offset of the last
 * group of four floats, so len is presumably a multiple of 4 -- confirm
 * with the caller.  dsputil_init_mmx() prefers this over the SSE version
 * whenever 3DNow! is present.
 */
2234 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2235 const float *src2, int len){
2236 x86_reg i = (len-4)*4;
2239 "movq (%2,%0), %%mm0 \n\t"
2240 "movq 8(%2,%0), %%mm1 \n\t"
2241 "pfmul (%3,%0), %%mm0 \n\t"
2242 "pfmul 8(%3,%0), %%mm1 \n\t"
2243 "pfadd (%4,%0), %%mm0 \n\t"
2244 "pfadd 8(%4,%0), %%mm1 \n\t"
2245 "movq %%mm0, (%1,%0) \n\t"
2246 "movq %%mm1, 8(%1,%0) \n\t"
2250 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* femms: leave MMX/3DNow! state so that x87 code can follow */
2253 __asm__ volatile("femms");
/*
 * SSE fused multiply-and-add over arrays: the asm loads from src0,
 * multiplies by src1, adds src2 and stores to dst at the same byte offset
 * (dst[k] = src0[k] * src1[k] + src2[k]).
 *
 * Eight floats are handled per pass; i starts at the byte offset of the
 * last group of eight floats, so len is presumably a multiple of 8 --
 * confirm with the caller.  The aligned SSE accesses require all four
 * pointers to be 16-byte aligned.
 */
2255 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2256 const float *src2, int len){
2257 x86_reg i = (len-8)*4;
2260 "movaps (%2,%0), %%xmm0 \n\t"
2261 "movaps 16(%2,%0), %%xmm1 \n\t"
2262 "mulps (%3,%0), %%xmm0 \n\t"
2263 "mulps 16(%3,%0), %%xmm1 \n\t"
2264 "addps (%4,%0), %%xmm0 \n\t"
2265 "addps 16(%4,%0), %%xmm1 \n\t"
2266 "movaps %%xmm0, (%1,%0) \n\t"
2267 "movaps %%xmm1, 16(%1,%0) \n\t"
2271 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/*
 * Extended-3DNow! windowed combination of src0 and src1.
 *
 * The asm is given dst+len, src0+len, src1 and win+len, and uses two byte
 * offsets: %0 (presumably i, a negative offset walking the first half --
 * confirm) and %1 = j, which starts at the last pair of floats
 * (len*4-8 bytes).  Values fetched at offset j are reversed with pswapd.
 *
 * Per pass, two floats on each side are produced:
 *   at offset i: src0[len+i]*win[len+j] - src1[j]*win[len+i]
 *   at offset j: src0[len+i]*win[len+i] + src1[j]*win[len+j]  (re-reversed
 *                with pswapd before the store)
 */
2277 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2278 const float *win, int len){
2280 x86_reg j = len*4-8;
2283 "pswapd (%5,%1), %%mm1 \n"
2284 "movq (%5,%0), %%mm0 \n"
2285 "pswapd (%4,%1), %%mm5 \n"
2286 "movq (%3,%0), %%mm4 \n"
2287 "movq %%mm0, %%mm2 \n"
2288 "movq %%mm1, %%mm3 \n"
2289 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2290 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2291 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2292 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2293 "pfadd %%mm3, %%mm2 \n"
2294 "pfsub %%mm0, %%mm1 \n"
2295 "pswapd %%mm2, %%mm2 \n"
2296 "movq %%mm1, (%2,%0) \n"
2297 "movq %%mm2, (%2,%1) \n"
2303 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/*
 * SSE windowed combination of src0 and src1.
 *
 * The asm is given dst+len, src0+len, src1 and win+len, and uses two byte
 * offsets: %0 (presumably i, a negative offset walking the first half --
 * confirm) and %1 = j, which starts at the last group of four floats
 * (len*4-16 bytes).  Values fetched at offset j are lane-reversed with
 * shufps $0x1b.
 *
 * Per pass, four floats on each side are produced:
 *   at offset i: src0[len+i]*win[len+j] - src1[j]*win[len+i]
 *   at offset j: src0[len+i]*win[len+i] + src1[j]*win[len+j]  (re-reversed
 *                with shufps before the store)
 * movaps requires all accessed addresses to be 16-byte aligned.
 */
2307 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2308 const float *win, int len){
2310 x86_reg j = len*4-16;
2313 "movaps (%5,%1), %%xmm1 \n"
2314 "movaps (%5,%0), %%xmm0 \n"
2315 "movaps (%4,%1), %%xmm5 \n"
2316 "movaps (%3,%0), %%xmm4 \n"
2317 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2318 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2319 "movaps %%xmm0, %%xmm2 \n"
2320 "movaps %%xmm1, %%xmm3 \n"
2321 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2322 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2323 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2324 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2325 "addps %%xmm3, %%xmm2 \n"
2326 "subps %%xmm0, %%xmm1 \n"
2327 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2328 "movaps %%xmm1, (%2,%0) \n"
2329 "movaps %%xmm2, (%2,%1) \n"
2334 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2337 #endif /* HAVE_6REGS */
/*
 * SSE clamp of src[] into dst[]: every element is first raised to at least
 * `min` (maxps with the broadcast min) and then lowered to at most `max`
 * (minps with the broadcast max).
 *
 * Sixteen floats (four 16-byte registers) are handled per pass; i starts at
 * the byte offset of the last group of sixteen floats, so len is presumably
 * a multiple of 16 -- confirm with the caller.  movaps requires dst and src
 * to be 16-byte aligned.
 */
2339 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2342 x86_reg i = (len-16)*4;
/* broadcast min into xmm4 and max into xmm5 */
2344 "movss %3, %%xmm4 \n"
2345 "movss %4, %%xmm5 \n"
2346 "shufps $0, %%xmm4, %%xmm4 \n"
2347 "shufps $0, %%xmm5, %%xmm5 \n"
2349 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2350 "movaps 16(%2,%0), %%xmm1 \n\t"
2351 "movaps 32(%2,%0), %%xmm2 \n\t"
2352 "movaps 48(%2,%0), %%xmm3 \n\t"
2353 "maxps %%xmm4, %%xmm0 \n\t"
2354 "maxps %%xmm4, %%xmm1 \n\t"
2355 "maxps %%xmm4, %%xmm2 \n\t"
2356 "maxps %%xmm4, %%xmm3 \n\t"
2357 "minps %%xmm5, %%xmm0 \n\t"
2358 "minps %%xmm5, %%xmm1 \n\t"
2359 "minps %%xmm5, %%xmm2 \n\t"
2360 "minps %%xmm5, %%xmm3 \n\t"
2361 "movaps %%xmm0, (%1,%0) \n\t"
2362 "movaps %%xmm1, 16(%1,%0) \n\t"
2363 "movaps %%xmm2, 32(%1,%0) \n\t"
2364 "movaps %%xmm3, 48(%1,%0) \n\t"
2368 :"r"(dst), "r"(src), "m"(min), "m"(max)
/*
 * Prototypes for routines that are declared here and defined elsewhere
 * (presumably in external assembly -- confirm).  dsputil_init_mmx() installs
 * those that it references into the DSPContext depending on CPU flags and
 * build configuration.
 */
/* VP3 IDCT (MMX): plain transform plus put/add variants */
2373 void ff_vp3_idct_mmx(int16_t *input_data);
2374 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2375 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
/* installed as c->vp3_idct_dc_add */
2377 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
/* installed as c->vp3_v_loop_filter / c->vp3_h_loop_filter */
2379 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2380 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
/* VP3 IDCT (SSE2): preferred over the MMX versions when SSE2 is present */
2382 void ff_vp3_idct_sse2(int16_t *input_data);
2383 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2384 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
/* int16 scalar products, installed as c->scalarproduct_int16 and
 * c->scalarproduct_and_madd_int16 */
2386 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2387 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2388 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2389 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2390 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
/* HuffYUV prediction helpers, installed as c->add_hfyu_median_prediction
 * and c->add_hfyu_left_prediction */
2391 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2392 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2393 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
/* float scalar product, installed as c->scalarproduct_float */
2395 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2397 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2399 int mm_flags = av_get_cpu_flags();
2401 if (avctx->dsp_mask) {
2402 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2403 mm_flags |= (avctx->dsp_mask & 0xffff);
2405 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2409 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2410 if (mm_flags & AV_CPU_FLAG_MMX)
2411 av_log(avctx, AV_LOG_INFO, " mmx");
2412 if (mm_flags & AV_CPU_FLAG_MMX2)
2413 av_log(avctx, AV_LOG_INFO, " mmx2");
2414 if (mm_flags & AV_CPU_FLAG_3DNOW)
2415 av_log(avctx, AV_LOG_INFO, " 3dnow");
2416 if (mm_flags & AV_CPU_FLAG_SSE)
2417 av_log(avctx, AV_LOG_INFO, " sse");
2418 if (mm_flags & AV_CPU_FLAG_SSE2)
2419 av_log(avctx, AV_LOG_INFO, " sse2");
2420 av_log(avctx, AV_LOG_INFO, "\n");
2423 if (mm_flags & AV_CPU_FLAG_MMX) {
2424 const int idct_algo= avctx->idct_algo;
2426 if(avctx->lowres==0){
2427 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2428 c->idct_put= ff_simple_idct_put_mmx;
2429 c->idct_add= ff_simple_idct_add_mmx;
2430 c->idct = ff_simple_idct_mmx;
2431 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2433 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2434 if(mm_flags & AV_CPU_FLAG_MMX2){
2435 c->idct_put= ff_libmpeg2mmx2_idct_put;
2436 c->idct_add= ff_libmpeg2mmx2_idct_add;
2437 c->idct = ff_mmxext_idct;
2439 c->idct_put= ff_libmpeg2mmx_idct_put;
2440 c->idct_add= ff_libmpeg2mmx_idct_add;
2441 c->idct = ff_mmx_idct;
2443 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2445 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2446 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2447 if(mm_flags & AV_CPU_FLAG_SSE2){
2448 c->idct_put= ff_vp3_idct_put_sse2;
2449 c->idct_add= ff_vp3_idct_add_sse2;
2450 c->idct = ff_vp3_idct_sse2;
2451 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2453 c->idct_put= ff_vp3_idct_put_mmx;
2454 c->idct_add= ff_vp3_idct_add_mmx;
2455 c->idct = ff_vp3_idct_mmx;
2456 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2458 }else if(idct_algo==FF_IDCT_CAVS){
2459 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2460 }else if(idct_algo==FF_IDCT_XVIDMMX){
2461 if(mm_flags & AV_CPU_FLAG_SSE2){
2462 c->idct_put= ff_idct_xvid_sse2_put;
2463 c->idct_add= ff_idct_xvid_sse2_add;
2464 c->idct = ff_idct_xvid_sse2;
2465 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2466 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2467 c->idct_put= ff_idct_xvid_mmx2_put;
2468 c->idct_add= ff_idct_xvid_mmx2_add;
2469 c->idct = ff_idct_xvid_mmx2;
2471 c->idct_put= ff_idct_xvid_mmx_put;
2472 c->idct_add= ff_idct_xvid_mmx_add;
2473 c->idct = ff_idct_xvid_mmx;
2478 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2479 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2480 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2481 c->clear_block = clear_block_mmx;
2482 c->clear_blocks = clear_blocks_mmx;
2483 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2484 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2485 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2486 c->clear_block = clear_block_sse;
2487 c->clear_blocks = clear_blocks_sse;
2490 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2491 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2492 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2493 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2494 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2496 SET_HPEL_FUNCS(put, 0, 16, mmx);
2497 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2498 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2499 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2500 SET_HPEL_FUNCS(put, 1, 8, mmx);
2501 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2502 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2503 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2505 #if ARCH_X86_32 || !HAVE_YASM
2508 #if ARCH_X86_32 && HAVE_YASM
2509 c->emulated_edge_mc = emulated_edge_mc_mmx;
2512 c->add_bytes= add_bytes_mmx;
2513 c->add_bytes_l2= add_bytes_l2_mmx;
2515 c->draw_edges = draw_edges_mmx;
2517 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2518 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2519 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2523 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2524 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2526 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2527 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2530 if (mm_flags & AV_CPU_FLAG_MMX2) {
2531 c->prefetch = prefetch_mmx2;
2533 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2534 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2536 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2537 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2538 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2540 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2541 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2543 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2544 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2545 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2547 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2548 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2549 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2550 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2551 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2552 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2553 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2555 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2556 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2557 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2560 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2561 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2564 if (CONFIG_VP3_DECODER
2565 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2566 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2567 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2570 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2571 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2572 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2573 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2574 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2575 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2576 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2577 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2578 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2579 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2580 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2581 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2582 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2583 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2584 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2585 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2586 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2588 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2589 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2590 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2591 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2592 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2593 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2595 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2596 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2597 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2598 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2599 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2600 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2602 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2603 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2604 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2605 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2608 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2609 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2611 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2612 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2613 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2614 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2616 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2618 #if HAVE_7REGS && HAVE_TEN_OPERANDS
2619 if( mm_flags&AV_CPU_FLAG_3DNOW )
2620 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2623 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2624 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2625 c->prefetch = prefetch_3dnow;
2627 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2628 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2630 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2631 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2632 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2634 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2635 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2637 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2638 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2639 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2641 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2642 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2643 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2644 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2645 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2646 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2647 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2650 if (CONFIG_VP3_DECODER
2651 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2652 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2653 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2656 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2657 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2658 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2659 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2660 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2661 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2663 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2664 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2665 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2666 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2667 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2668 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2670 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2671 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2672 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2673 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2676 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2677 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2679 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2680 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2685 #define H264_QPEL_FUNCS(x, y, CPU)\
2686 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2687 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2688 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2689 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2690 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2691 // these functions are slower than mmx on AMD, but faster on Intel
2692 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2693 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2694 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2695 H264_QPEL_FUNCS(0, 0, sse2);
2697 if(mm_flags & AV_CPU_FLAG_SSE2){
2698 H264_QPEL_FUNCS(0, 1, sse2);
2699 H264_QPEL_FUNCS(0, 2, sse2);
2700 H264_QPEL_FUNCS(0, 3, sse2);
2701 H264_QPEL_FUNCS(1, 1, sse2);
2702 H264_QPEL_FUNCS(1, 2, sse2);
2703 H264_QPEL_FUNCS(1, 3, sse2);
2704 H264_QPEL_FUNCS(2, 1, sse2);
2705 H264_QPEL_FUNCS(2, 2, sse2);
2706 H264_QPEL_FUNCS(2, 3, sse2);
2707 H264_QPEL_FUNCS(3, 1, sse2);
2708 H264_QPEL_FUNCS(3, 2, sse2);
2709 H264_QPEL_FUNCS(3, 3, sse2);
2712 if(mm_flags & AV_CPU_FLAG_SSSE3){
2713 H264_QPEL_FUNCS(1, 0, ssse3);
2714 H264_QPEL_FUNCS(1, 1, ssse3);
2715 H264_QPEL_FUNCS(1, 2, ssse3);
2716 H264_QPEL_FUNCS(1, 3, ssse3);
2717 H264_QPEL_FUNCS(2, 0, ssse3);
2718 H264_QPEL_FUNCS(2, 1, ssse3);
2719 H264_QPEL_FUNCS(2, 2, ssse3);
2720 H264_QPEL_FUNCS(2, 3, ssse3);
2721 H264_QPEL_FUNCS(3, 0, ssse3);
2722 H264_QPEL_FUNCS(3, 1, ssse3);
2723 H264_QPEL_FUNCS(3, 2, ssse3);
2724 H264_QPEL_FUNCS(3, 3, ssse3);
2725 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2727 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2728 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2729 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2730 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2731 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2732 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2733 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2738 if(mm_flags & AV_CPU_FLAG_3DNOW){
2739 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2740 c->vector_fmul = vector_fmul_3dnow;
2742 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2743 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2745 c->vector_fmul_window = vector_fmul_window_3dnow2;
2748 if(mm_flags & AV_CPU_FLAG_MMX2){
2750 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2751 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2754 if(mm_flags & AV_CPU_FLAG_SSE){
2755 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2756 c->ac3_downmix = ac3_downmix_sse;
2757 c->vector_fmul = vector_fmul_sse;
2758 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2759 c->vector_fmul_add = vector_fmul_add_sse;
2761 c->vector_fmul_window = vector_fmul_window_sse;
2763 c->vector_clipf = vector_clipf_sse;
2765 c->scalarproduct_float = ff_scalarproduct_float_sse;
2768 if(mm_flags & AV_CPU_FLAG_3DNOW)
2769 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2770 if(mm_flags & AV_CPU_FLAG_SSE2){
2772 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2773 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2775 c->emulated_edge_mc = emulated_edge_mc_sse;
2779 if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
2780 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2783 if (CONFIG_ENCODERS)
2784 dsputilenc_init_mmx(c, avctx);
2787 // for speed testing
2788 get_pixels = just_return;
2789 put_pixels_clamped = just_return;
2790 add_pixels_clamped = just_return;
2792 pix_abs16x16 = just_return;
2793 pix_abs16x16_x2 = just_return;
2794 pix_abs16x16_y2 = just_return;
2795 pix_abs16x16_xy2 = just_return;
2797 put_pixels_tab[0] = just_return;
2798 put_pixels_tab[1] = just_return;
2799 put_pixels_tab[2] = just_return;
2800 put_pixels_tab[3] = just_return;
2802 put_no_rnd_pixels_tab[0] = just_return;
2803 put_no_rnd_pixels_tab[1] = just_return;
2804 put_no_rnd_pixels_tab[2] = just_return;
2805 put_no_rnd_pixels_tab[3] = just_return;
2807 avg_pixels_tab[0] = just_return;
2808 avg_pixels_tab[1] = just_return;
2809 avg_pixels_tab[2] = just_return;
2810 avg_pixels_tab[3] = just_return;
2812 avg_no_rnd_pixels_tab[0] = just_return;
2813 avg_no_rnd_pixels_tab[1] = just_return;
2814 avg_no_rnd_pixels_tab[2] = just_return;
2815 avg_no_rnd_pixels_tab[3] = just_return;
2817 //av_fdct = just_return;
2818 //ff_idct = just_return;