2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
26 #include "dsputil_mmx.h"
27 #include "simple_idct.h"
28 #include "mpegvideo.h"
31 #include "vp3dsp_mmx.h"
32 #include "vp3dsp_sse2.h"
38 extern void ff_idct_xvid_mmx(short *block);
39 extern void ff_idct_xvid_mmx2(short *block);
/* Runtime CPU-capability bitmask — presumably set during dsputil init
 * from CPUID; not visible in this excerpt (TODO confirm at init site). */
41 int mm_flags; /* multimedia extension flags */
/* Aligned SIMD constant tables.  Naming convention (evident from the bit
 * patterns): ff_pw_N = four packed 16-bit words of value N, ff_pb_N = eight
 * packed bytes of value 0xNN, ff_pdw = packed dwords, ff_pd = packed doubles.
 * 8-byte alignment for MMX operands, 16-byte for SSE/SSE2 operands. */
43 /* pixel operations */
44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
45 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
47 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
50 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
51 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
52 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
55 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
/* ff_pw_32 is a full 128-bit xmm constant (usable by SSE2 code paths). */
57 DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
61 DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
63 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
70 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
71 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
/* Helper macros that emit standalone asm statements to preload an MMX
 * register with a common constant, generating the value with register-only
 * instruction sequences (no memory load needed). */
73 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
74 #define MOVQ_ZERO(regd)   __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
/* regd := 0x0001000100010001 (all-ones compare, then shift each word right 15). */
76 #define MOVQ_WONE(regd) \
78     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
79     "psrlw $15, %%" #regd ::)
/* regd := 0xFEFEFEFEFEFEFEFE (all-ones bytes doubled: 0xFF+0xFF saturates... 
 * NOTE(review): paddb wraps, 0xFF+0xFF = 0xFE per byte). */
81 #define MOVQ_BFE(regd) \
83     "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
84     "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* First variants: load the constant from memory (ff_bone / ff_wtwo). */
87 #define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
88 #define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
90 // for shared library it's better to use this way for accessing constants
/* NOTE(review): MOVQ_BONE/MOVQ_WTWO are defined twice here; the numbering
 * gaps (orig lines 89/91, 97, 103-105) indicate the #if/#else/#endif that
 * selects between the memory-load and register-generated variants was
 * elided from this excerpt — confirm against the full file. */
92 #define MOVQ_BONE(regd) \
94     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
95     "psrlw $15, %%" #regd " \n\t" \
96     "packuswb %%" #regd ", %%" #regd " \n\t" ::)
98 #define MOVQ_WTWO(regd) \
100     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
101     "psrlw $15, %%" #regd " \n\t" \
102     "psllw $1, %%" #regd " \n\t"::)
/* Byte-wise averaging emulations for plain MMX (which lacks a pavgb
 * instruction).  They use the identity  avg(a,b) = (a AND b) + ((a XOR b)>>1)
 * for truncating average, and  (a OR b) - ((a XOR b)>>1)  for rounding-up
 * average; masking with 0xFE before the shift clears bits that would leak
 * between bytes in the 64-bit psrlq. */
106 // using regr as temporary and for the output result
107 // first argument is unmodifed and second is trashed
108 // regfe is supposed to contain 0xfefefefefefefefe
109 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
110     "movq " #rega ", " #regr "  \n\t"\
111     "pand " #regb ", " #regr "  \n\t"\
112     "pxor " #rega ", " #regb "  \n\t"\
113     "pand " #regfe "," #regb "  \n\t"\
114     "psrlq $1, " #regb "        \n\t"\
115     "paddb " #regb ", " #regr " \n\t"
/* Rounding variant: OR instead of AND, then subtract the halved XOR. */
117 #define PAVGB_MMX(rega, regb, regr, regfe) \
118     "movq " #rega ", " #regr "  \n\t"\
119     "por  " #regb ", " #regr "  \n\t"\
120     "pxor " #rega ", " #regb "  \n\t"\
121     "pand " #regfe "," #regb "  \n\t"\
122     "psrlq $1, " #regb "        \n\t"\
123     "psubb " #regb ", " #regr " \n\t"
/* Paired (two-quadword) versions of the above; mm6 must hold the 0xFE mask. */
125 // mm6 is supposed to contain 0xfefefefefefefefe
126 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
127     "movq " #rega ", " #regr "  \n\t"\
128     "movq " #regc ", " #regp "  \n\t"\
129     "pand " #regb ", " #regr "  \n\t"\
130     "pand " #regd ", " #regp "  \n\t"\
131     "pxor " #rega ", " #regb "  \n\t"\
132     "pxor " #regc ", " #regd "  \n\t"\
133     "pand %%mm6, " #regb "      \n\t"\
134     "pand %%mm6, " #regd "      \n\t"\
135     "psrlq $1, " #regb "        \n\t"\
136     "psrlq $1, " #regd "        \n\t"\
137     "paddb " #regb ", " #regr " \n\t"\
138     "paddb " #regd ", " #regp " \n\t"
140 #define PAVGBP_MMX(rega, regb, regr,  regc, regd, regp) \
141     "movq " #rega ", " #regr "  \n\t"\
142     "movq " #regc ", " #regp "  \n\t"\
143     "por  " #regb ", " #regr "  \n\t"\
144     "por  " #regd ", " #regp "  \n\t"\
145     "pxor " #rega ", " #regb "  \n\t"\
146     "pxor " #regc ", " #regd "  \n\t"\
147     "pand %%mm6, " #regb "      \n\t"\
148     "pand %%mm6, " #regd "      \n\t"\
149     "psrlq $1, " #regd "        \n\t"\
150     "psrlq $1, " #regb "        \n\t"\
151     "psubb " #regb ", " #regr " \n\t"\
152     "psubb " #regd ", " #regp " \n\t"
/* Template instantiation section: the dsputil_mmx_rnd.h / dsputil_mmx_avg.h
 * headers are textual templates parameterized by the DEF/SET_RND/PAVGB(P)
 * macros; each include below stamps out a family of put/avg pixel functions
 * for one rounding mode or one instruction set.
 * NOTE(review): the matching #undef lines between sections (orig numbering
 * gaps 162-166, 176-181, 189-192, 202-205) are elided from this excerpt. */
154 /***********************************/
155 /* MMX no rounding */
156 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
157 #define SET_RND  MOVQ_WONE
158 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
159 #define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
161 #include "dsputil_mmx_rnd.h"
167 /***********************************/
/* MMX rounding variants. */
170 #define DEF(x, y) x ## _ ## y ##_mmx
171 #define SET_RND  MOVQ_WTWO
172 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
173 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
175 #include "dsputil_mmx_rnd.h"
182 /***********************************/
/* 3DNow! variants use the native pavgusb instruction. */
185 #define DEF(x) x ## _3dnow
186 #define PAVGB "pavgusb"
188 #include "dsputil_mmx_avg.h"
193 /***********************************/
/* MMX2 (SSE integer) variants use the native pavgb instruction. */
196 #define DEF(x) x ## _mmx2
198 /* Introduced only in MMX2 set */
199 #define PAVGB "pavgb"
201 #include "dsputil_mmx_avg.h"
/* Plain copies are identical regardless of rounding mode / instruction set,
 * so alias them to the single MMX implementation instead of duplicating. */
206 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
207 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
208 #define put_pixels16_mmx2 put_pixels16_mmx
209 #define put_pixels8_mmx2 put_pixels8_mmx
210 #define put_pixels4_mmx2 put_pixels4_mmx
211 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
212 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
213 #define put_pixels16_3dnow put_pixels16_mmx
214 #define put_pixels8_3dnow put_pixels8_mmx
215 #define put_pixels4_3dnow put_pixels4_mmx
216 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
217 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
219 /***********************************/
222 #ifdef CONFIG_ENCODERS
/* Read an 8x8 block of 8-bit pixels and store it widened to 16-bit DCTELEMs
 * in block[64].  Two rows per iteration: punpck{l,h}bw against mm7 (zero)
 * zero-extends bytes to words; REG_a counts from -128 to 0 as the output
 * index relative to block+64.
 * NOTE(review): the asm volatile( opener, the "1:" loop label, the pixel
 * pointer advance and the jne back-branch are elided from this excerpt
 * (orig numbering gaps 224-225, 228-229, 242, 244-245). */
223 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
226         "mov $-128, %%"REG_a"           \n\t"
227         "pxor %%mm7, %%mm7              \n\t"
230         "movq (%0), %%mm0               \n\t"
231         "movq (%0, %2), %%mm2           \n\t"
232         "movq %%mm0, %%mm1              \n\t"
233         "movq %%mm2, %%mm3              \n\t"
234         "punpcklbw %%mm7, %%mm0         \n\t"
235         "punpckhbw %%mm7, %%mm1         \n\t"
236         "punpcklbw %%mm7, %%mm2         \n\t"
237         "punpckhbw %%mm7, %%mm3         \n\t"
238         "movq %%mm0, (%1, %%"REG_a")    \n\t"
239         "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
240         "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
241         "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
243         "add $32, %%"REG_a"             \n\t"
246         : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
/* Compute the 16-bit difference s1 - s2 of two 8x8 byte blocks into
 * block[64] (one row of 8 pixels per iteration, widened then psubw'd).
 * Used by the encoder for residual computation.
 * NOTE(review): asm opener, loop label, the s1/s2 pointer advances and the
 * back-branch are elided here (orig numbering gaps 252-253, 256-257,
 * 270-271, 273, 276-278). */
251 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
254         "pxor %%mm7, %%mm7              \n\t"
255         "mov $-128, %%"REG_a"           \n\t"
258         "movq (%0), %%mm0               \n\t"
259         "movq (%1), %%mm2               \n\t"
260         "movq %%mm0, %%mm1              \n\t"
261         "movq %%mm2, %%mm3              \n\t"
262         "punpcklbw %%mm7, %%mm0         \n\t"
263         "punpckhbw %%mm7, %%mm1         \n\t"
264         "punpcklbw %%mm7, %%mm2         \n\t"
265         "punpckhbw %%mm7, %%mm3         \n\t"
266         "psubw %%mm2, %%mm0             \n\t"
267         "psubw %%mm3, %%mm1             \n\t"
268         "movq %%mm0, (%2, %%"REG_a")    \n\t"
269         "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
272         "add $16, %%"REG_a"             \n\t"
274         : "+r" (s1), "+r" (s2)
275         : "r" (block+64), "r" ((long)stride)
279 #endif //CONFIG_ENCODERS
/* Pack a 64-entry DCTELEM block back to 8-bit pixels with unsigned
 * saturation (packuswb clamps to [0,255]) and write 8 rows of 8 pixels.
 * Two asm statements handle 4 rows each; the second uses an "r" constraint
 * for the block pointer instead of "m" offsets.
 * NOTE(review): the local `pix`/`p` declarations, the pointer advance
 * between the two halves, and the asm openers/closers are elided from this
 * excerpt (orig numbering gaps 282-290, 308-311, 314-315, 333-335). */
281 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
286     /* read the pixels */
291         "movq   %3, %%mm0               \n\t"
292         "movq   8%3, %%mm1              \n\t"
293         "movq   16%3, %%mm2             \n\t"
294         "movq   24%3, %%mm3             \n\t"
295         "movq   32%3, %%mm4             \n\t"
296         "movq   40%3, %%mm5             \n\t"
297         "movq   48%3, %%mm6             \n\t"
298         "movq   56%3, %%mm7             \n\t"
299         "packuswb %%mm1, %%mm0          \n\t"
300         "packuswb %%mm3, %%mm2          \n\t"
301         "packuswb %%mm5, %%mm4          \n\t"
302         "packuswb %%mm7, %%mm6          \n\t"
303         "movq   %%mm0, (%0)             \n\t"
304         "movq   %%mm2, (%0, %1)         \n\t"
305         "movq   %%mm4, (%0, %1, 2)      \n\t"
306         "movq   %%mm6, (%0, %2)         \n\t"
307         ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
312     // if here would be an exact copy of the code above
313     // compiler would generate some very strange code
316         "movq   (%3), %%mm0             \n\t"
317         "movq   8(%3), %%mm1            \n\t"
318         "movq   16(%3), %%mm2           \n\t"
319         "movq   24(%3), %%mm3           \n\t"
320         "movq   32(%3), %%mm4           \n\t"
321         "movq   40(%3), %%mm5           \n\t"
322         "movq   48(%3), %%mm6           \n\t"
323         "movq   56(%3), %%mm7           \n\t"
324         "packuswb %%mm1, %%mm0          \n\t"
325         "packuswb %%mm3, %%mm2          \n\t"
326         "packuswb %%mm5, %%mm4          \n\t"
327         "packuswb %%mm7, %%mm6          \n\t"
328         "movq   %%mm0, (%0)             \n\t"
329         "movq   %%mm2, (%0, %1)         \n\t"
330         "movq   %%mm4, (%0, %1, 2)      \n\t"
331         "movq   %%mm6, (%0, %2)         \n\t"
332         ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
/* Constant of eight 0x80 bytes: added (via the packsswb+movq sequence below)
 * to bias signed [-128,127] results into unsigned [0,255] pixel range. */
336 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
337   { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
/* Pack a signed DCTELEM block to pixels, clamping to [-128,127] with
 * packsswb and then re-biasing by +128 (vector128) to unsigned bytes.
 * Uses mmx.h intrinsic-style macros rather than inline asm strings.
 * NOTE(review): the loop's paddb step, pointer advances, emms and the `int i`
 * declaration are elided from this excerpt (orig gaps 340-342, 347-348,
 * 350-353). */
339 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
343     movq_m2r(*vector128, mm1);
344     for (i = 0; i < 8; i++) {
345         movq_m2r(*(block), mm0);
346         packsswb_m2r(*(block + 4), mm0);
349         movq_r2m(mm0, *pixels);
/* Add a DCTELEM residual block onto existing pixels with saturation:
 * widen two pixel rows to words (mm7 assumed zero — its clearing is not
 * visible in this excerpt), paddsw with the coefficients, then packuswb
 * clamps back to [0,255].  Processes two rows per asm statement.
 * NOTE(review): function prologue (local `pix`, loop setup, pxor mm7),
 * the loop control and closers are elided (orig gaps 355-359, 361-366,
 * 388-394). */
354 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
360     /* read the pixels */
367                 "movq   (%2), %%mm0     \n\t"
368                 "movq   8(%2), %%mm1    \n\t"
369                 "movq   16(%2), %%mm2   \n\t"
370                 "movq   24(%2), %%mm3   \n\t"
371                 "movq   %0, %%mm4       \n\t"
372                 "movq   %1, %%mm6       \n\t"
373                 "movq   %%mm4, %%mm5    \n\t"
374                 "punpcklbw %%mm7, %%mm4 \n\t"
375                 "punpckhbw %%mm7, %%mm5 \n\t"
376                 "paddsw %%mm4, %%mm0    \n\t"
377                 "paddsw %%mm5, %%mm1    \n\t"
378                 "movq   %%mm6, %%mm5    \n\t"
379                 "punpcklbw %%mm7, %%mm6 \n\t"
380                 "punpckhbw %%mm7, %%mm5 \n\t"
381                 "paddsw %%mm6, %%mm2    \n\t"
382                 "paddsw %%mm5, %%mm3    \n\t"
383                 "packuswb %%mm1, %%mm0  \n\t"
384                 "packuswb %%mm3, %%mm2  \n\t"
385                 "movq   %%mm0, %0       \n\t"
386                 "movq   %%mm2, %1       \n\t"
387                 :"+m"(*pix), "+m"(*(pix+line_size))
/* Copy a 4-pixel-wide block from pixels to block, 4 rows per loop pass
 * (two movd pairs, advancing both pointers by 2*line_size twice).
 * h is decremented by the elided loop control.
 * NOTE(review): asm opener, "1:" label, "subl $4, %0" and jump are elided
 * (orig gaps 396-397, 399-400, 413-414, 417-420). */
395 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
398         "lea (%3, %3), %%"REG_a"        \n\t"
401         "movd (%1), %%mm0               \n\t"
402         "movd (%1, %3), %%mm1           \n\t"
403         "movd %%mm0, (%2)               \n\t"
404         "movd %%mm1, (%2, %3)           \n\t"
405         "add %%"REG_a", %1              \n\t"
406         "add %%"REG_a", %2              \n\t"
407         "movd (%1), %%mm0               \n\t"
408         "movd (%1, %3), %%mm1           \n\t"
409         "movd %%mm0, (%2)               \n\t"
410         "movd %%mm1, (%2, %3)           \n\t"
411         "add %%"REG_a", %1              \n\t"
412         "add %%"REG_a", %2              \n\t"
415         : "+g"(h), "+r" (pixels), "+r" (block)
416         : "r"((long)line_size)
/* Copy an 8-pixel-wide block, 4 rows per loop pass (movq instead of movd,
 * otherwise identical structure to put_pixels4_mmx).
 * NOTE(review): asm opener, loop label and branch elided (orig gaps
 * 422-423, 425-426, 439-440, 443-446). */
421 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
424         "lea (%3, %3), %%"REG_a"        \n\t"
427         "movq (%1), %%mm0               \n\t"
428         "movq (%1, %3), %%mm1           \n\t"
429         "movq %%mm0, (%2)               \n\t"
430         "movq %%mm1, (%2, %3)           \n\t"
431         "add %%"REG_a", %1              \n\t"
432         "add %%"REG_a", %2              \n\t"
433         "movq (%1), %%mm0               \n\t"
434         "movq (%1, %3), %%mm1           \n\t"
435         "movq %%mm0, (%2)               \n\t"
436         "movq %%mm1, (%2, %3)           \n\t"
437         "add %%"REG_a", %1              \n\t"
438         "add %%"REG_a", %2              \n\t"
441         : "+g"(h), "+r" (pixels), "+r" (block)
442         : "r"((long)line_size)
/* Copy a 16-pixel-wide block, 4 rows per loop pass — each row is two movq
 * transfers (offset 0 and 8).
 * NOTE(review): asm opener, loop label and branch elided (orig gaps
 * 448-449, 451-452, 473-474, 477-480). */
447 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
450         "lea (%3, %3), %%"REG_a"        \n\t"
453         "movq (%1), %%mm0               \n\t"
454         "movq 8(%1), %%mm4              \n\t"
455         "movq (%1, %3), %%mm1           \n\t"
456         "movq 8(%1, %3), %%mm5          \n\t"
457         "movq %%mm0, (%2)               \n\t"
458         "movq %%mm4, 8(%2)              \n\t"
459         "movq %%mm1, (%2, %3)           \n\t"
460         "movq %%mm5, 8(%2, %3)          \n\t"
461         "add %%"REG_a", %1              \n\t"
462         "add %%"REG_a", %2              \n\t"
463         "movq (%1), %%mm0               \n\t"
464         "movq 8(%1), %%mm4              \n\t"
465         "movq (%1, %3), %%mm1           \n\t"
466         "movq 8(%1, %3), %%mm5          \n\t"
467         "movq %%mm0, (%2)               \n\t"
468         "movq %%mm4, 8(%2)              \n\t"
469         "movq %%mm1, (%2, %3)           \n\t"
470         "movq %%mm5, 8(%2, %3)          \n\t"
471         "add %%"REG_a", %1              \n\t"
472         "add %%"REG_a", %2              \n\t"
475         : "+g"(h), "+r" (pixels), "+r" (block)
476         : "r"((long)line_size)
/* Zero six 64-entry DCTELEM blocks (6*128 bytes) with movq stores of a
 * zeroed mm7, 32 bytes per iteration, indexing upward from -128*6.
 * NOTE(review): asm opener, "1:" label, js/jne back-branch and closer are
 * elided (orig gaps 482-483, 486, 492, 494-497). */
481 static void clear_blocks_mmx(DCTELEM *blocks)
484                 "pxor %%mm7, %%mm7              \n\t"
485                 "mov $-128*6, %%"REG_a"         \n\t"
487                 "movq %%mm7, (%0, %%"REG_a")    \n\t"
488                 "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
489                 "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
490                 "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
491                 "add $32, %%"REG_a"             \n\t"
493                 : : "r" (((uint8_t *)blocks)+128*6)
498 #ifdef CONFIG_ENCODERS
/* Sum all pixels of a 16-wide block: widen bytes to words, accumulate into
 * mm6, then fold mm6 horizontally (two shift+add steps) and mask to 16 bits.
 * NOTE(review): `h` and `sum` declarations, asm opener, loop label, index
 * increment and branch are elided (orig gaps 500-501, 503-504, 507,
 * 520-521, 532-535); the 8(%2,%1) loads suggest only half the 16-pixel row
 * is read per pass — confirm loop structure against the full file. */
499 static int pix_sum16_mmx(uint8_t * pix, int line_size){
502     long index= -line_size*h;
505                 "pxor %%mm7, %%mm7              \n\t"
506                 "pxor %%mm6, %%mm6              \n\t"
508                 "movq (%2, %1), %%mm0           \n\t"
509                 "movq (%2, %1), %%mm1           \n\t"
510                 "movq 8(%2, %1), %%mm2          \n\t"
511                 "movq 8(%2, %1), %%mm3          \n\t"
512                 "punpcklbw %%mm7, %%mm0         \n\t"
513                 "punpckhbw %%mm7, %%mm1         \n\t"
514                 "punpcklbw %%mm7, %%mm2         \n\t"
515                 "punpckhbw %%mm7, %%mm3         \n\t"
516                 "paddw %%mm0, %%mm1             \n\t"
517                 "paddw %%mm2, %%mm3             \n\t"
518                 "paddw %%mm1, %%mm3             \n\t"
519                 "paddw %%mm3, %%mm6             \n\t"
522                 "movq %%mm6, %%mm5              \n\t"
523                 "psrlq $32, %%mm6               \n\t"
524                 "paddw %%mm5, %%mm6             \n\t"
525                 "movq %%mm6, %%mm5              \n\t"
526                 "psrlq $16, %%mm6               \n\t"
527                 "paddw %%mm5, %%mm6             \n\t"
528                 "movd %%mm6, %0                 \n\t"
529                 "andl $0xFFFF, %0               \n\t"
530                 : "=&r" (sum), "+r" (index)
531                 : "r" (pix - index), "r" ((long)line_size)
536 #endif //CONFIG_ENCODERS
/* dst[i] += src[i] for i in [0,w): the MMX loop handles 16 bytes per pass
 * (wrapping paddb — byte overflow wraps by design here), and a scalar C
 * tail (partially visible at the bottom) finishes the remaining <16 bytes.
 * NOTE(review): loop variable setup, label/branch, output constraints and
 * the tail-loop control are elided (orig gaps 539-541, 550-553, 555-556,
 * 558-559). */
538 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
542         "movq  (%1, %0), %%mm0          \n\t"
543         "movq  (%2, %0), %%mm1          \n\t"
544         "paddb %%mm0, %%mm1             \n\t"
545         "movq %%mm1, (%2, %0)           \n\t"
546         "movq 8(%1, %0), %%mm0          \n\t"
547         "movq 8(%2, %0), %%mm1          \n\t"
548         "paddb %%mm0, %%mm1             \n\t"
549         "movq %%mm1, 8(%2, %0)          \n\t"
554         : "r"(src), "r"(dst), "r"((long)w-15)
557         dst[i+0] += src[i+0];
/* Core of the H.263 deblocking loop filter as a reusable asm fragment.
 * Operand contract (used by both the vertical and horizontal filters):
 *   %0..%3 = the four 8-pixel rows straddling the block edge,
 *   %4     = 2*strength, %5 = the ff_pb_FC byte mask.
 * Computes the filter delta d from the row differences (the <<2 weighting
 * of the inner-row difference is visible in the psllw $2 pair), clamps it
 * by strength with the psubusb ladder, and leaves the filtered rows in
 * mm5/mm3/mm4/mm6 for the caller to store.  Sign handling is done by
 * conditional negate via pcmpgtw/pxor/psubw.
 * NOTE(review): exact arithmetic correspondence to the C reference filter
 * is not re-derivable from this excerpt alone — treat the ordering as
 * load-bearing and do not reorder. */
560 #define H263_LOOP_FILTER \
561         "pxor %%mm7, %%mm7              \n\t"\
562         "movq  %0, %%mm0                \n\t"\
563         "movq  %0, %%mm1                \n\t"\
564         "movq  %3, %%mm2                \n\t"\
565         "movq  %3, %%mm3                \n\t"\
566         "punpcklbw %%mm7, %%mm0         \n\t"\
567         "punpckhbw %%mm7, %%mm1         \n\t"\
568         "punpcklbw %%mm7, %%mm2         \n\t"\
569         "punpckhbw %%mm7, %%mm3         \n\t"\
570         "psubw %%mm2, %%mm0             \n\t"\
571         "psubw %%mm3, %%mm1             \n\t"\
572         "movq  %1, %%mm2                \n\t"\
573         "movq  %1, %%mm3                \n\t"\
574         "movq  %2, %%mm4                \n\t"\
575         "movq  %2, %%mm5                \n\t"\
576         "punpcklbw %%mm7, %%mm2         \n\t"\
577         "punpckhbw %%mm7, %%mm3         \n\t"\
578         "punpcklbw %%mm7, %%mm4         \n\t"\
579         "punpckhbw %%mm7, %%mm5         \n\t"\
580         "psubw %%mm2, %%mm4             \n\t"\
581         "psubw %%mm3, %%mm5             \n\t"\
582         "psllw $2, %%mm4                \n\t"\
583         "psllw $2, %%mm5                \n\t"\
584         "paddw %%mm0, %%mm4             \n\t"\
585         "paddw %%mm1, %%mm5             \n\t"\
586         "pxor %%mm6, %%mm6              \n\t"\
587         "pcmpgtw %%mm4, %%mm6           \n\t"\
588         "pcmpgtw %%mm5, %%mm7           \n\t"\
589         "pxor %%mm6, %%mm4              \n\t"\
590         "pxor %%mm7, %%mm5              \n\t"\
591         "psubw %%mm6, %%mm4             \n\t"\
592         "psubw %%mm7, %%mm5             \n\t"\
593         "psrlw $3, %%mm4                \n\t"\
594         "psrlw $3, %%mm5                \n\t"\
595         "packuswb %%mm5, %%mm4          \n\t"\
596         "packsswb %%mm7, %%mm6          \n\t"\
597         "pxor %%mm7, %%mm7              \n\t"\
598         "movd %4, %%mm2                 \n\t"\
599         "punpcklbw %%mm2, %%mm2         \n\t"\
600         "punpcklbw %%mm2, %%mm2         \n\t"\
601         "punpcklbw %%mm2, %%mm2         \n\t"\
602         "psubusb %%mm4, %%mm2           \n\t"\
603         "movq %%mm2, %%mm3              \n\t"\
604         "psubusb %%mm4, %%mm3           \n\t"\
605         "psubb %%mm3, %%mm2             \n\t"\
606         "movq %1, %%mm3                 \n\t"\
607         "movq %2, %%mm4                 \n\t"\
608         "pxor %%mm6, %%mm3              \n\t"\
609         "pxor %%mm6, %%mm4              \n\t"\
610         "paddusb %%mm2, %%mm3           \n\t"\
611         "psubusb %%mm2, %%mm4           \n\t"\
612         "pxor %%mm6, %%mm3              \n\t"\
613         "pxor %%mm6, %%mm4              \n\t"\
614         "paddusb %%mm2, %%mm2           \n\t"\
615         "packsswb %%mm1, %%mm0          \n\t"\
616         "pcmpgtb %%mm0, %%mm7           \n\t"\
617         "pxor %%mm7, %%mm0              \n\t"\
618         "psubb %%mm7, %%mm0             \n\t"\
619         "movq %%mm0, %%mm1              \n\t"\
620         "psubusb %%mm2, %%mm0           \n\t"\
621         "psubb %%mm0, %%mm1             \n\t"\
622         "pand %5, %%mm1                 \n\t"\
623         "psrlw $2, %%mm1                \n\t"\
624         "pxor %%mm7, %%mm1              \n\t"\
625         "psubb %%mm7, %%mm1             \n\t"\
626         "movq %0, %%mm5                 \n\t"\
627         "movq %3, %%mm6                 \n\t"\
628         "psubb %%mm1, %%mm5             \n\t"\
629         "paddb %%mm1, %%mm6             \n\t"
/* Vertical (horizontal-edge) H.263 loop filter: runs H263_LOOP_FILTER on
 * the four rows around src, then stores the filtered rows (mm3/mm4/mm5/mm6)
 * back to the "+m" row operands.  Filter strength looked up per qscale.
 * NOTE(review): the asm opener and the H263_LOOP_FILTER invocation line are
 * elided from this excerpt (orig gaps 634-638, 648-651). */
631 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
632     if(ENABLE_ANY_H263) {
633     const int strength= ff_h263_loop_filter_strength[qscale];
639         "movq %%mm3, %1                 \n\t"
640         "movq %%mm4, %2                 \n\t"
641         "movq %%mm5, %0                 \n\t"
642         "movq %%mm6, %3                 \n\t"
643         : "+m" (*(uint64_t*)(src - 2*stride)),
644           "+m" (*(uint64_t*)(src - 1*stride)),
645           "+m" (*(uint64_t*)(src + 0*stride)),
646           "+m" (*(uint64_t*)(src + 1*stride))
647         : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose a 4x4 byte tile from src (src_stride) into dst (dst_stride)
 * using the classic punpcklbw/punpcklwd interleave ladder, reading and
 * writing 32-bit columns/rows via "m" constraints. */
652 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
653     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
654         "movd  %4, %%mm0                \n\t"
655         "movd  %5, %%mm1                \n\t"
656         "movd  %6, %%mm2                \n\t"
657         "movd  %7, %%mm3                \n\t"
658         "punpcklbw %%mm1, %%mm0         \n\t"
659         "punpcklbw %%mm3, %%mm2         \n\t"
660         "movq %%mm0, %%mm1              \n\t"
661         "punpcklwd %%mm2, %%mm0         \n\t"
662         "punpckhwd %%mm2, %%mm1         \n\t"
663         "movd  %%mm0, %0                \n\t"
664         "punpckhdq %%mm0, %%mm0         \n\t"
665         "movd  %%mm0, %1                \n\t"
666         "movd  %%mm1, %2                \n\t"
667         "punpckhdq %%mm1, %%mm1         \n\t"
668         "movd  %%mm1, %3                \n\t"
670         : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
671           "=m" (*(uint32_t*)(dst + 1*dst_stride)),
672           "=m" (*(uint32_t*)(dst + 2*dst_stride)),
673           "=m" (*(uint32_t*)(dst + 3*dst_stride))
674         :  "m" (*(uint32_t*)(src + 0*src_stride)),
675            "m" (*(uint32_t*)(src + 1*src_stride)),
676            "m" (*(uint32_t*)(src + 2*src_stride)),
677            "m" (*(uint32_t*)(src + 3*src_stride))
/* Horizontal (vertical-edge) H.263 loop filter: transposes the 8x4 edge
 * region into an aligned temp buffer, applies H263_LOOP_FILTER there, then
 * transposes the filtered rows back to src with an inline 8x4 interleave
 * (mm5/mm3/mm4/mm6 hold rows, as left by the filter macro).
 * NOTE(review): the asm openers/closers, the filter asm's operand list and
 * the second asm's output constraint list are elided from this excerpt
 * (orig gaps 686-688, 691, 693-697, 699-701, 726, 730-733). */
681 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
682     if(ENABLE_ANY_H263) {
683     const int strength= ff_h263_loop_filter_strength[qscale];
684     DECLARE_ALIGNED(8, uint64_t, temp[4]);
685     uint8_t *btemp= (uint8_t*)temp;
689     transpose4x4(btemp  , src           , 8, stride);
690     transpose4x4(btemp+4, src + 4*stride, 8, stride);
692         H263_LOOP_FILTER // 5 3 4 6
698         : "g" (2*strength), "m"(ff_pb_FC)
702         "movq %%mm5, %%mm1              \n\t"
703         "movq %%mm4, %%mm0              \n\t"
704         "punpcklbw %%mm3, %%mm5         \n\t"
705         "punpcklbw %%mm6, %%mm4         \n\t"
706         "punpckhbw %%mm3, %%mm1         \n\t"
707         "punpckhbw %%mm6, %%mm0         \n\t"
708         "movq %%mm5, %%mm3              \n\t"
709         "movq %%mm1, %%mm6              \n\t"
710         "punpcklwd %%mm4, %%mm5         \n\t"
711         "punpcklwd %%mm0, %%mm1         \n\t"
712         "punpckhwd %%mm4, %%mm3         \n\t"
713         "punpckhwd %%mm0, %%mm6         \n\t"
714         "movd %%mm5, (%0)               \n\t"
715         "punpckhdq %%mm5, %%mm5         \n\t"
716         "movd %%mm5, (%0,%2)            \n\t"
717         "movd %%mm3, (%0,%2,2)          \n\t"
718         "punpckhdq %%mm3, %%mm3         \n\t"
719         "movd %%mm3, (%0,%3)            \n\t"
720         "movd %%mm1, (%1)               \n\t"
721         "punpckhdq %%mm1, %%mm1         \n\t"
722         "movd %%mm1, (%1,%2)            \n\t"
723         "movd %%mm6, (%1,%2,2)          \n\t"
724         "punpckhdq %%mm6, %%mm6         \n\t"
725         "movd %%mm6, (%1,%3)            \n\t"
727           "r" (src + 4*stride),
728           "r" ((long)   stride ),
729           "r" ((long)(3*stride))
734 #ifdef CONFIG_ENCODERS
/* Sum of squared pixel values of a 16-wide block: widen bytes, pmaddwd to
 * get dword pairs of squared sums, accumulate in mm7, then fold the two
 * dwords.  %ecx is clobbered (declared in the clobber list) — presumably
 * the elided loop counter.
 * NOTE(review): `tmp` declaration, asm opener, loop setup/label/branch and
 * the movd of the final result are elided (orig gaps 736-741, 744, 746,
 * 749, 753, 756, 759, 764-770, 773, 775-777). */
735 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
742       "movq  (%0),%%mm2\n"      /* mm2 = pix[0-7] */
743       "movq  8(%0),%%mm3\n"     /* mm3 = pix[8-15] */
745       "movq  %%mm2,%%mm1\n"     /* mm1 = mm2 = pix[0-7] */
747       "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
748       "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
750       "movq  %%mm3,%%mm4\n"     /* mm4 = mm3 = pix[8-15] */
751       "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
752       "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
754       "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
755       "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
757       "pmaddwd %%mm3,%%mm3\n"
758       "pmaddwd %%mm4,%%mm4\n"
760       "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
761                                           pix2^2+pix3^2+pix6^2+pix7^2) */
762       "paddd %%mm3,%%mm4\n"
763       "paddd %%mm2,%%mm7\n"
766       "paddd %%mm4,%%mm7\n"
771       "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
772       "paddd %%mm7,%%mm1\n"
774       : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
/* Sum of squared errors between two 8-wide blocks, two rows per iteration.
 * Absolute difference via the dual-psubusb + por trick (|a-b| without an
 * abs instruction), then widen and pmaddwd/paddd into mm7; fold at the end.
 * NOTE(review): `tmp` declaration, asm opener, the movq saves into mm5/mm6,
 * the por pair, the intra-register copies, loop control, movd of the
 * result, and return are elided (orig numbering gaps). */
778 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
783       "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
784       "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
786       "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
787       "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
788       "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
789       "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
791       /* todo: mm1-mm2, mm3-mm4 */
792       /* algo: subtract mm1 from mm2 with saturation and vice versa */
793       /*       OR the results to get absolute difference */
796       "psubusb %%mm2,%%mm1\n"
797       "psubusb %%mm4,%%mm3\n"
798       "psubusb %%mm5,%%mm2\n"
799       "psubusb %%mm6,%%mm4\n"
804       /* now convert to 16-bit vectors so we can square them */
808       "punpckhbw %%mm0,%%mm2\n"
809       "punpckhbw %%mm0,%%mm4\n"
810       "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
811       "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
813       "pmaddwd %%mm2,%%mm2\n"
814       "pmaddwd %%mm4,%%mm4\n"
815       "pmaddwd %%mm1,%%mm1\n"
816       "pmaddwd %%mm3,%%mm3\n"
818       "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
819       "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
821       "paddd %%mm2,%%mm1\n"
822       "paddd %%mm4,%%mm3\n"
823       "paddd %%mm1,%%mm7\n"
824       "paddd %%mm3,%%mm7\n"
830       "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
831       "paddd %%mm7,%%mm1\n"
833       : "+r" (pix1), "+r" (pix2), "=r"(tmp)
834       : "r" ((long)line_size) , "m" (h)
/* SSE (sum of squared errors) for 16-wide blocks with plain MMX: same
 * |a-b| = psubusb/psubusb/por trick as sse8_mmx, but one full 16-byte row
 * per iteration (offsets 0 and 8) instead of two 8-byte rows.
 * NOTE(review): same elisions as sse8_mmx (declarations, asm opener,
 * mm5/mm6 saves, por pair, loop control, final movd/return). */
839 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
843       "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
844       "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
846       "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
847       "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
848       "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
849       "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
851       /* todo: mm1-mm2, mm3-mm4 */
852       /* algo: subtract mm1 from mm2 with saturation and vice versa */
853       /*       OR the results to get absolute difference */
856       "psubusb %%mm2,%%mm1\n"
857       "psubusb %%mm4,%%mm3\n"
858       "psubusb %%mm5,%%mm2\n"
859       "psubusb %%mm6,%%mm4\n"
864       /* now convert to 16-bit vectors so we can square them */
868       "punpckhbw %%mm0,%%mm2\n"
869       "punpckhbw %%mm0,%%mm4\n"
870       "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
871       "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
873       "pmaddwd %%mm2,%%mm2\n"
874       "pmaddwd %%mm4,%%mm4\n"
875       "pmaddwd %%mm1,%%mm1\n"
876       "pmaddwd %%mm3,%%mm3\n"
881       "paddd %%mm2,%%mm1\n"
882       "paddd %%mm4,%%mm3\n"
883       "paddd %%mm1,%%mm7\n"
884       "paddd %%mm3,%%mm7\n"
890       "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
891       "paddd %%mm7,%%mm1\n"
893       : "+r" (pix1), "+r" (pix2), "=r"(tmp)
894       : "r" ((long)line_size) , "m" (h)
/* SSE2 version of sse16: same absolute-difference + pmaddwd algorithm,
 * processing two full 16-byte rows per iteration in xmm registers
 * (movdqu — pixels need not be 16-byte aligned).  Final fold shifts the
 * 128-bit accumulator down by 8 then 4 bytes with psrldq.  h is counted
 * down in-register ("+r"(h)); the decrement/branch lines are elided.
 * NOTE(review): `tmp` declaration, asm opener, loop label/branch and the
 * final movd/return are elided from this excerpt. */
899 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
903       "pxor %%xmm0,%%xmm0\n"      /* mm0 = 0 */
904       "pxor %%xmm7,%%xmm7\n"      /* mm7 holds the sum */
906       "movdqu (%0),%%xmm1\n"      /* mm1 = pix1[0][0-15] */
907       "movdqu (%1),%%xmm2\n"      /* mm2 = pix2[0][0-15] */
908       "movdqu (%0,%4),%%xmm3\n"   /* mm3 = pix1[1][0-15] */
909       "movdqu (%1,%4),%%xmm4\n"   /* mm4 = pix2[1][0-15] */
911       /* todo: mm1-mm2, mm3-mm4 */
912       /* algo: subtract mm1 from mm2 with saturation and vice versa */
913       /*       OR the results to get absolute difference */
914       "movdqa %%xmm1,%%xmm5\n"
915       "movdqa %%xmm3,%%xmm6\n"
916       "psubusb %%xmm2,%%xmm1\n"
917       "psubusb %%xmm4,%%xmm3\n"
918       "psubusb %%xmm5,%%xmm2\n"
919       "psubusb %%xmm6,%%xmm4\n"
921       "por %%xmm1,%%xmm2\n"
922       "por %%xmm3,%%xmm4\n"
924       /* now convert to 16-bit vectors so we can square them */
925       "movdqa %%xmm2,%%xmm1\n"
926       "movdqa %%xmm4,%%xmm3\n"
928       "punpckhbw %%xmm0,%%xmm2\n"
929       "punpckhbw %%xmm0,%%xmm4\n"
930       "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
931       "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
933       "pmaddwd %%xmm2,%%xmm2\n"
934       "pmaddwd %%xmm4,%%xmm4\n"
935       "pmaddwd %%xmm1,%%xmm1\n"
936       "pmaddwd %%xmm3,%%xmm3\n"
938       "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
939       "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
941       "paddd %%xmm2,%%xmm1\n"
942       "paddd %%xmm4,%%xmm3\n"
943       "paddd %%xmm1,%%xmm7\n"
944       "paddd %%xmm3,%%xmm7\n"
949       "movdqa %%xmm7,%%xmm1\n"
950       "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
951       "paddd %%xmm1,%%xmm7\n"
952       "movdqa %%xmm7,%%xmm1\n"
953       "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
954       "paddd %%xmm1,%%xmm7\n"
956       : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
957       : "r" ((long)line_size));
/* High-frequency "noise" metric over an 8-wide strip: accumulates, into
 * mm6, the absolute second differences of horizontally-adjacent pixel
 * differences across consecutive rows (the repeated widen / psubw /
 * sign-fold-via-pcmpgtw pattern), then folds mm6 word->dword->scalar.
 * Used by the nsse* comparators below with weight h-2 rows.
 * NOTE(review): this excerpt elides the asm opener, the row loads
 * ("movq (%0),.." / "movq 1(%0),..'"), pointer advances, the loop
 * label/decrement/branch and the final movd/return — the three visible
 * repeated stanzas are the software-pipelined loop body; do not infer
 * exact row scheduling from this fragment alone. */
961 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
969       "movq %%mm0, %%mm1\n"
973       "movq %%mm0, %%mm2\n"
974       "movq %%mm1, %%mm3\n"
975       "punpcklbw %%mm7,%%mm0\n"
976       "punpcklbw %%mm7,%%mm1\n"
977       "punpckhbw %%mm7,%%mm2\n"
978       "punpckhbw %%mm7,%%mm3\n"
979       "psubw %%mm1, %%mm0\n"
980       "psubw %%mm3, %%mm2\n"
985       "movq %%mm4, %%mm1\n"
989       "movq %%mm4, %%mm5\n"
990       "movq %%mm1, %%mm3\n"
991       "punpcklbw %%mm7,%%mm4\n"
992       "punpcklbw %%mm7,%%mm1\n"
993       "punpckhbw %%mm7,%%mm5\n"
994       "punpckhbw %%mm7,%%mm3\n"
995       "psubw %%mm1, %%mm4\n"
996       "psubw %%mm3, %%mm5\n"
997       "psubw %%mm4, %%mm0\n"
998       "psubw %%mm5, %%mm2\n"
999       "pxor %%mm3, %%mm3\n"
1000       "pxor %%mm1, %%mm1\n"
1001       "pcmpgtw %%mm0, %%mm3\n\t"
1002       "pcmpgtw %%mm2, %%mm1\n\t"
1003       "pxor %%mm3, %%mm0\n"
1004       "pxor %%mm1, %%mm2\n"
1005       "psubw %%mm3, %%mm0\n"
1006       "psubw %%mm1, %%mm2\n"
1007       "paddw %%mm0, %%mm2\n"
1008       "paddw %%mm2, %%mm6\n"
1014       "movq %%mm0, %%mm1\n"
1018       "movq %%mm0, %%mm2\n"
1019       "movq %%mm1, %%mm3\n"
1020       "punpcklbw %%mm7,%%mm0\n"
1021       "punpcklbw %%mm7,%%mm1\n"
1022       "punpckhbw %%mm7,%%mm2\n"
1023       "punpckhbw %%mm7,%%mm3\n"
1024       "psubw %%mm1, %%mm0\n"
1025       "psubw %%mm3, %%mm2\n"
1026       "psubw %%mm0, %%mm4\n"
1027       "psubw %%mm2, %%mm5\n"
1028       "pxor %%mm3, %%mm3\n"
1029       "pxor %%mm1, %%mm1\n"
1030       "pcmpgtw %%mm4, %%mm3\n\t"
1031       "pcmpgtw %%mm5, %%mm1\n\t"
1032       "pxor %%mm3, %%mm4\n"
1033       "pxor %%mm1, %%mm5\n"
1034       "psubw %%mm3, %%mm4\n"
1035       "psubw %%mm1, %%mm5\n"
1036       "paddw %%mm4, %%mm5\n"
1037       "paddw %%mm5, %%mm6\n"
1042       "movq %%mm4, %%mm1\n"
1046       "movq %%mm4, %%mm5\n"
1047       "movq %%mm1, %%mm3\n"
1048       "punpcklbw %%mm7,%%mm4\n"
1049       "punpcklbw %%mm7,%%mm1\n"
1050       "punpckhbw %%mm7,%%mm5\n"
1051       "punpckhbw %%mm7,%%mm3\n"
1052       "psubw %%mm1, %%mm4\n"
1053       "psubw %%mm3, %%mm5\n"
1054       "psubw %%mm4, %%mm0\n"
1055       "psubw %%mm5, %%mm2\n"
1056       "pxor %%mm3, %%mm3\n"
1057       "pxor %%mm1, %%mm1\n"
1058       "pcmpgtw %%mm0, %%mm3\n\t"
1059       "pcmpgtw %%mm2, %%mm1\n\t"
1060       "pxor %%mm3, %%mm0\n"
1061       "pxor %%mm1, %%mm2\n"
1062       "psubw %%mm3, %%mm0\n"
1063       "psubw %%mm1, %%mm2\n"
1064       "paddw %%mm0, %%mm2\n"
1065       "paddw %%mm2, %%mm6\n"
1071       "movq %%mm6, %%mm0\n"
1072       "punpcklwd %%mm7,%%mm0\n"
1073       "punpckhwd %%mm7,%%mm6\n"
1074       "paddd %%mm0, %%mm6\n"
1076       "movq %%mm6,%%mm0\n"
1077       "psrlq $32, %%mm6\n"
1078       "paddd %%mm6,%%mm0\n"
1080       : "+r" (pix1), "=r"(tmp)
1081       : "r" ((long)line_size) , "g" (h-2)
/* 16-wide variant of hf_noise8_mmx: identical second-difference metric,
 * but the horizontal difference comes from an unaligned re-load at +1
 * ("movq 1(%0),...") instead of register copies; the right 8 columns are
 * handled by the recursive call to hf_noise8_mmx(pix+8, ...) at the end.
 * NOTE(review): as with hf_noise8_mmx, the asm opener, base-row loads,
 * pointer advances and loop control are elided from this excerpt; the
 * three visible stanzas are the pipelined loop body. */
1086 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1088     uint8_t * pix= pix1;
1091       "pxor %%mm7,%%mm7\n"
1092       "pxor %%mm6,%%mm6\n"
1095       "movq 1(%0),%%mm1\n"
1096       "movq %%mm0, %%mm2\n"
1097       "movq %%mm1, %%mm3\n"
1098       "punpcklbw %%mm7,%%mm0\n"
1099       "punpcklbw %%mm7,%%mm1\n"
1100       "punpckhbw %%mm7,%%mm2\n"
1101       "punpckhbw %%mm7,%%mm3\n"
1102       "psubw %%mm1, %%mm0\n"
1103       "psubw %%mm3, %%mm2\n"
1108       "movq 1(%0),%%mm1\n"
1109       "movq %%mm4, %%mm5\n"
1110       "movq %%mm1, %%mm3\n"
1111       "punpcklbw %%mm7,%%mm4\n"
1112       "punpcklbw %%mm7,%%mm1\n"
1113       "punpckhbw %%mm7,%%mm5\n"
1114       "punpckhbw %%mm7,%%mm3\n"
1115       "psubw %%mm1, %%mm4\n"
1116       "psubw %%mm3, %%mm5\n"
1117       "psubw %%mm4, %%mm0\n"
1118       "psubw %%mm5, %%mm2\n"
1119       "pxor %%mm3, %%mm3\n"
1120       "pxor %%mm1, %%mm1\n"
1121       "pcmpgtw %%mm0, %%mm3\n\t"
1122       "pcmpgtw %%mm2, %%mm1\n\t"
1123       "pxor %%mm3, %%mm0\n"
1124       "pxor %%mm1, %%mm2\n"
1125       "psubw %%mm3, %%mm0\n"
1126       "psubw %%mm1, %%mm2\n"
1127       "paddw %%mm0, %%mm2\n"
1128       "paddw %%mm2, %%mm6\n"
1134       "movq 1(%0),%%mm1\n"
1135       "movq %%mm0, %%mm2\n"
1136       "movq %%mm1, %%mm3\n"
1137       "punpcklbw %%mm7,%%mm0\n"
1138       "punpcklbw %%mm7,%%mm1\n"
1139       "punpckhbw %%mm7,%%mm2\n"
1140       "punpckhbw %%mm7,%%mm3\n"
1141       "psubw %%mm1, %%mm0\n"
1142       "psubw %%mm3, %%mm2\n"
1143       "psubw %%mm0, %%mm4\n"
1144       "psubw %%mm2, %%mm5\n"
1145       "pxor %%mm3, %%mm3\n"
1146       "pxor %%mm1, %%mm1\n"
1147       "pcmpgtw %%mm4, %%mm3\n\t"
1148       "pcmpgtw %%mm5, %%mm1\n\t"
1149       "pxor %%mm3, %%mm4\n"
1150       "pxor %%mm1, %%mm5\n"
1151       "psubw %%mm3, %%mm4\n"
1152       "psubw %%mm1, %%mm5\n"
1153       "paddw %%mm4, %%mm5\n"
1154       "paddw %%mm5, %%mm6\n"
1159       "movq 1(%0),%%mm1\n"
1160       "movq %%mm4, %%mm5\n"
1161       "movq %%mm1, %%mm3\n"
1162       "punpcklbw %%mm7,%%mm4\n"
1163       "punpcklbw %%mm7,%%mm1\n"
1164       "punpckhbw %%mm7,%%mm5\n"
1165       "punpckhbw %%mm7,%%mm3\n"
1166       "psubw %%mm1, %%mm4\n"
1167       "psubw %%mm3, %%mm5\n"
1168       "psubw %%mm4, %%mm0\n"
1169       "psubw %%mm5, %%mm2\n"
1170       "pxor %%mm3, %%mm3\n"
1171       "pxor %%mm1, %%mm1\n"
1172       "pcmpgtw %%mm0, %%mm3\n\t"
1173       "pcmpgtw %%mm2, %%mm1\n\t"
1174       "pxor %%mm3, %%mm0\n"
1175       "pxor %%mm1, %%mm2\n"
1176       "psubw %%mm3, %%mm0\n"
1177       "psubw %%mm1, %%mm2\n"
1178       "paddw %%mm0, %%mm2\n"
1179       "paddw %%mm2, %%mm6\n"
1185       "movq %%mm6, %%mm0\n"
1186       "punpcklwd %%mm7,%%mm0\n"
1187       "punpckhwd %%mm7,%%mm6\n"
1188       "paddd %%mm0, %%mm6\n"
1190       "movq %%mm6,%%mm0\n"
1191       "psrlq $32, %%mm6\n"
1192       "paddd %%mm6,%%mm0\n"
1194       : "+r" (pix1), "=r"(tmp)
1195       : "r" ((long)line_size) , "g" (h-2)
/* Right half of the 16-wide strip is scored by the 8-wide routine. */
1197     return tmp + hf_noise8_mmx(pix+8, line_size, h);
/* Noise-weighted SSE comparator (16-wide): base SSE plus the difference of
 * the two blocks' high-frequency noise scores, weighted by the encoder's
 * nsse_weight (or 8 when no context is available, e.g. during init probes).
 * NOTE(review): the `int score1, score2;` declarations and closing brace
 * are elided from this excerpt (orig gaps 1202-1203, 1207, 1210). */
1200 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1201     MpegEncContext *c = p;
1204     if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1205     else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1206     score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1208     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1209     else  return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_mmx; unlike the 16-wide version it always uses
 * sse8_mmx directly rather than dispatching through c->dsp.sse.
 * NOTE(review): closing brace elided from this excerpt (orig gap 1216,
 * 1219-1220). */
1212 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1213     MpegEncContext *c = p;
1214     int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1215     int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1217     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1218     else  return score1 + FFABS(score2)*8;
/* Vertical SAD, intra (no reference block): sums |pix[y] - pix[y+1]| over a
 * 16-wide strip.  The SUM macro computes per-row absolute differences with
 * the psubusb/por trick, widens, and accumulates into mm6; the loop is
 * software-pipelined by alternating the in/out register pairs.  Requires
 * 8-byte-aligned pix and line_size (asserted).
 * NOTE(review): asm opener, the initial row load, line_size advances, loop
 * decrement/branch, #undef SUM, final movd and the 0xFFFF return masking
 * context are partially elided (orig gaps 1222-1226, 1230, 1249-1252,
 * 1255, 1257-1258, 1260-1263, 1265-1268, 1275, 1278). */
1221 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1224     assert( (((int)pix) & 7) == 0);
1225     assert((line_size &7) ==0);
1227 #define SUM(in0, in1, out0, out1) \
1228       "movq (%0), %%mm2\n"\
1229       "movq 8(%0), %%mm3\n"\
1231       "movq %%mm2, " #out0 "\n"\
1232       "movq %%mm3, " #out1 "\n"\
1233       "psubusb " #in0 ", %%mm2\n"\
1234       "psubusb " #in1 ", %%mm3\n"\
1235       "psubusb " #out0 ", " #in0 "\n"\
1236       "psubusb " #out1 ", " #in1 "\n"\
1237       "por %%mm2, " #in0 "\n"\
1238       "por %%mm3, " #in1 "\n"\
1239       "movq " #in0 ", %%mm2\n"\
1240       "movq " #in1 ", %%mm3\n"\
1241       "punpcklbw %%mm7, " #in0 "\n"\
1242       "punpcklbw %%mm7, " #in1 "\n"\
1243       "punpckhbw %%mm7, %%mm2\n"\
1244       "punpckhbw %%mm7, %%mm3\n"\
1245       "paddw " #in1 ", " #in0 "\n"\
1246       "paddw %%mm3, %%mm2\n"\
1247       "paddw %%mm2, " #in0 "\n"\
1248       "paddw " #in0 ", %%mm6\n"
1253       "pxor %%mm6,%%mm6\n"
1254       "pxor %%mm7,%%mm7\n"
1256       "movq 8(%0),%%mm1\n"
1259       SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1262       SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1264       SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1269       "movq %%mm6,%%mm0\n"
1270       "psrlq $32, %%mm6\n"
1271       "paddw %%mm6,%%mm0\n"
1272       "movq %%mm0,%%mm6\n"
1273       "psrlq $16, %%mm0\n"
1274       "paddw %%mm6,%%mm0\n"
1276       : "+r" (pix), "=r"(tmp)
1277       : "r" ((long)line_size) , "m" (h)
/* Word accumulator: only the low 16 bits of the folded sum are valid. */
1279     return tmp & 0xFFFF;
/* MMX2 version of vsad_intra16: psadbw computes the 8-byte SAD in one
 * instruction, collapsing the SUM macro to four ops.  Same pipelined
 * in/out register rotation as the MMX version.
 * NOTE(review): asm opener, initial load, pointer advances, loop control,
 * #undef SUM, final movd and return are elided (orig gaps 1284-1288,
 * 1292, 1297-1299, 1302, 1304-1305, 1307-1308, 1310, 1312-1316,
 * 1319-1323). */
1283 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1286     assert( (((int)pix) & 7) == 0);
1287     assert((line_size &7) ==0);
1289 #define SUM(in0, in1, out0, out1) \
1290       "movq (%0), " #out0 "\n"\
1291       "movq 8(%0), " #out1 "\n"\
1293       "psadbw " #out0 ", " #in0 "\n"\
1294       "psadbw " #out1 ", " #in1 "\n"\
1295       "paddw " #in1 ", " #in0 "\n"\
1296       "paddw " #in0 ", %%mm6\n"
1300       "pxor %%mm6,%%mm6\n"
1301       "pxor %%mm7,%%mm7\n"
1303       "movq 8(%0),%%mm1\n"
1306       SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1309       SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1311       SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1317       : "+r" (pix), "=r"(tmp)
1318       : "r" ((long)line_size) , "m" (h)
/* Vertical SAD of the 16-wide difference pix1-pix2 (plain MMX): measures how
 * much the residual changes from one line to the next.  Both pointers must be
 * 8-byte aligned and line_size a multiple of 8 (asserted). */
1324 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1327 assert( (((int)pix1) & 7) == 0);
1328 assert( (((int)pix2) & 7) == 0);
1329 assert((line_size &7) ==0);
/* SUM: form the biased difference (pix1-pix2)^0x80 of the current line, then
 * |cur - prev| per byte via the psubusb/por trick, widen and add into mm6. */
1331 #define SUM(in0, in1, out0, out1) \
1332 "movq (%0),%%mm2\n"\
1333 "movq (%1)," #out0 "\n"\
1334 "movq 8(%0),%%mm3\n"\
1335 "movq 8(%1)," #out1 "\n"\
1338 "psubb " #out0 ", %%mm2\n"\
1339 "psubb " #out1 ", %%mm3\n"\
1340 "pxor %%mm7, %%mm2\n"\
1341 "pxor %%mm7, %%mm3\n"\
1342 "movq %%mm2, " #out0 "\n"\
1343 "movq %%mm3, " #out1 "\n"\
1344 "psubusb " #in0 ", %%mm2\n"\
1345 "psubusb " #in1 ", %%mm3\n"\
1346 "psubusb " #out0 ", " #in0 "\n"\
1347 "psubusb " #out1 ", " #in1 "\n"\
1348 "por %%mm2, " #in0 "\n"\
1349 "por %%mm3, " #in1 "\n"\
1350 "movq " #in0 ", %%mm2\n"\
1351 "movq " #in1 ", %%mm3\n"\
1352 "punpcklbw %%mm7, " #in0 "\n"\
1353 "punpcklbw %%mm7, " #in1 "\n"\
1354 "punpckhbw %%mm7, %%mm2\n"\
1355 "punpckhbw %%mm7, %%mm3\n"\
1356 "paddw " #in1 ", " #in0 "\n"\
1357 "paddw %%mm3, %%mm2\n"\
1358 "paddw %%mm2, " #in0 "\n"\
1359 "paddw " #in0 ", %%mm6\n"
/* Build the 0x8080...80 bias constant in mm7 (flips signed bytes to the
 * unsigned domain so psubusb works on the signed differences). */
1364 "pxor %%mm6,%%mm6\n"
1365 "pcmpeqw %%mm7,%%mm7\n"
1366 "psllw $15, %%mm7\n"
1367 "packsswb %%mm7, %%mm7\n"
/* Prime the pipeline with the biased difference of the first line. */
1370 "movq 8(%0),%%mm1\n"
1371 "movq 8(%1),%%mm3\n"
1375 "psubb %%mm2, %%mm0\n"
1376 "psubb %%mm3, %%mm1\n"
1377 "pxor %%mm7, %%mm0\n"
1378 "pxor %%mm7, %%mm1\n"
1379 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1382 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1384 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Horizontal reduction of the word accumulator. */
1389 "movq %%mm6,%%mm0\n"
1390 "psrlq $32, %%mm6\n"
1391 "paddw %%mm6,%%mm0\n"
1392 "movq %%mm0,%%mm6\n"
1393 "psrlq $16, %%mm0\n"
1394 "paddw %%mm6,%%mm0\n"
1396 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1397 : "r" ((long)line_size) , "m" (h)
/* Mask to 15 bits: the reduced word sum is only valid in the low word. */
1399 return tmp & 0x7FFF;
/* Vertical SAD of the 16-wide difference pix1-pix2 (MMX2): same contract as
 * vsad16_mmx but uses psadbw instead of the manual byte-abs-diff sequence. */
1403 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1406 assert( (((int)pix1) & 7) == 0);
1407 assert( (((int)pix2) & 7) == 0);
1408 assert((line_size &7) ==0);
/* SUM: biased difference of the current line, psadbw against the previous
 * biased difference, accumulate into mm6. */
1410 #define SUM(in0, in1, out0, out1) \
1411 "movq (%0)," #out0 "\n"\
1412 "movq (%1),%%mm2\n"\
1413 "movq 8(%0)," #out1 "\n"\
1414 "movq 8(%1),%%mm3\n"\
1417 "psubb %%mm2, " #out0 "\n"\
1418 "psubb %%mm3, " #out1 "\n"\
1419 "pxor %%mm7, " #out0 "\n"\
1420 "pxor %%mm7, " #out1 "\n"\
1421 "psadbw " #out0 ", " #in0 "\n"\
1422 "psadbw " #out1 ", " #in1 "\n"\
1423 "paddw " #in1 ", " #in0 "\n"\
1424 "paddw " #in0 ", %%mm6\n"
/* mm7 = 0x8080...80 bias constant (signed -> unsigned byte domain). */
1428 "pxor %%mm6,%%mm6\n"
1429 "pcmpeqw %%mm7,%%mm7\n"
1430 "psllw $15, %%mm7\n"
1431 "packsswb %%mm7, %%mm7\n"
/* Prime with the biased difference of the first line. */
1434 "movq 8(%0),%%mm1\n"
1435 "movq 8(%1),%%mm3\n"
1439 "psubb %%mm2, %%mm0\n"
1440 "psubb %%mm3, %%mm1\n"
1441 "pxor %%mm7, %%mm0\n"
1442 "pxor %%mm7, %%mm1\n"
1443 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1446 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1448 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1454 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1455 : "r" ((long)line_size) , "m" (h)
/* dst[i] = src1[i] - src2[i] for w bytes: 16 bytes per iteration in MMX
 * (wraparound byte subtraction via psubb), scalar loop for the tail. */
1461 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1465 "movq (%2, %0), %%mm0 \n\t"
1466 "movq (%1, %0), %%mm1 \n\t"
1467 "psubb %%mm0, %%mm1 \n\t"
1468 "movq %%mm1, (%3, %0) \n\t"
1469 "movq 8(%2, %0), %%mm0 \n\t"
1470 "movq 8(%1, %0), %%mm1 \n\t"
1471 "psubb %%mm0, %%mm1 \n\t"
1472 "movq %%mm1, 8(%3, %0) \n\t"
/* w-15: the MMX loop stops early enough that each 16-byte step stays in bounds. */
1477 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
/* Scalar tail for the remaining (w mod 16) bytes. */
1480 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual (MMX2): dst = src2 - median(L, T, L+T-LT)
 * where L = left, T = top, LT = top-left neighbour.  The first pixel and the
 * left/left_top carry values are handled in scalar code around the asm. */
1483 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1489 "movq -1(%1, %0), %%mm0 \n\t" // LT
1490 "movq (%1, %0), %%mm1 \n\t" // T
1491 "movq -1(%2, %0), %%mm2 \n\t" // L
1492 "movq (%2, %0), %%mm3 \n\t" // X
1493 "movq %%mm2, %%mm4 \n\t" // L
1494 "psubb %%mm0, %%mm2 \n\t"
1495 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1496 "movq %%mm4, %%mm5 \n\t" // L
/* median(a,b,c) = max(min(a,b), min(max(a,b), c)) using pmaxub/pminub. */
1497 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1498 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1499 "pminub %%mm2, %%mm4 \n\t"
1500 "pmaxub %%mm1, %%mm4 \n\t"
1501 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1502 "movq %%mm3, (%3, %0) \n\t"
1507 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
/* First pixel: predict from the scalar left/left_top carried in from the
 * previous call; mid_pred is the same median as the vector path. */
1513 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
/* Export the rightmost top pixel as next row's left_top. */
1515 *left_top= src1[w-1];
/* DIFF_PIXELS_1: load 8 (or 16) pixels from p1 and p2, widen bytes to words
 * via the punpcklbw-with-self trick, and leave p1-p2 (scaled by 256) in "a".
 * DIFF_PIXELS_8 applies it to 8 rows, spilling one register through "temp"
 * because only 8 MMX/XMM registers are available.
 * The 4x8 variant works on mm registers (movd/movq loads), the 8x8 variant on
 * xmm registers (movq/movdqa loads). */
1519 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1520 "mov"#m" "#p1", "#a" \n\t"\
1521 "mov"#m" "#p2", "#t" \n\t"\
1522 "punpcklbw "#a", "#t" \n\t"\
1523 "punpcklbw "#a", "#a" \n\t"\
1524 "psubw "#t", "#a" \n\t"\
1526 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1527 uint8_t *p1b=p1, *p2b=p2;\
1529 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1530 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1531 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1534 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1535 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1536 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1537 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1538 "mov"#m1" "#mm"0, %0 \n\t"\
1539 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1540 "mov"#m1" %0, "#mm"0 \n\t"\
1541 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1542 : "r"((long)stride), "r"((long)stride*3)\
1545 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
1547 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1548 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
/* LBUTTERFLY2: two parallel butterflies, a += b; b = 2b - (a+b) = b - a_old,
 * i.e. (a, b) -> (a+b, b-a) — the basic step of the Hadamard transform. */
1550 #define LBUTTERFLY2(a1,b1,a2,b2)\
1551 "paddw " #b1 ", " #a1 " \n\t"\
1552 "paddw " #b2 ", " #a2 " \n\t"\
1553 "paddw " #b1 ", " #b1 " \n\t"\
1554 "paddw " #b2 ", " #b2 " \n\t"\
1555 "psubw " #a1 ", " #b1 " \n\t"\
1556 "psubw " #a2 ", " #b2 " \n\t"
/* HADAMARD8: 8-point Hadamard transform across registers m0..m7 as three
 * rounds of butterflies (stride 1, 2, 4). */
1558 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1559 LBUTTERFLY2(m0, m1, m2, m3)\
1560 LBUTTERFLY2(m4, m5, m6, m7)\
1561 LBUTTERFLY2(m0, m2, m1, m3)\
1562 LBUTTERFLY2(m4, m6, m5, m7)\
1563 LBUTTERFLY2(m0, m4, m1, m5)\
1564 LBUTTERFLY2(m2, m6, m3, m7)\
1566 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
/* MMABS_MMX: a = |a| per word using sign mask (z = a<0 ? -1 : 0; a = (a^z)-z).
 * Clobbers z. */
1568 #define MMABS_MMX(a,z)\
1569 "pxor " #z ", " #z " \n\t"\
1570 "pcmpgtw " #a ", " #z " \n\t"\
1571 "pxor " #z ", " #a " \n\t"\
1572 "psubw " #z ", " #a " \n\t"
/* MMABS_MMX2: |a| via pmaxsw(a, -a) — shorter on CPUs with pmaxsw. */
1574 #define MMABS_MMX2(a,z)\
1575 "pxor " #z ", " #z " \n\t"\
1576 "psubw " #a ", " #z " \n\t"\
1577 "pmaxsw " #z ", " #a " \n\t"
/* MMABS_SSSE3: single-instruction abs; z unused, kept for signature parity. */
1579 #define MMABS_SSSE3(a,z)\
1580 "pabsw " #a ", " #a " \n\t"
/* MMABS_SUM: |a| (via the currently selected MMABS) added into sum with
 * unsigned saturation. */
1582 #define MMABS_SUM(a,z, sum)\
1584 "paddusw " #a ", " #sum " \n\t"
/* Sum of |xmm0..xmm7| when two scratch regs (xmm8/xmm9) are available, i.e.
 * no spill to memory is needed — presumably the x86_64 path; verify against
 * the dropped #if guards. */
1586 #define MMABS_SUM_8x8_NOSPILL\
1587 MMABS(%%xmm0, %%xmm8)\
1588 MMABS(%%xmm1, %%xmm9)\
1589 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1590 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1591 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1592 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1593 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1594 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1595 "paddusw %%xmm1, %%xmm0 \n\t"
1598 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
/* 32-bit SSE2 fallback: only xmm0-7 exist, so xmm7 is spilled to (%1) and
 * reloaded after being used as scratch. */
1600 #define MMABS_SUM_8x8_SSE2\
1601 "movdqa %%xmm7, (%1) \n\t"\
1602 MMABS(%%xmm0, %%xmm7)\
1603 MMABS(%%xmm1, %%xmm7)\
1604 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1605 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1606 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1607 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1608 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1609 "movdqa (%1), %%xmm2 \n\t"\
1610 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1611 "paddusw %%xmm1, %%xmm0 \n\t"
/* LOAD4/STORE4: move four consecutive quadwords between registers and a
 * 32-byte-spaced slot at offset o of the buffer addressed by %1. */
1614 #define LOAD4(o, a, b, c, d)\
1615 "movq "#o"(%1), "#a" \n\t"\
1616 "movq "#o"+8(%1), "#b" \n\t"\
1617 "movq "#o"+16(%1), "#c" \n\t"\
1618 "movq "#o"+24(%1), "#d" \n\t"\
1620 #define STORE4(o, a, b, c, d)\
1621 "movq "#a", "#o"(%1) \n\t"\
1622 "movq "#b", "#o"+8(%1) \n\t"\
1623 "movq "#c", "#o"+16(%1) \n\t"\
1624 "movq "#d", "#o"+24(%1) \n\t"\
1626 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1627 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1628 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* HSUM_*: horizontal sum of the four (MMX) or eight (SSE2) words in "a" into
 * the 32-bit destination dst, using t as scratch.  Saturating adds — see the
 * FIXME above.  The MMX2/SSE2 variants use pshufw/movhlps shuffles instead of
 * shifts. */
1629 #define HSUM_MMX(a, t, dst)\
1630 "movq "#a", "#t" \n\t"\
1631 "psrlq $32, "#a" \n\t"\
1632 "paddusw "#t", "#a" \n\t"\
1633 "movq "#a", "#t" \n\t"\
1634 "psrlq $16, "#a" \n\t"\
1635 "paddusw "#t", "#a" \n\t"\
1636 "movd "#a", "#dst" \n\t"\
1638 #define HSUM_MMX2(a, t, dst)\
1639 "pshufw $0x0E, "#a", "#t" \n\t"\
1640 "paddusw "#t", "#a" \n\t"\
1641 "pshufw $0x01, "#a", "#t" \n\t"\
1642 "paddusw "#t", "#a" \n\t"\
1643 "movd "#a", "#dst" \n\t"\
1645 #define HSUM_SSE2(a, t, dst)\
1646 "movhlps "#a", "#t" \n\t"\
1647 "paddusw "#t", "#a" \n\t"\
1648 "pshuflw $0x0E, "#a", "#t" \n\t"\
1649 "paddusw "#t", "#a" \n\t"\
1650 "pshuflw $0x01, "#a", "#t" \n\t"\
1651 "paddusw "#t", "#a" \n\t"\
1652 "movd "#a", "#dst" \n\t"\
/* HADAMARD8_DIFF_MMX(cpu): emits hadamard8_diff_<cpu>, the SATD of an 8x8
 * block (sum of |coeffs| of the 2-D Hadamard transform of src1-src2).  With
 * only 8 MMX registers the 8x8 transform is done as two 4x8 halves that are
 * transposed and spilled through the aligned temp[] buffer.  MMABS/HSUM are
 * the per-CPU macros selected just before each instantiation below. */
1654 #define HADAMARD8_DIFF_MMX(cpu) \
1655 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1656 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1661 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1666 "movq %%mm7, 96(%1) \n\t"\
1668 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1669 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1671 "movq 96(%1), %%mm7 \n\t"\
1672 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1673 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1679 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1684 "movq %%mm7, 96(%1) \n\t"\
1686 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1687 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1689 "movq 96(%1), %%mm7 \n\t"\
1690 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1691 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1692 "movq %%mm6, %%mm7 \n\t"\
1693 "movq %%mm0, %%mm6 \n\t"\
1695 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1698 "movq %%mm7, 64(%1) \n\t"\
1699 MMABS(%%mm0, %%mm7)\
1700 MMABS(%%mm1, %%mm7)\
1701 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1702 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1703 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1704 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1705 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1706 "movq 64(%1), %%mm2 \n\t"\
1707 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1708 "paddusw %%mm1, %%mm0 \n\t"\
1709 "movq %%mm0, 64(%1) \n\t"\
1711 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1712 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1715 "movq %%mm7, (%1) \n\t"\
1716 MMABS(%%mm0, %%mm7)\
1717 MMABS(%%mm1, %%mm7)\
1718 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1719 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1720 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1721 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1722 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1723 "movq (%1), %%mm2 \n\t"\
1724 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1725 "paddusw 64(%1), %%mm0 \n\t"\
1726 "paddusw %%mm1, %%mm0 \n\t"\
1728 HSUM(%%mm0, %%mm1, %0)\
1735 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* SSE2 variant: the whole 8x8 block fits in xmm0-7, so no split/spill dance —
 * one Hadamard, one full transpose, one Hadamard, then abs-sum. */
1737 #define HADAMARD8_DIFF_SSE2(cpu) \
1738 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1739 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1744 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1747 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1748 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1749 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1751 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1757 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* Instantiations: select the per-CPU MMABS/HSUM building blocks, then stamp
 * out the mmx/mmx2/sse2/ssse3 versions (the #undef lines between selections
 * were lost in extraction — see the upstream file). */
1759 #define MMABS(a,z) MMABS_MMX(a,z)
1760 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1761 HADAMARD8_DIFF_MMX(mmx)
1765 #define MMABS(a,z) MMABS_MMX2(a,z)
1766 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1767 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1768 HADAMARD8_DIFF_MMX(mmx2)
1769 HADAMARD8_DIFF_SSE2(sse2)
1771 #undef MMABS_SUM_8x8
1775 #define MMABS(a,z) MMABS_SSSE3(a,z)
1776 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1777 HADAMARD8_DIFF_SSE2(ssse3)
1779 #undef MMABS_SUM_8x8
/* DCT_SAD4: load four rows of DCT coefficients (16 bytes apart) from offset o
 * and add their absolute values into the two running accumulators (mm/xmm 0
 * and 1) using the currently selected MMABS. */
1782 #define DCT_SAD4(m,mm,o)\
1783 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1784 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1785 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1786 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1787 MMABS_SUM(mm##2, mm##6, mm##0)\
1788 MMABS_SUM(mm##3, mm##7, mm##1)\
1789 MMABS_SUM(mm##4, mm##6, mm##0)\
1790 MMABS_SUM(mm##5, mm##7, mm##1)\
1792 #define DCT_SAD_MMX\
1793 "pxor %%mm0, %%mm0 \n\t"\
1794 "pxor %%mm1, %%mm1 \n\t"\
1795 DCT_SAD4(q, %%mm, 0)\
1796 DCT_SAD4(q, %%mm, 8)\
1797 DCT_SAD4(q, %%mm, 64)\
1798 DCT_SAD4(q, %%mm, 72)\
1799 "paddusw %%mm1, %%mm0 \n\t"\
1800 HSUM(%%mm0, %%mm1, %0)
/* SSE2 body: 16-byte loads cover the full row, so only two DCT_SAD4 passes. */
1802 #define DCT_SAD_SSE2\
1803 "pxor %%xmm0, %%xmm0 \n\t"\
1804 "pxor %%xmm1, %%xmm1 \n\t"\
1805 DCT_SAD4(dqa, %%xmm, 0)\
1806 DCT_SAD4(dqa, %%xmm, 64)\
1807 "paddusw %%xmm1, %%xmm0 \n\t"\
1808 HSUM(%%xmm0, %%xmm1, %0)
/* DCT_SAD_FUNC(cpu): emits sum_abs_dctelem_<cpu>(block) = sum |block[i]| over
 * the 64 DCT coefficients, using the DCT_SAD/HSUM/MMABS macros selected below
 * (selection #defines interleaved; the #undef lines were lost in extraction). */
1810 #define DCT_SAD_FUNC(cpu) \
1811 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1821 #define DCT_SAD DCT_SAD_MMX
1822 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1823 #define MMABS(a,z) MMABS_MMX(a,z)
1828 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1829 #define MMABS(a,z) MMABS_MMX2(a,z)
1834 #define DCT_SAD DCT_SAD_SSE2
1835 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1840 #define MMABS(a,z) MMABS_SSSE3(a,z)
/* Sum of squared differences between an int8 array and an int16 array
 * (used by e.g. snow residual coding): 8 elements per iteration. */
1847 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
/* mm4 = running dword accumulator. */
1851 "pxor %%mm4, %%mm4 \n"
1854 "movq (%2,%0), %%mm2 \n"
1855 "movq (%3,%0,2), %%mm0 \n"
1856 "movq 8(%3,%0,2), %%mm1 \n"
/* Sign-extend the 8 int8s to words: duplicate each byte into both halves of
 * a word (punpck with self), then arithmetic-shift right by 8. */
1857 "punpckhbw %%mm2, %%mm3 \n"
1858 "punpcklbw %%mm2, %%mm2 \n"
1859 "psraw $8, %%mm3 \n"
1860 "psraw $8, %%mm2 \n"
1861 "psubw %%mm3, %%mm1 \n"
1862 "psubw %%mm2, %%mm0 \n"
/* pmaddwd squares each word difference and pairwise-adds into dwords. */
1863 "pmaddwd %%mm1, %%mm1 \n"
1864 "pmaddwd %%mm0, %%mm0 \n"
1865 "paddd %%mm1, %%mm4 \n"
1866 "paddd %%mm0, %%mm4 \n"
/* Fold the two dwords of mm4 into one. */
1868 "movq %%mm4, %%mm3 \n"
1869 "psrlq $32, %%mm3 \n"
1870 "paddd %%mm3, %%mm4 \n"
1873 :"r"(pix1), "r"(pix2)
1878 #endif //CONFIG_ENCODERS
/* QPEL_V_LOW: one output row of the MPEG-4 quarter-pel vertical lowpass
 * filter, computing (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 from four
 * symmetric tap pairs already widened to words, then packing and writing the
 * result with OP (put or avg).  Clobbers mm4-mm6; OP uses mm7 as scratch. */
1880 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1881 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1882 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1883 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1884 "movq "#in7", " #m3 " \n\t" /* d */\
1885 "movq "#in0", %%mm5 \n\t" /* D */\
1886 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1887 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1888 "movq "#in1", %%mm5 \n\t" /* C */\
1889 "movq "#in2", %%mm6 \n\t" /* B */\
1890 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1891 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1892 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1893 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1894 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1895 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1896 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1897 "psraw $5, %%mm5 \n\t"\
1898 "packuswb %%mm5, %%mm5 \n\t"\
1899 OP(%%mm5, out, %%mm7, d)
/* QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW): stamps out the MPEG-4
 * quarter-pel horizontal lowpass filters — 16- and 8-pixel-wide variants for
 * MMX2 (pshufw-based edge mirroring, fully in registers) and for 3DNow!
 * (scalar C computes the 6-tap filter (a+b)*20 - (c+d)*6 + (e+f)*3 - (g+h)
 * into temp[], then MMX code only rounds, shifts and packs).  OP_* is the
 * put/avg writeback.  ROUNDER selects round-to-nearest vs no-rounding.
 * NOTE(review): several interior lines (loop labels, declarations, asm
 * statement openers/closers) were lost in extraction; the code below is kept
 * byte-identical to what survives. */
1901 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1902 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1906 "pxor %%mm7, %%mm7 \n\t"\
1908 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1909 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1910 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1911 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1912 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1913 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1914 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1915 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1916 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1917 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1918 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1919 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1920 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1921 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1922 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1923 "paddw %%mm3, %%mm5 \n\t" /* b */\
1924 "paddw %%mm2, %%mm6 \n\t" /* c */\
1925 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1926 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1927 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1928 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1929 "paddw %%mm4, %%mm0 \n\t" /* a */\
1930 "paddw %%mm1, %%mm5 \n\t" /* d */\
1931 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1932 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1933 "paddw %6, %%mm6 \n\t"\
1934 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1935 "psraw $5, %%mm0 \n\t"\
1936 "movq %%mm0, %5 \n\t"\
1937 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1939 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1940 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1941 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1942 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1943 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1944 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1945 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1946 "paddw %%mm0, %%mm2 \n\t" /* b */\
1947 "paddw %%mm5, %%mm3 \n\t" /* c */\
1948 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1949 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1950 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1951 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1952 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1953 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1954 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1955 "paddw %%mm2, %%mm1 \n\t" /* a */\
1956 "paddw %%mm6, %%mm4 \n\t" /* d */\
1957 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1958 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1959 "paddw %6, %%mm1 \n\t"\
1960 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1961 "psraw $5, %%mm3 \n\t"\
1962 "movq %5, %%mm1 \n\t"\
1963 "packuswb %%mm3, %%mm1 \n\t"\
1964 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1965 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1967 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1968 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1969 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1970 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1971 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1972 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1973 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1974 "paddw %%mm1, %%mm5 \n\t" /* b */\
1975 "paddw %%mm4, %%mm0 \n\t" /* c */\
1976 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1977 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1978 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1979 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1980 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1981 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1982 "paddw %%mm3, %%mm2 \n\t" /* d */\
1983 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1984 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1985 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1986 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1987 "paddw %%mm2, %%mm6 \n\t" /* a */\
1988 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1989 "paddw %6, %%mm0 \n\t"\
1990 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1991 "psraw $5, %%mm0 \n\t"\
1992 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1994 "paddw %%mm5, %%mm3 \n\t" /* a */\
1995 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1996 "paddw %%mm4, %%mm6 \n\t" /* b */\
1997 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1998 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1999 "paddw %%mm1, %%mm4 \n\t" /* c */\
2000 "paddw %%mm2, %%mm5 \n\t" /* d */\
2001 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2002 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2003 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2004 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2005 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2006 "paddw %6, %%mm4 \n\t"\
2007 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2008 "psraw $5, %%mm4 \n\t"\
2009 "packuswb %%mm4, %%mm0 \n\t"\
2010 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2016 : "+a"(src), "+c"(dst), "+m"(h)\
2017 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2022 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2025 /* quick HACK, XXX FIXME MUST be optimized */\
2028 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2029 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2030 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2031 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2032 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2033 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2034 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2035 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2036 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2037 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2038 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2039 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2040 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2041 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2042 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2043 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2045 "movq (%0), %%mm0 \n\t"\
2046 "movq 8(%0), %%mm1 \n\t"\
2047 "paddw %2, %%mm0 \n\t"\
2048 "paddw %2, %%mm1 \n\t"\
2049 "psraw $5, %%mm0 \n\t"\
2050 "psraw $5, %%mm1 \n\t"\
2051 "packuswb %%mm1, %%mm0 \n\t"\
2052 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2053 "movq 16(%0), %%mm0 \n\t"\
2054 "movq 24(%0), %%mm1 \n\t"\
2055 "paddw %2, %%mm0 \n\t"\
2056 "paddw %2, %%mm1 \n\t"\
2057 "psraw $5, %%mm0 \n\t"\
2058 "psraw $5, %%mm1 \n\t"\
2059 "packuswb %%mm1, %%mm0 \n\t"\
2060 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2061 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2069 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2073 "pxor %%mm7, %%mm7 \n\t"\
2075 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2076 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2077 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2078 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2079 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2080 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2081 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2082 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2083 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2084 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2085 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2086 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2087 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2088 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2089 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2090 "paddw %%mm3, %%mm5 \n\t" /* b */\
2091 "paddw %%mm2, %%mm6 \n\t" /* c */\
2092 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2093 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2094 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2095 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2096 "paddw %%mm4, %%mm0 \n\t" /* a */\
2097 "paddw %%mm1, %%mm5 \n\t" /* d */\
2098 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2099 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2100 "paddw %6, %%mm6 \n\t"\
2101 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2102 "psraw $5, %%mm0 \n\t"\
2103 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2105 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2106 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2107 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2108 "paddw %%mm5, %%mm1 \n\t" /* a */\
2109 "paddw %%mm6, %%mm2 \n\t" /* b */\
2110 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2111 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2112 "paddw %%mm6, %%mm3 \n\t" /* c */\
2113 "paddw %%mm5, %%mm4 \n\t" /* d */\
2114 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2115 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2116 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2117 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2118 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2119 "paddw %6, %%mm1 \n\t"\
2120 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2121 "psraw $5, %%mm3 \n\t"\
2122 "packuswb %%mm3, %%mm0 \n\t"\
2123 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2129 : "+a"(src), "+c"(dst), "+m"(h)\
2130 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2135 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2138 /* quick HACK, XXX FIXME MUST be optimized */\
2141 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2142 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2143 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2144 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2145 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2146 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2147 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2148 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2150 "movq (%0), %%mm0 \n\t"\
2151 "movq 8(%0), %%mm1 \n\t"\
2152 "paddw %2, %%mm0 \n\t"\
2153 "paddw %2, %%mm1 \n\t"\
2154 "psraw $5, %%mm0 \n\t"\
2155 "psraw $5, %%mm1 \n\t"\
2156 "packuswb %%mm1, %%mm0 \n\t"\
2157 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2158 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2166 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2168 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2169 uint64_t temp[17*4];\
2170 uint64_t *temp_ptr= temp;\
2175 "pxor %%mm7, %%mm7 \n\t"\
2177 "movq (%0), %%mm0 \n\t"\
2178 "movq (%0), %%mm1 \n\t"\
2179 "movq 8(%0), %%mm2 \n\t"\
2180 "movq 8(%0), %%mm3 \n\t"\
2181 "punpcklbw %%mm7, %%mm0 \n\t"\
2182 "punpckhbw %%mm7, %%mm1 \n\t"\
2183 "punpcklbw %%mm7, %%mm2 \n\t"\
2184 "punpckhbw %%mm7, %%mm3 \n\t"\
2185 "movq %%mm0, (%1) \n\t"\
2186 "movq %%mm1, 17*8(%1) \n\t"\
2187 "movq %%mm2, 2*17*8(%1) \n\t"\
2188 "movq %%mm3, 3*17*8(%1) \n\t"\
2193 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2194 : "r" ((long)srcStride)\
2201 /*FIXME reorder for speed */\
2203 /*"pxor %%mm7, %%mm7 \n\t"*/\
2205 "movq (%0), %%mm0 \n\t"\
2206 "movq 8(%0), %%mm1 \n\t"\
2207 "movq 16(%0), %%mm2 \n\t"\
2208 "movq 24(%0), %%mm3 \n\t"\
2209 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2210 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2212 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2214 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2216 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2217 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2219 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2220 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2222 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2223 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2225 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2226 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2228 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2230 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2232 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2233 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2235 "add $136, %0 \n\t"\
2240 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2241 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2246 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2247 uint64_t temp[9*2];\
2248 uint64_t *temp_ptr= temp;\
2253 "pxor %%mm7, %%mm7 \n\t"\
2255 "movq (%0), %%mm0 \n\t"\
2256 "movq (%0), %%mm1 \n\t"\
2257 "punpcklbw %%mm7, %%mm0 \n\t"\
2258 "punpckhbw %%mm7, %%mm1 \n\t"\
2259 "movq %%mm0, (%1) \n\t"\
2260 "movq %%mm1, 9*8(%1) \n\t"\
2265 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2266 : "r" ((long)srcStride)\
2273 /*FIXME reorder for speed */\
2275 /*"pxor %%mm7, %%mm7 \n\t"*/\
2277 "movq (%0), %%mm0 \n\t"\
2278 "movq 8(%0), %%mm1 \n\t"\
2279 "movq 16(%0), %%mm2 \n\t"\
2280 "movq 24(%0), %%mm3 \n\t"\
2281 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2282 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2284 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2286 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2288 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2290 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2292 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2293 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2300 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2301 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2306 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2307 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
2310 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2312 uint8_t * const half= (uint8_t*)temp;\
2313 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2314 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2317 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2318 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2321 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2323 uint8_t * const half= (uint8_t*)temp;\
2324 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2325 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2328 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2330 uint8_t * const half= (uint8_t*)temp;\
2331 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2332 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2335 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2336 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2339 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2341 uint8_t * const half= (uint8_t*)temp;\
2342 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2343 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2345 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2346 uint64_t half[8 + 9];\
2347 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2348 uint8_t * const halfHV= ((uint8_t*)half);\
2349 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2350 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2351 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2352 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2354 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2355 uint64_t half[8 + 9];\
2356 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2357 uint8_t * const halfHV= ((uint8_t*)half);\
2358 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2359 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2360 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2361 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2363 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2364 uint64_t half[8 + 9];\
2365 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2366 uint8_t * const halfHV= ((uint8_t*)half);\
2367 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2368 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2369 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2370 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2372 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2373 uint64_t half[8 + 9];\
2374 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2375 uint8_t * const halfHV= ((uint8_t*)half);\
2376 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2377 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2378 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2379 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2381 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2382 uint64_t half[8 + 9];\
2383 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2384 uint8_t * const halfHV= ((uint8_t*)half);\
2385 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2387 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2389 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2390 uint64_t half[8 + 9];\
2391 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2392 uint8_t * const halfHV= ((uint8_t*)half);\
2393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2394 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2395 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2397 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2398 uint64_t half[8 + 9];\
2399 uint8_t * const halfH= ((uint8_t*)half);\
2400 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2401 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2402 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2404 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2405 uint64_t half[8 + 9];\
2406 uint8_t * const halfH= ((uint8_t*)half);\
2407 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2408 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2409 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2411 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t * const halfH= ((uint8_t*)half);\
2414 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2415 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2417 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2418 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
2421 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2423 uint8_t * const half= (uint8_t*)temp;\
2424 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2425 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2428 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2429 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2432 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2434 uint8_t * const half= (uint8_t*)temp;\
2435 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2436 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2439 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2441 uint8_t * const half= (uint8_t*)temp;\
2442 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2443 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2446 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2447 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2450 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2452 uint8_t * const half= (uint8_t*)temp;\
2453 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2454 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2456 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2457 uint64_t half[16*2 + 17*2];\
2458 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2459 uint8_t * const halfHV= ((uint8_t*)half);\
2460 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2461 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2462 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2463 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2465 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2466 uint64_t half[16*2 + 17*2];\
2467 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2468 uint8_t * const halfHV= ((uint8_t*)half);\
2469 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2470 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2471 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2472 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2474 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2475 uint64_t half[16*2 + 17*2];\
2476 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2477 uint8_t * const halfHV= ((uint8_t*)half);\
2478 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2479 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2480 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2481 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2483 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2484 uint64_t half[16*2 + 17*2];\
2485 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2486 uint8_t * const halfHV= ((uint8_t*)half);\
2487 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2488 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2489 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2490 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2492 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2493 uint64_t half[16*2 + 17*2];\
2494 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2495 uint8_t * const halfHV= ((uint8_t*)half);\
2496 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2498 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2500 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2501 uint64_t half[16*2 + 17*2];\
2502 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2503 uint8_t * const halfHV= ((uint8_t*)half);\
2504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2505 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2506 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2508 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2509 uint64_t half[17*2];\
2510 uint8_t * const halfH= ((uint8_t*)half);\
2511 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2512 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2513 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2515 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2516 uint64_t half[17*2];\
2517 uint8_t * const halfH= ((uint8_t*)half);\
2518 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2519 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2520 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2522 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2523 uint64_t half[17*2];\
2524 uint8_t * const halfH= ((uint8_t*)half);\
2525 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2526 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store macros used as the OP parameter of the qpel templates.
 * PUT_OP: plain store of register a to memory operand b with a mov of
 * the given size (temp is unused). */
2529 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG_3DNOW_OP: load b, byte-average into a with 3DNow! pavgusb, store back. */
2530 #define AVG_3DNOW_OP(a,b,temp, size) \
2531 "mov" #size " " #b ", " #temp " \n\t"\
2532 "pavgusb " #temp ", " #a " \n\t"\
2533 "mov" #size " " #a ", " #b " \n\t"
/* AVG_MMX2_OP: load b, byte-average into a with MMX2 pavgb, store back. */
2534 #define AVG_MMX2_OP(a,b,temp, size) \
2535 "mov" #size " " #b ", " #temp " \n\t"\
2536 "pavgb " #temp ", " #a " \n\t"\
2537 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the MPEG-4 quarter-pel MC families from the QPEL_BASE /
 * QPEL_OP templates: shared helpers first, then put/avg/put_no_rnd
 * variants for 3DNow! and MMX2.  The rounder constant selects rounding
 * behaviour: ff_pw_16 for the rounded, ff_pw_15 for the no-round variants. */
2539 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2540 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2541 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2542 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2543 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2544 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2545 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2546 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2547 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2549 /***********************************/
2550 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
2552 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2553 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2554 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2556 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2557 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2558 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2561 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2562 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2563 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2564 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2565 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2566 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2567 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2568 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2569 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2570 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2571 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2572 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2574 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2575 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2577 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2578 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2579 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2580 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2581 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2582 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2583 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2584 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the fast 2-tap (bilinear) qpel approximations for both
 * block sizes and both put/avg modes, on MMX2 and 3DNow!.  These are
 * not spec-compliant; only used with "-lavdopts fast" (see above). */
2586 QPEL_2TAP(put_, 16, mmx2)
2587 QPEL_2TAP(avg_, 16, mmx2)
2588 QPEL_2TAP(put_, 8, mmx2)
2589 QPEL_2TAP(avg_, 8, mmx2)
2590 QPEL_2TAP(put_, 16, 3dnow)
2591 QPEL_2TAP(avg_, 16, 3dnow)
2592 QPEL_2TAP(put_, 8, 3dnow)
2593 QPEL_2TAP(avg_, 8, 3dnow)
/* No-op placeholder function; presumably assigned (via cast) to DSP
 * function pointers to disable routines, e.g. for benchmarking —
 * TODO confirm usage sites.  Use a proper (void) prototype instead of
 * the K&R-style empty parameter list. */
static void just_return(void) { return; }
2600 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2601 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2603 const int ix = ox>>(16+shift);
2604 const int iy = oy>>(16+shift);
2605 const int oxs = ox>>4;
2606 const int oys = oy>>4;
2607 const int dxxs = dxx>>4;
2608 const int dxys = dxy>>4;
2609 const int dyxs = dyx>>4;
2610 const int dyys = dyy>>4;
2611 const uint16_t r4[4] = {r,r,r,r};
2612 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2613 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2614 const uint64_t shift2 = 2*shift;
2615 uint8_t edge_buf[(h+1)*stride];
2618 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2619 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2620 const int dxh = dxy*(h-1);
2621 const int dyw = dyx*(w-1);
2622 if( // non-constant fullpel offset (3% of blocks)
2623 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
2624 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
2625 // uses more than 16 bits of subpel mv (only at huge resolution)
2626 || (dxx|dxy|dyx|dyy)&15 )
2628 //FIXME could still use mmx for some of the rows
2629 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2633 src += ix + iy*stride;
2634 if( (unsigned)ix >= width-w ||
2635 (unsigned)iy >= height-h )
2637 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2642 "movd %0, %%mm6 \n\t"
2643 "pxor %%mm7, %%mm7 \n\t"
2644 "punpcklwd %%mm6, %%mm6 \n\t"
2645 "punpcklwd %%mm6, %%mm6 \n\t"
2649 for(x=0; x<w; x+=4){
2650 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2651 oxs - dxys + dxxs*(x+1),
2652 oxs - dxys + dxxs*(x+2),
2653 oxs - dxys + dxxs*(x+3) };
2654 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2655 oys - dyys + dyxs*(x+1),
2656 oys - dyys + dyxs*(x+2),
2657 oys - dyys + dyxs*(x+3) };
2661 "movq %0, %%mm4 \n\t"
2662 "movq %1, %%mm5 \n\t"
2663 "paddw %2, %%mm4 \n\t"
2664 "paddw %3, %%mm5 \n\t"
2665 "movq %%mm4, %0 \n\t"
2666 "movq %%mm5, %1 \n\t"
2667 "psrlw $12, %%mm4 \n\t"
2668 "psrlw $12, %%mm5 \n\t"
2669 : "+m"(*dx4), "+m"(*dy4)
2670 : "m"(*dxy4), "m"(*dyy4)
2674 "movq %%mm6, %%mm2 \n\t"
2675 "movq %%mm6, %%mm1 \n\t"
2676 "psubw %%mm4, %%mm2 \n\t"
2677 "psubw %%mm5, %%mm1 \n\t"
2678 "movq %%mm2, %%mm0 \n\t"
2679 "movq %%mm4, %%mm3 \n\t"
2680 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2681 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2682 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2683 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2685 "movd %4, %%mm5 \n\t"
2686 "movd %3, %%mm4 \n\t"
2687 "punpcklbw %%mm7, %%mm5 \n\t"
2688 "punpcklbw %%mm7, %%mm4 \n\t"
2689 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2690 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2692 "movd %2, %%mm5 \n\t"
2693 "movd %1, %%mm4 \n\t"
2694 "punpcklbw %%mm7, %%mm5 \n\t"
2695 "punpcklbw %%mm7, %%mm4 \n\t"
2696 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2697 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2698 "paddw %5, %%mm1 \n\t"
2699 "paddw %%mm3, %%mm2 \n\t"
2700 "paddw %%mm1, %%mm0 \n\t"
2701 "paddw %%mm2, %%mm0 \n\t"
2703 "psrlw %6, %%mm0 \n\t"
2704 "packuswb %%mm0, %%mm0 \n\t"
2705 "movd %%mm0, %0 \n\t"
2707 : "=m"(dst[x+y*stride])
2708 : "m"(src[0]), "m"(src[1]),
2709 "m"(src[stride]), "m"(src[stride+1]),
2710 "m"(*r4), "m"(shift2)
2718 #ifdef CONFIG_ENCODERS
/* Horizontal 32-bit add within one MMX register: after expansion,
 * the low dword of a holds loword(a) + hiword(a); t is clobbered
 * as scratch. */
2720 #define PHADDD(a, t)\
2721 "movq "#a", "#t" \n\t"\
2722 "psrlq $32, "#a" \n\t"\
2723 "paddd "#t", "#a" \n\t"
2725 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2726 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2727 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
/* Rounded high-word multiply of two registers x,y by scale s, emulated
 * on plain MMX: pmulhw keeps bits [16..31], then the offset o and the
 * arithmetic shift by 1 add the rounding that pmulhrw would do in
 * hardware (see the pmulhw/pmulhrw comparison above). */
2729 #define PMULHRW(x, y, s, o)\
2730 "pmulhw " #s ", "#x " \n\t"\
2731 "pmulhw " #s ", "#y " \n\t"\
2732 "paddw " #o ", "#x " \n\t"\
2733 "paddw " #o ", "#y " \n\t"\
2734 "psraw $1, "#x " \n\t"\
2735 "psraw $1, "#y " \n\t"
/* MMX flavour of the QNS templates: set the name suffix, rounding
 * constant and scale offset, then pull in the shared template body. */
2736 #define DEF(x) x ## _mmx
2737 #define SET_RND MOVQ_WONE
2738 #define SCALE_OFFSET 1
2740 #include "dsputil_mmx_qns.h"
/* 3DNow! flavour of the QNS templates: pmulhrw rounds in hardware, so
 * PMULHRW needs no offset/shift and SCALE_OFFSET is 0.
 * NOTE(review): the matching #undef/SET_RND lines are not visible in
 * this extraction — verify against the original file. */
2747 #define DEF(x) x ## _3dnow
2749 #define SCALE_OFFSET 0
2750 #define PMULHRW(x, y, s, o)\
2751 "pmulhrw " #s ", "#x " \n\t"\
2752 "pmulhrw " #s ", "#y " \n\t"
2754 #include "dsputil_mmx_qns.h"
/* SSSE3 flavour of the QNS templates: pmulhrsw does the rounded multiply
 * directly; PHADDD is replaced by a pshufw+paddd pair (comment below
 * says it beats phaddd on Core 2).
 * NOTE(review): the matching #undef lines are not visible in this
 * extraction — verify against the original file. */
2763 #define DEF(x) x ## _ssse3
2765 #define SCALE_OFFSET -1
2766 #define PHADDD(a, t)\
2767 "pshufw $0x0E, "#a", "#t" \n\t"\
2768 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
2769 #define PMULHRW(x, y, s, o)\
2770 "pmulhrsw " #s ", "#x " \n\t"\
2771 "pmulhrsw " #s ", "#y " \n\t"
2773 #include "dsputil_mmx_qns.h"
2782 #endif /* CONFIG_ENCODERS */
/* PREFETCH(name, op): generates a helper that issues the given prefetch
 * instruction over the rows of a block (mem, stride, h); instantiated
 * below for MMX2 (prefetcht0) and 3DNow! (prefetch).
 * NOTE(review): the row loop lines appear truncated in this extraction —
 * verify against the original file. */
2784 #define PREFETCH(name, op) \
2785 static void name(void *mem, int stride, int h){\
2786 const uint8_t *p= mem;\
2788 asm volatile(#op" %0" :: "m"(*p));\
2792 PREFETCH(prefetch_mmx2, prefetcht0)
2793 PREFETCH(prefetch_3dnow, prefetch)
2796 #include "h264dsp_mmx.c"
2799 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
/* CAVS fullpel (mc00) put for 8x8 blocks: a plain 8x8 pixel copy.
 * (Restores the closing brace lost in extraction.) */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
/* CAVS fullpel (mc00) avg for 8x8 blocks: 8x8 pixel average into dst.
 * (Restores the closing brace lost in extraction.) */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
/* CAVS fullpel (mc00) put for 16x16 blocks: a plain 16x16 pixel copy.
 * (Restores the closing brace lost in extraction.) */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
/* CAVS fullpel (mc00) avg for 16x16 blocks: 16x16 pixel average into dst.
 * (Restores the closing brace lost in extraction.) */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}
2815 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
2819 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
/* VC-1 fullpel (mc00) put: a plain 8x8 pixel copy.  The rnd parameter is
 * unused here since no interpolation (hence no rounding) takes place.
 * (Restores the closing brace lost in extraction.) */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
2825 /* external functions, from idct_mmx.c */
2826 void ff_mmx_idct(DCTELEM *block);
2827 void ff_mmxext_idct(DCTELEM *block);
2829 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2832 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2834 ff_mmx_idct (block);
2835 put_pixels_clamped_mmx(block, dest, line_size);
2837 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2839 ff_mmx_idct (block);
2840 add_pixels_clamped_mmx(block, dest, line_size);
2842 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2844 ff_mmxext_idct (block);
2845 put_pixels_clamped_mmx(block, dest, line_size);
2847 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2849 ff_mmxext_idct (block);
2850 add_pixels_clamped_mmx(block, dest, line_size);
2853 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2855 ff_idct_xvid_mmx (block);
2856 put_pixels_clamped_mmx(block, dest, line_size);
2858 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2860 ff_idct_xvid_mmx (block);
2861 add_pixels_clamped_mmx(block, dest, line_size);
2863 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2865 ff_idct_xvid_mmx2 (block);
2866 put_pixels_clamped_mmx(block, dest, line_size);
2868 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2870 ff_idct_xvid_mmx2 (block);
2871 add_pixels_clamped_mmx(block, dest, line_size);
2874 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2877 asm volatile("pxor %%mm7, %%mm7":);
2878 for(i=0; i<blocksize; i+=2) {
2880 "movq %0, %%mm0 \n\t"
2881 "movq %1, %%mm1 \n\t"
2882 "movq %%mm0, %%mm2 \n\t"
2883 "movq %%mm1, %%mm3 \n\t"
2884 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2885 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2886 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2887 "pxor %%mm2, %%mm1 \n\t"
2888 "movq %%mm3, %%mm4 \n\t"
2889 "pand %%mm1, %%mm3 \n\t"
2890 "pandn %%mm1, %%mm4 \n\t"
2891 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2892 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2893 "movq %%mm3, %1 \n\t"
2894 "movq %%mm0, %0 \n\t"
2895 :"+m"(mag[i]), "+m"(ang[i])
2899 asm volatile("femms");
2901 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2906 "movaps %0, %%xmm5 \n\t"
2907 ::"m"(ff_pdw_80000000[0])
2909 for(i=0; i<blocksize; i+=4) {
2911 "movaps %0, %%xmm0 \n\t"
2912 "movaps %1, %%xmm1 \n\t"
2913 "xorps %%xmm2, %%xmm2 \n\t"
2914 "xorps %%xmm3, %%xmm3 \n\t"
2915 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2916 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2917 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2918 "xorps %%xmm2, %%xmm1 \n\t"
2919 "movaps %%xmm3, %%xmm4 \n\t"
2920 "andps %%xmm1, %%xmm3 \n\t"
2921 "andnps %%xmm1, %%xmm4 \n\t"
2922 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2923 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2924 "movaps %%xmm3, %1 \n\t"
2925 "movaps %%xmm0, %0 \n\t"
2926 :"+m"(mag[i]), "+m"(ang[i])
2932 static void vector_fmul_3dnow(float *dst, const float *src, int len){
2936 "movq (%1,%0), %%mm0 \n\t"
2937 "movq 8(%1,%0), %%mm1 \n\t"
2938 "pfmul (%2,%0), %%mm0 \n\t"
2939 "pfmul 8(%2,%0), %%mm1 \n\t"
2940 "movq %%mm0, (%1,%0) \n\t"
2941 "movq %%mm1, 8(%1,%0) \n\t"
2950 static void vector_fmul_sse(float *dst, const float *src, int len){
2954 "movaps (%1,%0), %%xmm0 \n\t"
2955 "movaps 16(%1,%0), %%xmm1 \n\t"
2956 "mulps (%2,%0), %%xmm0 \n\t"
2957 "mulps 16(%2,%0), %%xmm1 \n\t"
2958 "movaps %%xmm0, (%1,%0) \n\t"
2959 "movaps %%xmm1, 16(%1,%0) \n\t"
2968 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2972 "pswapd 8(%1), %%mm0 \n\t"
2973 "pswapd (%1), %%mm1 \n\t"
2974 "pfmul (%3,%0), %%mm0 \n\t"
2975 "pfmul 8(%3,%0), %%mm1 \n\t"
2976 "movq %%mm0, (%2,%0) \n\t"
2977 "movq %%mm1, 8(%2,%0) \n\t"
2981 :"+r"(i), "+r"(src1)
2982 :"r"(dst), "r"(src0)
2984 asm volatile("femms");
2986 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2990 "movaps 16(%1), %%xmm0 \n\t"
2991 "movaps (%1), %%xmm1 \n\t"
2992 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2993 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2994 "mulps (%3,%0), %%xmm0 \n\t"
2995 "mulps 16(%3,%0), %%xmm1 \n\t"
2996 "movaps %%xmm0, (%2,%0) \n\t"
2997 "movaps %%xmm1, 16(%2,%0) \n\t"
3001 :"+r"(i), "+r"(src1)
3002 :"r"(dst), "r"(src0)
3006 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
3007 const float *src2, int src3, int len, int step){
3009 if(step == 2 && src3 == 0){
3013 "movq (%2,%0), %%mm0 \n\t"
3014 "movq 8(%2,%0), %%mm1 \n\t"
3015 "pfmul (%3,%0), %%mm0 \n\t"
3016 "pfmul 8(%3,%0), %%mm1 \n\t"
3017 "pfadd (%4,%0), %%mm0 \n\t"
3018 "pfadd 8(%4,%0), %%mm1 \n\t"
3019 "movd %%mm0, (%1) \n\t"
3020 "movd %%mm1, 16(%1) \n\t"
3021 "psrlq $32, %%mm0 \n\t"
3022 "psrlq $32, %%mm1 \n\t"
3023 "movd %%mm0, 8(%1) \n\t"
3024 "movd %%mm1, 24(%1) \n\t"
3029 :"r"(src0), "r"(src1), "r"(src2)
3033 else if(step == 1 && src3 == 0){
3036 "movq (%2,%0), %%mm0 \n\t"
3037 "movq 8(%2,%0), %%mm1 \n\t"
3038 "pfmul (%3,%0), %%mm0 \n\t"
3039 "pfmul 8(%3,%0), %%mm1 \n\t"
3040 "pfadd (%4,%0), %%mm0 \n\t"
3041 "pfadd 8(%4,%0), %%mm1 \n\t"
3042 "movq %%mm0, (%1,%0) \n\t"
3043 "movq %%mm1, 8(%1,%0) \n\t"
3047 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3052 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3053 asm volatile("femms");
3055 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3056 const float *src2, int src3, int len, int step){
3058 if(step == 2 && src3 == 0){
3062 "movaps (%2,%0), %%xmm0 \n\t"
3063 "movaps 16(%2,%0), %%xmm1 \n\t"
3064 "mulps (%3,%0), %%xmm0 \n\t"
3065 "mulps 16(%3,%0), %%xmm1 \n\t"
3066 "addps (%4,%0), %%xmm0 \n\t"
3067 "addps 16(%4,%0), %%xmm1 \n\t"
3068 "movss %%xmm0, (%1) \n\t"
3069 "movss %%xmm1, 32(%1) \n\t"
3070 "movhlps %%xmm0, %%xmm2 \n\t"
3071 "movhlps %%xmm1, %%xmm3 \n\t"
3072 "movss %%xmm2, 16(%1) \n\t"
3073 "movss %%xmm3, 48(%1) \n\t"
3074 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
3075 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
3076 "movss %%xmm0, 8(%1) \n\t"
3077 "movss %%xmm1, 40(%1) \n\t"
3078 "movhlps %%xmm0, %%xmm2 \n\t"
3079 "movhlps %%xmm1, %%xmm3 \n\t"
3080 "movss %%xmm2, 24(%1) \n\t"
3081 "movss %%xmm3, 56(%1) \n\t"
3086 :"r"(src0), "r"(src1), "r"(src2)
3090 else if(step == 1 && src3 == 0){
3093 "movaps (%2,%0), %%xmm0 \n\t"
3094 "movaps 16(%2,%0), %%xmm1 \n\t"
3095 "mulps (%3,%0), %%xmm0 \n\t"
3096 "mulps 16(%3,%0), %%xmm1 \n\t"
3097 "addps (%4,%0), %%xmm0 \n\t"
3098 "addps 16(%4,%0), %%xmm1 \n\t"
3099 "movaps %%xmm0, (%1,%0) \n\t"
3100 "movaps %%xmm1, 16(%1,%0) \n\t"
3104 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3109 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3112 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3113 // not bit-exact: pf2id uses different rounding than C and SSE
3115 for(i=0; i<len; i+=4) {
3117 "pf2id %1, %%mm0 \n\t"
3118 "pf2id %2, %%mm1 \n\t"
3119 "packssdw %%mm1, %%mm0 \n\t"
3120 "movq %%mm0, %0 \n\t"
3122 :"m"(src[i]), "m"(src[i+2])
3125 asm volatile("femms");
3127 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3129 for(i=0; i<len; i+=4) {
3131 "cvtps2pi %1, %%mm0 \n\t"
3132 "cvtps2pi %2, %%mm1 \n\t"
3133 "packssdw %%mm1, %%mm0 \n\t"
3134 "movq %%mm0, %0 \n\t"
3136 :"m"(src[i]), "m"(src[i+2])
3139 asm volatile("emms");
3142 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
3143 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
3144 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
3145 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
3146 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3147 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3148 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3149 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3151 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3153 mm_flags = mm_support();
3155 if (avctx->dsp_mask) {
3156 if (avctx->dsp_mask & FF_MM_FORCE)
3157 mm_flags |= (avctx->dsp_mask & 0xffff);
3159 mm_flags &= ~(avctx->dsp_mask & 0xffff);
3163 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3164 if (mm_flags & MM_MMX)
3165 av_log(avctx, AV_LOG_INFO, " mmx");
3166 if (mm_flags & MM_MMXEXT)
3167 av_log(avctx, AV_LOG_INFO, " mmxext");
3168 if (mm_flags & MM_3DNOW)
3169 av_log(avctx, AV_LOG_INFO, " 3dnow");
3170 if (mm_flags & MM_SSE)
3171 av_log(avctx, AV_LOG_INFO, " sse");
3172 if (mm_flags & MM_SSE2)
3173 av_log(avctx, AV_LOG_INFO, " sse2");
3174 av_log(avctx, AV_LOG_INFO, "\n");
3177 if (mm_flags & MM_MMX) {
3178 const int idct_algo= avctx->idct_algo;
3180 #ifdef CONFIG_ENCODERS
3181 const int dct_algo = avctx->dct_algo;
3182 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3183 if(mm_flags & MM_SSE2){
3184 c->fdct = ff_fdct_sse2;
3185 }else if(mm_flags & MM_MMXEXT){
3186 c->fdct = ff_fdct_mmx2;
3188 c->fdct = ff_fdct_mmx;
3191 #endif //CONFIG_ENCODERS
3192 if(avctx->lowres==0){
3193 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3194 c->idct_put= ff_simple_idct_put_mmx;
3195 c->idct_add= ff_simple_idct_add_mmx;
3196 c->idct = ff_simple_idct_mmx;
3197 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3199 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
3200 if(mm_flags & MM_MMXEXT){
3201 c->idct_put= ff_libmpeg2mmx2_idct_put;
3202 c->idct_add= ff_libmpeg2mmx2_idct_add;
3203 c->idct = ff_mmxext_idct;
3205 c->idct_put= ff_libmpeg2mmx_idct_put;
3206 c->idct_add= ff_libmpeg2mmx_idct_add;
3207 c->idct = ff_mmx_idct;
3209 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3211 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
3212 idct_algo==FF_IDCT_VP3 &&
3213 avctx->codec->id!=CODEC_ID_THEORA &&
3214 !(avctx->flags & CODEC_FLAG_BITEXACT)){
3215 if(mm_flags & MM_SSE2){
3216 c->idct_put= ff_vp3_idct_put_sse2;
3217 c->idct_add= ff_vp3_idct_add_sse2;
3218 c->idct = ff_vp3_idct_sse2;
3219 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3221 ff_vp3_dsp_init_mmx();
3222 c->idct_put= ff_vp3_idct_put_mmx;
3223 c->idct_add= ff_vp3_idct_add_mmx;
3224 c->idct = ff_vp3_idct_mmx;
3225 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
3227 }else if(idct_algo==FF_IDCT_CAVS){
3228 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3229 }else if(idct_algo==FF_IDCT_XVIDMMX){
3230 if(mm_flags & MM_MMXEXT){
3231 c->idct_put= ff_idct_xvid_mmx2_put;
3232 c->idct_add= ff_idct_xvid_mmx2_add;
3233 c->idct = ff_idct_xvid_mmx2;
3235 c->idct_put= ff_idct_xvid_mmx_put;
3236 c->idct_add= ff_idct_xvid_mmx_add;
3237 c->idct = ff_idct_xvid_mmx;
3242 #ifdef CONFIG_ENCODERS
3243 c->get_pixels = get_pixels_mmx;
3244 c->diff_pixels = diff_pixels_mmx;
3245 #endif //CONFIG_ENCODERS
3246 c->put_pixels_clamped = put_pixels_clamped_mmx;
3247 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
3248 c->add_pixels_clamped = add_pixels_clamped_mmx;
3249 c->clear_blocks = clear_blocks_mmx;
3250 #ifdef CONFIG_ENCODERS
3251 c->pix_sum = pix_sum16_mmx;
3252 #endif //CONFIG_ENCODERS
3254 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
3255 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
3256 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
3257 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
3258 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
3260 SET_HPEL_FUNCS(put, 0, 16, mmx);
3261 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
3262 SET_HPEL_FUNCS(avg, 0, 16, mmx);
3263 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
3264 SET_HPEL_FUNCS(put, 1, 8, mmx);
3265 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
3266 SET_HPEL_FUNCS(avg, 1, 8, mmx);
3267 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
3271 c->add_bytes= add_bytes_mmx;
3272 #ifdef CONFIG_ENCODERS
3273 c->diff_bytes= diff_bytes_mmx;
3274 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
3276 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
3277 c->hadamard8_diff[1]= hadamard8_diff_mmx;
3279 c->pix_norm1 = pix_norm1_mmx;
3280 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
3281 c->sse[1] = sse8_mmx;
3282 c->vsad[4]= vsad_intra16_mmx;
3284 c->nsse[0] = nsse16_mmx;
3285 c->nsse[1] = nsse8_mmx;
3286 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3287 c->vsad[0] = vsad16_mmx;
3290 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3291 c->try_8x8basis= try_8x8basis_mmx;
3293 c->add_8x8basis= add_8x8basis_mmx;
3295 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
3297 #endif //CONFIG_ENCODERS
3299 if (ENABLE_ANY_H263) {
3300 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
3301 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
3303 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
3304 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
3305 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
3307 c->h264_idct_dc_add=
3308 c->h264_idct_add= ff_h264_idct_add_mmx;
3309 c->h264_idct8_dc_add=
3310 c->h264_idct8_add= ff_h264_idct8_add_mmx;
3311 if (mm_flags & MM_SSE2)
3312 c->h264_idct8_add= ff_h264_idct8_add_sse2;
3314 if (mm_flags & MM_MMXEXT) {
3315 c->prefetch = prefetch_mmx2;
3317 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
3318 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
3320 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
3321 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
3322 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
3324 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
3325 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
3327 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
3328 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
3329 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
3331 #ifdef CONFIG_ENCODERS
3332 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
3333 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
3334 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
3335 c->vsad[4]= vsad_intra16_mmx2;
3336 #endif //CONFIG_ENCODERS
3338 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
3339 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
3341 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3342 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
3343 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
3344 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
3345 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
3346 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
3347 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3348 #ifdef CONFIG_ENCODERS
3349 c->vsad[0] = vsad16_mmx2;
3350 #endif //CONFIG_ENCODERS
/* Fill one row of a quarter-pel motion-compensation function table.
 * The sixteen slots map index (x + 4*y) to the mc<x><y>_ function,
 * where x is the horizontal and y the vertical quarter-pel phase
 * (0..3), as the mc10/mc20/mc30 vs. mc01/mc02/mc03 ordering below
 * shows.  PFX selects the table (put_qpel, avg_h264_qpel, ...), IDX
 * the block-size row, CPU the optimised flavour suffix.
 * Note: no trailing semicolon — each invocation supplies its own. */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
3371 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
3372 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
3373 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
3374 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
3375 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
3376 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
3378 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
3379 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
3380 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
3381 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
3382 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
3383 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
3385 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
3386 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
3387 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
3388 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
3390 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
3391 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3392 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
3393 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
3394 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
3395 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
3396 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
3397 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
3398 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
3399 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3400 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
3402 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3403 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3404 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3405 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3406 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3407 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3408 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3409 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3411 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3412 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3413 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3414 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3415 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3416 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3417 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3418 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3420 if (ENABLE_CAVS_DECODER)
3421 ff_cavsdsp_init_mmx2(c, avctx);
3423 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
3424 ff_vc1dsp_init_mmx(c, avctx);
3426 #ifdef CONFIG_ENCODERS
3427 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3428 #endif //CONFIG_ENCODERS
3429 } else if (mm_flags & MM_3DNOW) {
3430 c->prefetch = prefetch_3dnow;
3432 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
3433 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
3435 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
3436 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
3437 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
3439 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
3440 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
3442 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
3443 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
3444 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
3446 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3447 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
3448 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
3449 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
3450 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
3451 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
3452 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
3455 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
3456 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
3457 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
3458 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
3459 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
3460 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
3462 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
3463 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
3464 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
3465 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
3466 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
3467 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
3469 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
3470 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
3471 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
3472 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
3474 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
3475 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
3478 #ifdef CONFIG_ENCODERS
3479 if(mm_flags & MM_SSE2){
3480 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3481 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3482 c->hadamard8_diff[1]= hadamard8_diff_sse2;
3483 if (ENABLE_FLAC_ENCODER)
3484 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
3488 if(mm_flags & MM_SSSE3){
3489 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3490 c->try_8x8basis= try_8x8basis_ssse3;
3492 c->add_8x8basis= add_8x8basis_ssse3;
3493 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
3494 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
3495 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
3500 #ifdef CONFIG_SNOW_DECODER
3501 if(mm_flags & MM_SSE2 & 0){
3502 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3504 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3506 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3509 if(mm_flags & MM_MMXEXT){
3510 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3512 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3515 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3519 if(mm_flags & MM_3DNOW){
3520 #ifdef CONFIG_ENCODERS
3521 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3522 c->try_8x8basis= try_8x8basis_3dnow;
3524 c->add_8x8basis= add_8x8basis_3dnow;
3525 #endif //CONFIG_ENCODERS
3526 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
3527 c->vector_fmul = vector_fmul_3dnow;
3528 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
3529 c->float_to_int16 = float_to_int16_3dnow;
3531 if(mm_flags & MM_3DNOWEXT)
3532 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
3533 if(mm_flags & MM_SSE){
3534 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3535 c->vector_fmul = vector_fmul_sse;
3536 c->float_to_int16 = float_to_int16_sse;
3537 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3538 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3540 if(mm_flags & MM_3DNOW)
3541 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
3544 #ifdef CONFIG_ENCODERS
3545 dsputil_init_pix_mmx(c, avctx);
3546 #endif //CONFIG_ENCODERS
3548 // for speed testing
3549 get_pixels = just_return;
3550 put_pixels_clamped = just_return;
3551 add_pixels_clamped = just_return;
3553 pix_abs16x16 = just_return;
3554 pix_abs16x16_x2 = just_return;
3555 pix_abs16x16_y2 = just_return;
3556 pix_abs16x16_xy2 = just_return;
3558 put_pixels_tab[0] = just_return;
3559 put_pixels_tab[1] = just_return;
3560 put_pixels_tab[2] = just_return;
3561 put_pixels_tab[3] = just_return;
3563 put_no_rnd_pixels_tab[0] = just_return;
3564 put_no_rnd_pixels_tab[1] = just_return;
3565 put_no_rnd_pixels_tab[2] = just_return;
3566 put_no_rnd_pixels_tab[3] = just_return;
3568 avg_pixels_tab[0] = just_return;
3569 avg_pixels_tab[1] = just_return;
3570 avg_pixels_tab[2] = just_return;
3571 avg_pixels_tab[3] = just_return;
3573 avg_no_rnd_pixels_tab[0] = just_return;
3574 avg_no_rnd_pixels_tab[1] = just_return;
3575 avg_no_rnd_pixels_tab[2] = just_return;
3576 avg_no_rnd_pixels_tab[3] = just_return;
3578 //av_fdct = just_return;
3579 //ff_idct = just_return;