2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
26 #include "dsputil_mmx.h"
27 #include "simple_idct.h"
28 #include "mpegvideo.h"
31 #include "vp3dsp_mmx.h"
32 #include "vp3dsp_sse2.h"
/* External IDCT entry points implemented in xvid-derived asm elsewhere in the tree. */
38 extern void ff_idct_xvid_mmx(short *block);
39 extern void ff_idct_xvid_mmx2(short *block);
/* Runtime CPU capability bitmask (MMX/MMX2/3DNow/SSE...), filled in at init. */
41 int mm_flags; /* multimedia extension flags */
43 /* pixel operations */
/* 8-byte-aligned 64-bit SIMD constants: replicated bytes (ff_bone),
 * replicated 16-bit words (ff_wone/ff_wtwo) and an all-ones mask (ff_wabs). */
44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
45 DECLARE_ALIGNED_8 (const uint64_t, ff_wone) = 0x0001000100010001ULL;
46 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
47 DECLARE_ALIGNED_8 (const uint64_t, ff_wabs) = 0xFFFFFFFFFFFFFFFFULL;
/* 16-byte-aligned pair of dwords with only the sign bit set (for SSE2 use). */
49 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
50 {0x8000000080000000ULL, 0x8000000080000000ULL};
/* ff_pw_N: four packed 16-bit words each holding N (rounding/scale constants). */
52 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5 ) = 0x0005000500050005ULL;
55 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
57 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
62 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
63 DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
/* ff_pb_N: eight packed bytes each holding N (masks/offsets for byte ops). */
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_5F ) = 0x5F5F5F5F5F5F5F5FULL;
70 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
71 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
/* Packed-double constants for SSE2 double-precision code paths. */
73 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
74 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
/* Emit an alignment directive before a hot loop label. */
76 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
/* Zero an MMX register: regd ^= regd. */
77 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
/* Build 0x0001000100010001 in regd without a memory load:
 * all-ones compare result shifted right by 15 per 16-bit lane. */
79 #define MOVQ_WONE(regd) \
81 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
82 "psrlw $15, %%" #regd ::)
/* Build 0xFEFEFEFEFEFEFEFE in regd: all-ones, then paddb doubles each
 * byte (0xFF+0xFF wraps to 0xFE). */
84 #define MOVQ_BFE(regd) \
86 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
87 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-load variants of the byte-1 / word-2 constants (used when PIC-safe
 * constant loads are preferred; the alternative definitions below synthesize
 * the same values in registers). */
90 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
91 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
93 // for shared library it's better to use this way for accessing constants
/* Register-only MOVQ_BONE: words of 1, packed down to bytes of 1. */
95 #define MOVQ_BONE(regd) \
97 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
98 "psrlw $15, %%" #regd " \n\t" \
99 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
/* Register-only MOVQ_WTWO: words of 1 shifted left once -> words of 2. */
101 #define MOVQ_WTWO(regd) \
103 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
104 "psrlw $15, %%" #regd " \n\t" \
105 "psllw $1, %%" #regd " \n\t"::)
/* Byte-wise average emulation for plain MMX (no pavgb instruction).
 * avg = (a & b) + (((a ^ b) & 0xFE) >> 1)  -- truncating ("no round")
 * avg = (a | b) - (((a ^ b) & 0xFE) >> 1)  -- rounding-up variant */
109 // using regr as temporary and for the output result
110 // first argument is unmodified and second is trashed
111 // regfe is supposed to contain 0xfefefefefefefefe
112 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
113 "movq " #rega ", " #regr " \n\t"\
114 "pand " #regb ", " #regr " \n\t"\
115 "pxor " #rega ", " #regb " \n\t"\
116 "pand " #regfe "," #regb " \n\t"\
117 "psrlq $1, " #regb " \n\t"\
118 "paddb " #regb ", " #regr " \n\t"
/* Rounding variant: starts from (a | b) and subtracts the halved XOR. */
120 #define PAVGB_MMX(rega, regb, regr, regfe) \
121 "movq " #rega ", " #regr " \n\t"\
122 "por " #regb ", " #regr " \n\t"\
123 "pxor " #rega ", " #regb " \n\t"\
124 "pand " #regfe "," #regb " \n\t"\
125 "psrlq $1, " #regb " \n\t"\
126 "psubb " #regb ", " #regr " \n\t"
/* Paired versions: average two independent register pairs at once,
 * using %%mm6 as the shared 0xFE mask. */
128 // mm6 is supposed to contain 0xfefefefefefefefe
129 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
130 "movq " #rega ", " #regr " \n\t"\
131 "movq " #regc ", " #regp " \n\t"\
132 "pand " #regb ", " #regr " \n\t"\
133 "pand " #regd ", " #regp " \n\t"\
134 "pxor " #rega ", " #regb " \n\t"\
135 "pxor " #regc ", " #regd " \n\t"\
136 "pand %%mm6, " #regb " \n\t"\
137 "pand %%mm6, " #regd " \n\t"\
138 "psrlq $1, " #regb " \n\t"\
139 "psrlq $1, " #regd " \n\t"\
140 "paddb " #regb ", " #regr " \n\t"\
141 "paddb " #regd ", " #regp " \n\t"
143 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
144 "movq " #rega ", " #regr " \n\t"\
145 "movq " #regc ", " #regp " \n\t"\
146 "por " #regb ", " #regr " \n\t"\
147 "por " #regd ", " #regp " \n\t"\
148 "pxor " #rega ", " #regb " \n\t"\
149 "pxor " #regc ", " #regd " \n\t"\
150 "pand %%mm6, " #regb " \n\t"\
151 "pand %%mm6, " #regd " \n\t"\
152 "psrlq $1, " #regd " \n\t"\
153 "psrlq $1, " #regb " \n\t"\
154 "psubb " #regb ", " #regr " \n\t"\
155 "psubb " #regd ", " #regp " \n\t"
157 /***********************************/
158 /* MMX no rounding */
/* Instantiate the rounding-template header with the no-round primitives;
 * generated names get the _no_rnd_ ..._mmx suffix. */
159 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
160 #define SET_RND MOVQ_WONE
161 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
162 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
164 #include "dsputil_mmx_rnd.h"
170 /***********************************/
/* Second instantiation: rounding variants (plain ..._mmx names). */
173 #define DEF(x, y) x ## _ ## y ##_mmx
174 #define SET_RND MOVQ_WTWO
175 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
176 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
178 #include "dsputil_mmx_rnd.h"
185 /***********************************/
/* 3DNow! averaging template: pavgusb is the 3DNow! byte-average insn. */
188 #define DEF(x) x ## _3dnow
189 #define PAVGB "pavgusb"
191 #include "dsputil_mmx_avg.h"
196 /***********************************/
/* MMX2 averaging template: native pavgb instruction. */
199 #define DEF(x) x ## _mmx2
201 /* Introduced only in MMX2 set */
202 #define PAVGB "pavgb"
204 #include "dsputil_mmx_avg.h"
/* Interleave low/high halves of two registers (butterfly step).
 * n selects element width (wd=word, dq=dword); m selects mov size. */
209 #define SBUTTERFLY(a,b,t,n,m)\
210 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
211 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
212 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
/* 4x4 16-bit transpose built from three butterfly passes; t is scratch. */
214 #define TRANSPOSE4(a,b,c,d,t)\
215 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
216 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
217 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
218 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
220 /***********************************/
223 #ifdef CONFIG_ENCODERS
/* Load an 8x8 block of unsigned pixels and widen each byte to a 16-bit
 * DCTELEM (zero-extended via punpck against mm7==0), two rows per pass.
 * NOTE(review): this chunk appears elided -- asm statement opener, loop
 * label and closing brace are not visible here. */
224 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
227 "mov $-128, %%"REG_a" \n\t"
228 "pxor %%mm7, %%mm7 \n\t"
231 "movq (%0), %%mm0 \n\t"
232 "movq (%0, %2), %%mm2 \n\t"
233 "movq %%mm0, %%mm1 \n\t"
234 "movq %%mm2, %%mm3 \n\t"
235 "punpcklbw %%mm7, %%mm0 \n\t"
236 "punpckhbw %%mm7, %%mm1 \n\t"
237 "punpcklbw %%mm7, %%mm2 \n\t"
238 "punpckhbw %%mm7, %%mm3 \n\t"
239 "movq %%mm0, (%1, %%"REG_a") \n\t"
240 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
241 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
242 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
244 "add $32, %%"REG_a" \n\t"
247 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
/* Compute s1 - s2 per pixel, widened to signed 16-bit words, 8 pixels
 * (one row) per iteration; used by the encoder for residual blocks. */
252 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
255 "pxor %%mm7, %%mm7 \n\t"
256 "mov $-128, %%"REG_a" \n\t"
259 "movq (%0), %%mm0 \n\t"
260 "movq (%1), %%mm2 \n\t"
261 "movq %%mm0, %%mm1 \n\t"
262 "movq %%mm2, %%mm3 \n\t"
263 "punpcklbw %%mm7, %%mm0 \n\t"
264 "punpckhbw %%mm7, %%mm1 \n\t"
265 "punpcklbw %%mm7, %%mm2 \n\t"
266 "punpckhbw %%mm7, %%mm3 \n\t"
267 "psubw %%mm2, %%mm0 \n\t"
268 "psubw %%mm3, %%mm1 \n\t"
269 "movq %%mm0, (%2, %%"REG_a") \n\t"
270 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
273 "add $16, %%"REG_a" \n\t"
275 : "+r" (s1), "+r" (s2)
276 : "r" (block+64), "r" ((long)stride)
280 #endif //CONFIG_ENCODERS
/* Store a 16-bit 8x8 IDCT output block to 8-bit pixels: packuswb clamps
 * each signed word to [0,255]. Two asm statements cover 4 rows each.
 * NOTE(review): chunk appears elided -- asm openers and row-advance code
 * between the two halves are not visible here. */
282 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
287 /* read the pixels */
292 "movq %3, %%mm0 \n\t"
293 "movq 8%3, %%mm1 \n\t"
294 "movq 16%3, %%mm2 \n\t"
295 "movq 24%3, %%mm3 \n\t"
296 "movq 32%3, %%mm4 \n\t"
297 "movq 40%3, %%mm5 \n\t"
298 "movq 48%3, %%mm6 \n\t"
299 "movq 56%3, %%mm7 \n\t"
300 "packuswb %%mm1, %%mm0 \n\t"
301 "packuswb %%mm3, %%mm2 \n\t"
302 "packuswb %%mm5, %%mm4 \n\t"
303 "packuswb %%mm7, %%mm6 \n\t"
304 "movq %%mm0, (%0) \n\t"
305 "movq %%mm2, (%0, %1) \n\t"
306 "movq %%mm4, (%0, %1, 2) \n\t"
307 "movq %%mm6, (%0, %2) \n\t"
308 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
/* Second half deliberately uses an "r"(p) constraint instead of "m"(*p)
 * to keep the compiler from pessimizing a duplicated memory operand. */
313 // if here would be an exact copy of the code above
314 // compiler would generate some very strange code
317 "movq (%3), %%mm0 \n\t"
318 "movq 8(%3), %%mm1 \n\t"
319 "movq 16(%3), %%mm2 \n\t"
320 "movq 24(%3), %%mm3 \n\t"
321 "movq 32(%3), %%mm4 \n\t"
322 "movq 40(%3), %%mm5 \n\t"
323 "movq 48(%3), %%mm6 \n\t"
324 "movq 56(%3), %%mm7 \n\t"
325 "packuswb %%mm1, %%mm0 \n\t"
326 "packuswb %%mm3, %%mm2 \n\t"
327 "packuswb %%mm5, %%mm4 \n\t"
328 "packuswb %%mm7, %%mm6 \n\t"
329 "movq %%mm0, (%0) \n\t"
330 "movq %%mm2, (%0, %1) \n\t"
331 "movq %%mm4, (%0, %1, 2) \n\t"
332 "movq %%mm6, (%0, %2) \n\t"
333 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
/* Eight bytes of 0x80: added to signed IDCT output to recenter it to
 * the unsigned 0..255 pixel range. */
337 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
338 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
/* Pack signed 16-bit block values to signed bytes, then bias by 128
 * (mm1 holds vector128). Uses mmx.h intrinsic-style macros rather than
 * inline asm strings. NOTE(review): loop body appears elided here. */
340 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
344 movq_m2r(*vector128, mm1);
345 for (i = 0; i < 8; i++) {
346 movq_m2r(*(block), mm0);
347 packsswb_m2r(*(block + 4), mm0);
350 movq_r2m(mm0, *pixels);
/* Add a 16-bit residual block to existing 8-bit pixels with saturation:
 * widen pixels to words (mm7 assumed zero), paddsw with the block data,
 * then packuswb clamps back to bytes. Processes two rows per asm body. */
355 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
361 /* read the pixels */
368 "movq (%2), %%mm0 \n\t"
369 "movq 8(%2), %%mm1 \n\t"
370 "movq 16(%2), %%mm2 \n\t"
371 "movq 24(%2), %%mm3 \n\t"
372 "movq %0, %%mm4 \n\t"
373 "movq %1, %%mm6 \n\t"
374 "movq %%mm4, %%mm5 \n\t"
375 "punpcklbw %%mm7, %%mm4 \n\t"
376 "punpckhbw %%mm7, %%mm5 \n\t"
377 "paddsw %%mm4, %%mm0 \n\t"
378 "paddsw %%mm5, %%mm1 \n\t"
379 "movq %%mm6, %%mm5 \n\t"
380 "punpcklbw %%mm7, %%mm6 \n\t"
381 "punpckhbw %%mm7, %%mm5 \n\t"
382 "paddsw %%mm6, %%mm2 \n\t"
383 "paddsw %%mm5, %%mm3 \n\t"
384 "packuswb %%mm1, %%mm0 \n\t"
385 "packuswb %%mm3, %%mm2 \n\t"
386 "movq %%mm0, %0 \n\t"
387 "movq %%mm2, %1 \n\t"
388 :"+m"(*pix), "+m"(*(pix+line_size))
/* Straight 4-wide copy: 4 rows of 4 bytes per unrolled pass using movd;
 * REG_a holds 2*line_size so two rows advance per add. h rows total.
 * NOTE(review): loop label / decrement appear elided in this chunk. */
396 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
399 "lea (%3, %3), %%"REG_a" \n\t"
402 "movd (%1), %%mm0 \n\t"
403 "movd (%1, %3), %%mm1 \n\t"
404 "movd %%mm0, (%2) \n\t"
405 "movd %%mm1, (%2, %3) \n\t"
406 "add %%"REG_a", %1 \n\t"
407 "add %%"REG_a", %2 \n\t"
408 "movd (%1), %%mm0 \n\t"
409 "movd (%1, %3), %%mm1 \n\t"
410 "movd %%mm0, (%2) \n\t"
411 "movd %%mm1, (%2, %3) \n\t"
412 "add %%"REG_a", %1 \n\t"
413 "add %%"REG_a", %2 \n\t"
416 : "+g"(h), "+r" (pixels), "+r" (block)
417 : "r"((long)line_size)
/* 8-wide copy: identical structure to put_pixels4_mmx but movq moves a
 * full 8-byte row; 4 rows handled per unrolled pass. */
422 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
425 "lea (%3, %3), %%"REG_a" \n\t"
428 "movq (%1), %%mm0 \n\t"
429 "movq (%1, %3), %%mm1 \n\t"
430 "movq %%mm0, (%2) \n\t"
431 "movq %%mm1, (%2, %3) \n\t"
432 "add %%"REG_a", %1 \n\t"
433 "add %%"REG_a", %2 \n\t"
434 "movq (%1), %%mm0 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq %%mm0, (%2) \n\t"
437 "movq %%mm1, (%2, %3) \n\t"
438 "add %%"REG_a", %1 \n\t"
439 "add %%"REG_a", %2 \n\t"
442 : "+g"(h), "+r" (pixels), "+r" (block)
443 : "r"((long)line_size)
/* 16-wide copy: two movq per row (offsets 0 and 8), 4 rows per unrolled
 * pass; same register scheme as the 4/8-wide variants. */
448 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
451 "lea (%3, %3), %%"REG_a" \n\t"
454 "movq (%1), %%mm0 \n\t"
455 "movq 8(%1), %%mm4 \n\t"
456 "movq (%1, %3), %%mm1 \n\t"
457 "movq 8(%1, %3), %%mm5 \n\t"
458 "movq %%mm0, (%2) \n\t"
459 "movq %%mm4, 8(%2) \n\t"
460 "movq %%mm1, (%2, %3) \n\t"
461 "movq %%mm5, 8(%2, %3) \n\t"
462 "add %%"REG_a", %1 \n\t"
463 "add %%"REG_a", %2 \n\t"
464 "movq (%1), %%mm0 \n\t"
465 "movq 8(%1), %%mm4 \n\t"
466 "movq (%1, %3), %%mm1 \n\t"
467 "movq 8(%1, %3), %%mm5 \n\t"
468 "movq %%mm0, (%2) \n\t"
469 "movq %%mm4, 8(%2) \n\t"
470 "movq %%mm1, (%2, %3) \n\t"
471 "movq %%mm5, 8(%2, %3) \n\t"
472 "add %%"REG_a", %1 \n\t"
473 "add %%"REG_a", %2 \n\t"
476 : "+g"(h), "+r" (pixels), "+r" (block)
477 : "r"((long)line_size)
/* Zero six consecutive 64-word DCT blocks (6*128 bytes) by streaming
 * mm7==0 through 32-byte strides; base pointer is pre-biased by +128*6
 * so the index counts up from -128*6 to 0. */
482 static void clear_blocks_mmx(DCTELEM *blocks)
485 "pxor %%mm7, %%mm7 \n\t"
486 "mov $-128*6, %%"REG_a" \n\t"
488 "movq %%mm7, (%0, %%"REG_a") \n\t"
489 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
490 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
491 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
492 "add $32, %%"REG_a" \n\t"
494 : : "r" (((uint8_t *)blocks)+128*6)
499 #ifdef CONFIG_ENCODERS
/* Sum all pixels of a 16-wide block. Per row: widen 16 bytes to words,
 * accumulate in mm6; final horizontal reduction folds mm6 down via two
 * shifts and masks the result to 16 bits.
 * NOTE(review): 'h' is used below but its declaration is not visible in
 * this elided chunk -- presumably the block height; confirm in full source. */
500 static int pix_sum16_mmx(uint8_t * pix, int line_size){
503 long index= -line_size*h;
506 "pxor %%mm7, %%mm7 \n\t"
507 "pxor %%mm6, %%mm6 \n\t"
509 "movq (%2, %1), %%mm0 \n\t"
510 "movq (%2, %1), %%mm1 \n\t"
511 "movq 8(%2, %1), %%mm2 \n\t"
512 "movq 8(%2, %1), %%mm3 \n\t"
513 "punpcklbw %%mm7, %%mm0 \n\t"
514 "punpckhbw %%mm7, %%mm1 \n\t"
515 "punpcklbw %%mm7, %%mm2 \n\t"
516 "punpckhbw %%mm7, %%mm3 \n\t"
517 "paddw %%mm0, %%mm1 \n\t"
518 "paddw %%mm2, %%mm3 \n\t"
519 "paddw %%mm1, %%mm3 \n\t"
520 "paddw %%mm3, %%mm6 \n\t"
523 "movq %%mm6, %%mm5 \n\t"
524 "psrlq $32, %%mm6 \n\t"
525 "paddw %%mm5, %%mm6 \n\t"
526 "movq %%mm6, %%mm5 \n\t"
527 "psrlq $16, %%mm6 \n\t"
528 "paddw %%mm5, %%mm6 \n\t"
529 "movd %%mm6, %0 \n\t"
530 "andl $0xFFFF, %0 \n\t"
531 : "=&r" (sum), "+r" (index)
532 : "r" (pix - index), "r" ((long)line_size)
537 #endif //CONFIG_ENCODERS
/* dst[i] += src[i] for w bytes: 16 bytes per MMX iteration, with a
 * scalar tail loop (visible at the bottom) for the remaining bytes. */
539 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
543 "movq (%1, %0), %%mm0 \n\t"
544 "movq (%2, %0), %%mm1 \n\t"
545 "paddb %%mm0, %%mm1 \n\t"
546 "movq %%mm1, (%2, %0) \n\t"
547 "movq 8(%1, %0), %%mm0 \n\t"
548 "movq 8(%2, %0), %%mm1 \n\t"
549 "paddb %%mm0, %%mm1 \n\t"
550 "movq %%mm1, 8(%2, %0) \n\t"
555 : "r"(src), "r"(dst), "r"((long)w-15)
558 dst[i+0] += src[i+0];
/* Core of the H.263 deblocking filter, shared by the horizontal and
 * vertical variants. Operands %0..%3 are the four 8-pixel lines around
 * the block edge, %4 is 2*strength, %5 is the ff_pb_FC mask. On exit
 * mm3/mm4 hold the filtered inner lines and mm5/mm6 the adjusted outer
 * lines; callers store them back in the order their layout requires. */
561 #define H263_LOOP_FILTER \
562 "pxor %%mm7, %%mm7 \n\t"\
563 "movq %0, %%mm0 \n\t"\
564 "movq %0, %%mm1 \n\t"\
565 "movq %3, %%mm2 \n\t"\
566 "movq %3, %%mm3 \n\t"\
567 "punpcklbw %%mm7, %%mm0 \n\t"\
568 "punpckhbw %%mm7, %%mm1 \n\t"\
569 "punpcklbw %%mm7, %%mm2 \n\t"\
570 "punpckhbw %%mm7, %%mm3 \n\t"\
571 "psubw %%mm2, %%mm0 \n\t"\
572 "psubw %%mm3, %%mm1 \n\t"\
573 "movq %1, %%mm2 \n\t"\
574 "movq %1, %%mm3 \n\t"\
575 "movq %2, %%mm4 \n\t"\
576 "movq %2, %%mm5 \n\t"\
577 "punpcklbw %%mm7, %%mm2 \n\t"\
578 "punpckhbw %%mm7, %%mm3 \n\t"\
579 "punpcklbw %%mm7, %%mm4 \n\t"\
580 "punpckhbw %%mm7, %%mm5 \n\t"\
581 "psubw %%mm2, %%mm4 \n\t"\
582 "psubw %%mm3, %%mm5 \n\t"\
583 "psllw $2, %%mm4 \n\t"\
584 "psllw $2, %%mm5 \n\t"\
585 "paddw %%mm0, %%mm4 \n\t"\
586 "paddw %%mm1, %%mm5 \n\t"\
587 "pxor %%mm6, %%mm6 \n\t"\
588 "pcmpgtw %%mm4, %%mm6 \n\t"\
589 "pcmpgtw %%mm5, %%mm7 \n\t"\
590 "pxor %%mm6, %%mm4 \n\t"\
591 "pxor %%mm7, %%mm5 \n\t"\
592 "psubw %%mm6, %%mm4 \n\t"\
593 "psubw %%mm7, %%mm5 \n\t"\
594 "psrlw $3, %%mm4 \n\t"\
595 "psrlw $3, %%mm5 \n\t"\
596 "packuswb %%mm5, %%mm4 \n\t"\
597 "packsswb %%mm7, %%mm6 \n\t"\
598 "pxor %%mm7, %%mm7 \n\t"\
599 "movd %4, %%mm2 \n\t"\
600 "punpcklbw %%mm2, %%mm2 \n\t"\
601 "punpcklbw %%mm2, %%mm2 \n\t"\
602 "punpcklbw %%mm2, %%mm2 \n\t"\
603 "psubusb %%mm4, %%mm2 \n\t"\
604 "movq %%mm2, %%mm3 \n\t"\
605 "psubusb %%mm4, %%mm3 \n\t"\
606 "psubb %%mm3, %%mm2 \n\t"\
607 "movq %1, %%mm3 \n\t"\
608 "movq %2, %%mm4 \n\t"\
609 "pxor %%mm6, %%mm3 \n\t"\
610 "pxor %%mm6, %%mm4 \n\t"\
611 "paddusb %%mm2, %%mm3 \n\t"\
612 "psubusb %%mm2, %%mm4 \n\t"\
613 "pxor %%mm6, %%mm3 \n\t"\
614 "pxor %%mm6, %%mm4 \n\t"\
615 "paddusb %%mm2, %%mm2 \n\t"\
616 "packsswb %%mm1, %%mm0 \n\t"\
617 "pcmpgtb %%mm0, %%mm7 \n\t"\
618 "pxor %%mm7, %%mm0 \n\t"\
619 "psubb %%mm7, %%mm0 \n\t"\
620 "movq %%mm0, %%mm1 \n\t"\
621 "psubusb %%mm2, %%mm0 \n\t"\
622 "psubb %%mm0, %%mm1 \n\t"\
623 "pand %5, %%mm1 \n\t"\
624 "psrlw $2, %%mm1 \n\t"\
625 "pxor %%mm7, %%mm1 \n\t"\
626 "psubb %%mm7, %%mm1 \n\t"\
627 "movq %0, %%mm5 \n\t"\
628 "movq %3, %%mm6 \n\t"\
629 "psubb %%mm1, %%mm5 \n\t"\
630 "paddb %%mm1, %%mm6 \n\t"
/* Vertical-edge H.263 deblock: feeds the four lines straddling the edge
 * (src-2*stride .. src+stride) to H263_LOOP_FILTER and writes the
 * filtered registers straight back to those lines. */
632 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
633 if(ENABLE_ANY_H263) {
634 const int strength= ff_h263_loop_filter_strength[qscale];
640 "movq %%mm3, %1 \n\t"
641 "movq %%mm4, %2 \n\t"
642 "movq %%mm5, %0 \n\t"
643 "movq %%mm6, %3 \n\t"
644 : "+m" (*(uint64_t*)(src - 2*stride)),
645 "+m" (*(uint64_t*)(src - 1*stride)),
646 "+m" (*(uint64_t*)(src + 0*stride)),
647 "+m" (*(uint64_t*)(src + 1*stride))
648 : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose a 4x4 byte tile from src (src_stride) into dst (dst_stride)
 * using byte/word unpacks; all I/O via 32-bit movd. */
653 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
654 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
655 "movd %4, %%mm0 \n\t"
656 "movd %5, %%mm1 \n\t"
657 "movd %6, %%mm2 \n\t"
658 "movd %7, %%mm3 \n\t"
659 "punpcklbw %%mm1, %%mm0 \n\t"
660 "punpcklbw %%mm3, %%mm2 \n\t"
661 "movq %%mm0, %%mm1 \n\t"
662 "punpcklwd %%mm2, %%mm0 \n\t"
663 "punpckhwd %%mm2, %%mm1 \n\t"
664 "movd %%mm0, %0 \n\t"
665 "punpckhdq %%mm0, %%mm0 \n\t"
666 "movd %%mm0, %1 \n\t"
667 "movd %%mm1, %2 \n\t"
668 "punpckhdq %%mm1, %%mm1 \n\t"
669 "movd %%mm1, %3 \n\t"
671 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
672 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
673 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
674 "=m" (*(uint32_t*)(dst + 3*dst_stride))
675 : "m" (*(uint32_t*)(src + 0*src_stride)),
676 "m" (*(uint32_t*)(src + 1*src_stride)),
677 "m" (*(uint32_t*)(src + 2*src_stride)),
678 "m" (*(uint32_t*)(src + 3*src_stride))
/* Horizontal-edge H.263 deblock: transposes the 4 columns around the
 * edge into a temp buffer, runs H263_LOOP_FILTER on it, then transposes
 * the filtered registers back into the image with interleaved unpacks. */
682 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
683 if(ENABLE_ANY_H263) {
684 const int strength= ff_h263_loop_filter_strength[qscale];
685 uint64_t temp[4] __attribute__ ((aligned(8)));
686 uint8_t *btemp= (uint8_t*)temp;
690 transpose4x4(btemp , src , 8, stride);
691 transpose4x4(btemp+4, src + 4*stride, 8, stride);
693 H263_LOOP_FILTER // 5 3 4 6
699 : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose the filtered lines (mm3..mm6) back to column order and
 * scatter them with movd stores; %2=stride, %3=3*stride. */
703 "movq %%mm5, %%mm1 \n\t"
704 "movq %%mm4, %%mm0 \n\t"
705 "punpcklbw %%mm3, %%mm5 \n\t"
706 "punpcklbw %%mm6, %%mm4 \n\t"
707 "punpckhbw %%mm3, %%mm1 \n\t"
708 "punpckhbw %%mm6, %%mm0 \n\t"
709 "movq %%mm5, %%mm3 \n\t"
710 "movq %%mm1, %%mm6 \n\t"
711 "punpcklwd %%mm4, %%mm5 \n\t"
712 "punpcklwd %%mm0, %%mm1 \n\t"
713 "punpckhwd %%mm4, %%mm3 \n\t"
714 "punpckhwd %%mm0, %%mm6 \n\t"
715 "movd %%mm5, (%0) \n\t"
716 "punpckhdq %%mm5, %%mm5 \n\t"
717 "movd %%mm5, (%0,%2) \n\t"
718 "movd %%mm3, (%0,%2,2) \n\t"
719 "punpckhdq %%mm3, %%mm3 \n\t"
720 "movd %%mm3, (%0,%3) \n\t"
721 "movd %%mm1, (%1) \n\t"
722 "punpckhdq %%mm1, %%mm1 \n\t"
723 "movd %%mm1, (%1,%2) \n\t"
724 "movd %%mm6, (%1,%2,2) \n\t"
725 "punpckhdq %%mm6, %%mm6 \n\t"
726 "movd %%mm6, (%1,%3) \n\t"
728 "r" (src + 4*stride),
729 "r" ((long) stride ),
730 "r" ((long)(3*stride))
735 #ifdef CONFIG_ENCODERS
/* Sum of squared pixel values of a 16-wide block: widen bytes to words,
 * pmaddwd squares-and-pairs them, dword sums accumulate in mm7, final
 * fold adds hi/lo dwords. Result returned via 'tmp'. */
736 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
743 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
744 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
746 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
748 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
749 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
751 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
752 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
753 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
755 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
756 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
758 "pmaddwd %%mm3,%%mm3\n"
759 "pmaddwd %%mm4,%%mm4\n"
761 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
762 pix2^2+pix3^2+pix6^2+pix7^2) */
763 "paddd %%mm3,%%mm4\n"
764 "paddd %%mm2,%%mm7\n"
767 "paddd %%mm4,%%mm7\n"
772 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
773 "paddd %%mm7,%%mm1\n"
775 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
/* Sum of squared errors between two 8-wide blocks, two rows per pass:
 * |a-b| via dual saturated subtract + por, widen, pmaddwd squares,
 * dword accumulation in mm7, final hi/lo dword fold. */
779 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
784 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
785 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
787 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
788 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
789 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
790 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
792 /* todo: mm1-mm2, mm3-mm4 */
793 /* algo: subtract mm1 from mm2 with saturation and vice versa */
794 /* OR the results to get absolute difference */
797 "psubusb %%mm2,%%mm1\n"
798 "psubusb %%mm4,%%mm3\n"
799 "psubusb %%mm5,%%mm2\n"
800 "psubusb %%mm6,%%mm4\n"
805 /* now convert to 16-bit vectors so we can square them */
809 "punpckhbw %%mm0,%%mm2\n"
810 "punpckhbw %%mm0,%%mm4\n"
811 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
812 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
814 "pmaddwd %%mm2,%%mm2\n"
815 "pmaddwd %%mm4,%%mm4\n"
816 "pmaddwd %%mm1,%%mm1\n"
817 "pmaddwd %%mm3,%%mm3\n"
819 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
820 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
822 "paddd %%mm2,%%mm1\n"
823 "paddd %%mm4,%%mm3\n"
824 "paddd %%mm1,%%mm7\n"
825 "paddd %%mm3,%%mm7\n"
831 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
832 "paddd %%mm7,%%mm1\n"
834 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
835 : "r" ((long)line_size) , "m" (h)
/* SSE for 16-wide blocks, one row (two 8-byte halves) per pass; same
 * abs-diff / square / accumulate scheme as sse8_mmx. */
840 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
844 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
845 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
847 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
848 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
849 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
850 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
852 /* todo: mm1-mm2, mm3-mm4 */
853 /* algo: subtract mm1 from mm2 with saturation and vice versa */
854 /* OR the results to get absolute difference */
857 "psubusb %%mm2,%%mm1\n"
858 "psubusb %%mm4,%%mm3\n"
859 "psubusb %%mm5,%%mm2\n"
860 "psubusb %%mm6,%%mm4\n"
865 /* now convert to 16-bit vectors so we can square them */
869 "punpckhbw %%mm0,%%mm2\n"
870 "punpckhbw %%mm0,%%mm4\n"
871 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
872 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
874 "pmaddwd %%mm2,%%mm2\n"
875 "pmaddwd %%mm4,%%mm4\n"
876 "pmaddwd %%mm1,%%mm1\n"
877 "pmaddwd %%mm3,%%mm3\n"
882 "paddd %%mm2,%%mm1\n"
883 "paddd %%mm4,%%mm3\n"
884 "paddd %%mm1,%%mm7\n"
885 "paddd %%mm3,%%mm7\n"
891 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
892 "paddd %%mm7,%%mm1\n"
894 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
895 : "r" ((long)line_size) , "m" (h)
/* SSE2 version of sse16: full 16-byte rows per xmm register, two rows
 * per pass (movdqu allows unaligned input); final fold shifts the high
 * qword then the high dword of xmm7 down. */
900 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
904 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
905 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
907 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
908 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
909 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
910 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
912 /* todo: mm1-mm2, mm3-mm4 */
913 /* algo: subtract mm1 from mm2 with saturation and vice versa */
914 /* OR the results to get absolute difference */
915 "movdqa %%xmm1,%%xmm5\n"
916 "movdqa %%xmm3,%%xmm6\n"
917 "psubusb %%xmm2,%%xmm1\n"
918 "psubusb %%xmm4,%%xmm3\n"
919 "psubusb %%xmm5,%%xmm2\n"
920 "psubusb %%xmm6,%%xmm4\n"
922 "por %%xmm1,%%xmm2\n"
923 "por %%xmm3,%%xmm4\n"
925 /* now convert to 16-bit vectors so we can square them */
926 "movdqa %%xmm2,%%xmm1\n"
927 "movdqa %%xmm4,%%xmm3\n"
929 "punpckhbw %%xmm0,%%xmm2\n"
930 "punpckhbw %%xmm0,%%xmm4\n"
931 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
932 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
934 "pmaddwd %%xmm2,%%xmm2\n"
935 "pmaddwd %%xmm4,%%xmm4\n"
936 "pmaddwd %%xmm1,%%xmm1\n"
937 "pmaddwd %%xmm3,%%xmm3\n"
939 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
940 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
942 "paddd %%xmm2,%%xmm1\n"
943 "paddd %%xmm4,%%xmm3\n"
944 "paddd %%xmm1,%%xmm7\n"
945 "paddd %%xmm3,%%xmm7\n"
950 "movdqa %%xmm7,%%xmm1\n"
951 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
952 "paddd %%xmm1,%%xmm7\n"
953 "movdqa %%xmm7,%%xmm1\n"
954 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
955 "paddd %%xmm1,%%xmm7\n"
957 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
958 : "r" ((long)line_size));
/* Second-derivative ("high-frequency noise") metric over an 8-wide strip:
 * for consecutive row pairs, take horizontal differences, then the
 * difference of those differences between rows, accumulate |d2| in mm6.
 * The repeated stanzas alternate which register set (mm0/mm2 vs mm4/mm5)
 * holds the previous row's differences. Used by the nsse* comparators. */
962 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
970 "movq %%mm0, %%mm1\n"
974 "movq %%mm0, %%mm2\n"
975 "movq %%mm1, %%mm3\n"
976 "punpcklbw %%mm7,%%mm0\n"
977 "punpcklbw %%mm7,%%mm1\n"
978 "punpckhbw %%mm7,%%mm2\n"
979 "punpckhbw %%mm7,%%mm3\n"
980 "psubw %%mm1, %%mm0\n"
981 "psubw %%mm3, %%mm2\n"
986 "movq %%mm4, %%mm1\n"
990 "movq %%mm4, %%mm5\n"
991 "movq %%mm1, %%mm3\n"
992 "punpcklbw %%mm7,%%mm4\n"
993 "punpcklbw %%mm7,%%mm1\n"
994 "punpckhbw %%mm7,%%mm5\n"
995 "punpckhbw %%mm7,%%mm3\n"
996 "psubw %%mm1, %%mm4\n"
997 "psubw %%mm3, %%mm5\n"
998 "psubw %%mm4, %%mm0\n"
999 "psubw %%mm5, %%mm2\n"
/* abs(d2): compare-against-zero sign mask, xor, subtract. */
1000 "pxor %%mm3, %%mm3\n"
1001 "pxor %%mm1, %%mm1\n"
1002 "pcmpgtw %%mm0, %%mm3\n\t"
1003 "pcmpgtw %%mm2, %%mm1\n\t"
1004 "pxor %%mm3, %%mm0\n"
1005 "pxor %%mm1, %%mm2\n"
1006 "psubw %%mm3, %%mm0\n"
1007 "psubw %%mm1, %%mm2\n"
1008 "paddw %%mm0, %%mm2\n"
1009 "paddw %%mm2, %%mm6\n"
1015 "movq %%mm0, %%mm1\n"
1019 "movq %%mm0, %%mm2\n"
1020 "movq %%mm1, %%mm3\n"
1021 "punpcklbw %%mm7,%%mm0\n"
1022 "punpcklbw %%mm7,%%mm1\n"
1023 "punpckhbw %%mm7,%%mm2\n"
1024 "punpckhbw %%mm7,%%mm3\n"
1025 "psubw %%mm1, %%mm0\n"
1026 "psubw %%mm3, %%mm2\n"
1027 "psubw %%mm0, %%mm4\n"
1028 "psubw %%mm2, %%mm5\n"
1029 "pxor %%mm3, %%mm3\n"
1030 "pxor %%mm1, %%mm1\n"
1031 "pcmpgtw %%mm4, %%mm3\n\t"
1032 "pcmpgtw %%mm5, %%mm1\n\t"
1033 "pxor %%mm3, %%mm4\n"
1034 "pxor %%mm1, %%mm5\n"
1035 "psubw %%mm3, %%mm4\n"
1036 "psubw %%mm1, %%mm5\n"
1037 "paddw %%mm4, %%mm5\n"
1038 "paddw %%mm5, %%mm6\n"
1043 "movq %%mm4, %%mm1\n"
1047 "movq %%mm4, %%mm5\n"
1048 "movq %%mm1, %%mm3\n"
1049 "punpcklbw %%mm7,%%mm4\n"
1050 "punpcklbw %%mm7,%%mm1\n"
1051 "punpckhbw %%mm7,%%mm5\n"
1052 "punpckhbw %%mm7,%%mm3\n"
1053 "psubw %%mm1, %%mm4\n"
1054 "psubw %%mm3, %%mm5\n"
1055 "psubw %%mm4, %%mm0\n"
1056 "psubw %%mm5, %%mm2\n"
1057 "pxor %%mm3, %%mm3\n"
1058 "pxor %%mm1, %%mm1\n"
1059 "pcmpgtw %%mm0, %%mm3\n\t"
1060 "pcmpgtw %%mm2, %%mm1\n\t"
1061 "pxor %%mm3, %%mm0\n"
1062 "pxor %%mm1, %%mm2\n"
1063 "psubw %%mm3, %%mm0\n"
1064 "psubw %%mm1, %%mm2\n"
1065 "paddw %%mm0, %%mm2\n"
1066 "paddw %%mm2, %%mm6\n"
/* Horizontal reduction of the word accumulator mm6 into a dword sum. */
1072 "movq %%mm6, %%mm0\n"
1073 "punpcklwd %%mm7,%%mm0\n"
1074 "punpckhwd %%mm7,%%mm6\n"
1075 "paddd %%mm0, %%mm6\n"
1077 "movq %%mm6,%%mm0\n"
1078 "psrlq $32, %%mm6\n"
1079 "paddd %%mm6,%%mm0\n"
1081 : "+r" (pix1), "=r"(tmp)
1082 : "r" ((long)line_size) , "g" (h-2)
/* 16-wide variant of hf_noise8_mmx: handles the left 8 columns here
 * (horizontal diffs against the 1-byte-shifted row, 1(%0)), then
 * delegates the right half to hf_noise8_mmx(pix+8, ...) at the end. */
1087 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1089 uint8_t * pix= pix1;
1092 "pxor %%mm7,%%mm7\n"
1093 "pxor %%mm6,%%mm6\n"
1096 "movq 1(%0),%%mm1\n"
1097 "movq %%mm0, %%mm2\n"
1098 "movq %%mm1, %%mm3\n"
1099 "punpcklbw %%mm7,%%mm0\n"
1100 "punpcklbw %%mm7,%%mm1\n"
1101 "punpckhbw %%mm7,%%mm2\n"
1102 "punpckhbw %%mm7,%%mm3\n"
1103 "psubw %%mm1, %%mm0\n"
1104 "psubw %%mm3, %%mm2\n"
1109 "movq 1(%0),%%mm1\n"
1110 "movq %%mm4, %%mm5\n"
1111 "movq %%mm1, %%mm3\n"
1112 "punpcklbw %%mm7,%%mm4\n"
1113 "punpcklbw %%mm7,%%mm1\n"
1114 "punpckhbw %%mm7,%%mm5\n"
1115 "punpckhbw %%mm7,%%mm3\n"
1116 "psubw %%mm1, %%mm4\n"
1117 "psubw %%mm3, %%mm5\n"
1118 "psubw %%mm4, %%mm0\n"
1119 "psubw %%mm5, %%mm2\n"
/* abs via sign-mask xor/subtract, accumulate into mm6. */
1120 "pxor %%mm3, %%mm3\n"
1121 "pxor %%mm1, %%mm1\n"
1122 "pcmpgtw %%mm0, %%mm3\n\t"
1123 "pcmpgtw %%mm2, %%mm1\n\t"
1124 "pxor %%mm3, %%mm0\n"
1125 "pxor %%mm1, %%mm2\n"
1126 "psubw %%mm3, %%mm0\n"
1127 "psubw %%mm1, %%mm2\n"
1128 "paddw %%mm0, %%mm2\n"
1129 "paddw %%mm2, %%mm6\n"
1135 "movq 1(%0),%%mm1\n"
1136 "movq %%mm0, %%mm2\n"
1137 "movq %%mm1, %%mm3\n"
1138 "punpcklbw %%mm7,%%mm0\n"
1139 "punpcklbw %%mm7,%%mm1\n"
1140 "punpckhbw %%mm7,%%mm2\n"
1141 "punpckhbw %%mm7,%%mm3\n"
1142 "psubw %%mm1, %%mm0\n"
1143 "psubw %%mm3, %%mm2\n"
1144 "psubw %%mm0, %%mm4\n"
1145 "psubw %%mm2, %%mm5\n"
1146 "pxor %%mm3, %%mm3\n"
1147 "pxor %%mm1, %%mm1\n"
1148 "pcmpgtw %%mm4, %%mm3\n\t"
1149 "pcmpgtw %%mm5, %%mm1\n\t"
1150 "pxor %%mm3, %%mm4\n"
1151 "pxor %%mm1, %%mm5\n"
1152 "psubw %%mm3, %%mm4\n"
1153 "psubw %%mm1, %%mm5\n"
1154 "paddw %%mm4, %%mm5\n"
1155 "paddw %%mm5, %%mm6\n"
1160 "movq 1(%0),%%mm1\n"
1161 "movq %%mm4, %%mm5\n"
1162 "movq %%mm1, %%mm3\n"
1163 "punpcklbw %%mm7,%%mm4\n"
1164 "punpcklbw %%mm7,%%mm1\n"
1165 "punpckhbw %%mm7,%%mm5\n"
1166 "punpckhbw %%mm7,%%mm3\n"
1167 "psubw %%mm1, %%mm4\n"
1168 "psubw %%mm3, %%mm5\n"
1169 "psubw %%mm4, %%mm0\n"
1170 "psubw %%mm5, %%mm2\n"
1171 "pxor %%mm3, %%mm3\n"
1172 "pxor %%mm1, %%mm1\n"
1173 "pcmpgtw %%mm0, %%mm3\n\t"
1174 "pcmpgtw %%mm2, %%mm1\n\t"
1175 "pxor %%mm3, %%mm0\n"
1176 "pxor %%mm1, %%mm2\n"
1177 "psubw %%mm3, %%mm0\n"
1178 "psubw %%mm1, %%mm2\n"
1179 "paddw %%mm0, %%mm2\n"
1180 "paddw %%mm2, %%mm6\n"
/* Fold mm6 word accumulator down to a single dword result. */
1186 "movq %%mm6, %%mm0\n"
1187 "punpcklwd %%mm7,%%mm0\n"
1188 "punpckhwd %%mm7,%%mm6\n"
1189 "paddd %%mm0, %%mm6\n"
1191 "movq %%mm6,%%mm0\n"
1192 "psrlq $32, %%mm6\n"
1193 "paddd %%mm6,%%mm0\n"
1195 : "+r" (pix1), "=r"(tmp)
1196 : "r" ((long)line_size) , "g" (h-2)
/* Right half of the 16-wide strip is scored by the 8-wide kernel. */
1198 return tmp + hf_noise8_mmx(pix+8, line_size, h);
/* Noise-preserving SSE (16-wide): plain SSE plus a weighted penalty for
 * the difference in high-frequency noise between the two blocks. With a
 * context, uses its configured sse function and nsse_weight; without one
 * falls back to sse16_mmx and a fixed weight of 8. */
1201 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1202 MpegEncContext *c = p;
1205 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1206 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1207 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1209 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1210 else return score1 + FFABS(score2)*8;
/* 8-wide counterpart of nsse16_mmx; always uses sse8_mmx for the base
 * score, weight taken from the context when available (else 8). */
1213 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1214 MpegEncContext *c = p;
1215 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1216 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1218 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1219 else return score1 + FFABS(score2)*8;
/* Vertical SAD of a 16-wide block against itself (intra): sums
 * |row[i] - row[i-1]| using the saturated-subtract/por abs-diff trick
 * in the SUM macro, accumulating words in mm6; result masked to 16 bits.
 * Requires 8-byte-aligned pix and stride (asserted). */
1222 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1225 assert( (((int)pix) & 7) == 0);
1226 assert((line_size &7) ==0);
/* SUM(in0,in1,out0,out1): in0/in1 = previous row halves; loads current
 * row into out0/out1, adds |cur - prev| (widened to words) into mm6. */
1228 #define SUM(in0, in1, out0, out1) \
1229 "movq (%0), %%mm2\n"\
1230 "movq 8(%0), %%mm3\n"\
1232 "movq %%mm2, " #out0 "\n"\
1233 "movq %%mm3, " #out1 "\n"\
1234 "psubusb " #in0 ", %%mm2\n"\
1235 "psubusb " #in1 ", %%mm3\n"\
1236 "psubusb " #out0 ", " #in0 "\n"\
1237 "psubusb " #out1 ", " #in1 "\n"\
1238 "por %%mm2, " #in0 "\n"\
1239 "por %%mm3, " #in1 "\n"\
1240 "movq " #in0 ", %%mm2\n"\
1241 "movq " #in1 ", %%mm3\n"\
1242 "punpcklbw %%mm7, " #in0 "\n"\
1243 "punpcklbw %%mm7, " #in1 "\n"\
1244 "punpckhbw %%mm7, %%mm2\n"\
1245 "punpckhbw %%mm7, %%mm3\n"\
1246 "paddw " #in1 ", " #in0 "\n"\
1247 "paddw %%mm3, %%mm2\n"\
1248 "paddw %%mm2, " #in0 "\n"\
1249 "paddw " #in0 ", %%mm6\n"
1254 "pxor %%mm6,%%mm6\n"
1255 "pxor %%mm7,%%mm7\n"
1257 "movq 8(%0),%%mm1\n"
/* Ping-pong the register pairs so the previous row stays live. */
1260 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1263 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1265 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Horizontal fold of word sums in mm6 (32- then 16-bit shifts). */
1270 "movq %%mm6,%%mm0\n"
1271 "psrlq $32, %%mm6\n"
1272 "paddw %%mm6,%%mm0\n"
1273 "movq %%mm0,%%mm6\n"
1274 "psrlq $16, %%mm0\n"
1275 "paddw %%mm6,%%mm0\n"
1277 : "+r" (pix), "=r"(tmp)
1278 : "r" ((long)line_size) , "m" (h)
1280 return tmp & 0xFFFF;
/* MMX2 version of vsad_intra16: psadbw computes the 8-byte abs-diff sum
 * directly, collapsing the SUM macro to four instructions per row pair. */
1284 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1287 assert( (((int)pix) & 7) == 0);
1288 assert((line_size &7) ==0);
1290 #define SUM(in0, in1, out0, out1) \
1291 "movq (%0), " #out0 "\n"\
1292 "movq 8(%0), " #out1 "\n"\
1294 "psadbw " #out0 ", " #in0 "\n"\
1295 "psadbw " #out1 ", " #in1 "\n"\
1296 "paddw " #in1 ", " #in0 "\n"\
1297 "paddw " #in0 ", %%mm6\n"
1301 "pxor %%mm6,%%mm6\n"
1302 "pxor %%mm7,%%mm7\n"
1304 "movq 8(%0),%%mm1\n"
1307 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1310 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1312 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1318 : "+r" (pix), "=r"(tmp)
1319 : "r" ((long)line_size) , "m" (h)
/* Vertical SAD of the difference signal (pix1 - pix2) over a 16-wide block:
 * compares each row's difference against the previous row's difference.
 * Byte differences are biased by XORing with 0x80 so signed values can be
 * handled with unsigned-saturating arithmetic.
 * NOTE(review): the asm() wrapper, loop control and the final movd into tmp
 * are elided from this excerpt. */
1325 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* movq requires 8-byte alignment of both pointers and the stride */
1328 assert( (((int)pix1) & 7) == 0);
1329 assert( (((int)pix2) & 7) == 0);
1330 assert((line_size &7) ==0);
/* SUM: compute the biased difference of the current row pair, take the
 * absolute difference against the previous biased difference (in0/in1),
 * widen to words and accumulate into mm6; leave the current difference in
 * out0/out1 for the next step.  mm7 holds the 0x80... bias constant here,
 * yet is also used as the unpack operand below — NOTE(review): verify the
 * bias cancels in the unpacked sums as the original authors intended. */
1332 #define SUM(in0, in1, out0, out1) \
1333 "movq (%0),%%mm2\n"\
1334 "movq (%1)," #out0 "\n"\
1335 "movq 8(%0),%%mm3\n"\
1336 "movq 8(%1)," #out1 "\n"\
1339 "psubb " #out0 ", %%mm2\n"\
1340 "psubb " #out1 ", %%mm3\n"\
1341 "pxor %%mm7, %%mm2\n"\
1342 "pxor %%mm7, %%mm3\n"\
1343 "movq %%mm2, " #out0 "\n"\
1344 "movq %%mm3, " #out1 "\n"\
1345 "psubusb " #in0 ", %%mm2\n"\
1346 "psubusb " #in1 ", %%mm3\n"\
1347 "psubusb " #out0 ", " #in0 "\n"\
1348 "psubusb " #out1 ", " #in1 "\n"\
1349 "por %%mm2, " #in0 "\n"\
1350 "por %%mm3, " #in1 "\n"\
1351 "movq " #in0 ", %%mm2\n"\
1352 "movq " #in1 ", %%mm3\n"\
1353 "punpcklbw %%mm7, " #in0 "\n"\
1354 "punpcklbw %%mm7, " #in1 "\n"\
1355 "punpckhbw %%mm7, %%mm2\n"\
1356 "punpckhbw %%mm7, %%mm3\n"\
1357 "paddw " #in1 ", " #in0 "\n"\
1358 "paddw %%mm3, %%mm2\n"\
1359 "paddw %%mm2, " #in0 "\n"\
1360 "paddw " #in0 ", %%mm6\n"
/* mm6 = accumulator; build mm7 = 0x8080808080808080 (sign-flip bias) via
 * all-ones -> 0x8000 words -> packed signed-saturated bytes */
1365 "pxor %%mm6,%%mm6\n"
1366 "pcmpeqw %%mm7,%%mm7\n"
1367 "psllw $15, %%mm7\n"
1368 "packsswb %%mm7, %%mm7\n"
1371 "movq 8(%0),%%mm1\n"
1372 "movq 8(%1),%%mm3\n"
/* prime the "previous row" difference for the first SUM iteration */
1376 "psubb %%mm2, %%mm0\n"
1377 "psubb %%mm3, %%mm1\n"
1378 "pxor %%mm7, %%mm0\n"
1379 "pxor %%mm7, %%mm1\n"
1380 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1383 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1385 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* horizontally reduce the four word sums in mm6 into the low word */
1390 "movq %%mm6,%%mm0\n"
1391 "psrlq $32, %%mm6\n"
1392 "paddw %%mm6,%%mm0\n"
1393 "movq %%mm0,%%mm6\n"
1394 "psrlq $16, %%mm0\n"
1395 "paddw %%mm6,%%mm0\n"
1397 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1398 : "r" ((long)line_size) , "m" (h)
/* mask to 15 bits (the word sums may saturate; top bit discarded) */
1400 return tmp & 0x7FFF;
/* MMX2 version of vsad16: same biased-difference scheme as vsad16_mmx, but
 * the absolute-difference-and-sum is done with a single psadbw per 8 bytes.
 * NOTE(review): the asm() wrapper, loop control and the return statement
 * are elided from this excerpt. */
1404 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* 8-byte alignment required for movq/psadbw operands */
1407 assert( (((int)pix1) & 7) == 0);
1408 assert( (((int)pix2) & 7) == 0);
1409 assert((line_size &7) ==0);
/* SUM: biased current-row difference into out0/out1, psadbw against the
 * previous difference in in0/in1, accumulate into mm6 */
1411 #define SUM(in0, in1, out0, out1) \
1412 "movq (%0)," #out0 "\n"\
1413 "movq (%1),%%mm2\n"\
1414 "movq 8(%0)," #out1 "\n"\
1415 "movq 8(%1),%%mm3\n"\
1418 "psubb %%mm2, " #out0 "\n"\
1419 "psubb %%mm3, " #out1 "\n"\
1420 "pxor %%mm7, " #out0 "\n"\
1421 "pxor %%mm7, " #out1 "\n"\
1422 "psadbw " #out0 ", " #in0 "\n"\
1423 "psadbw " #out1 ", " #in1 "\n"\
1424 "paddw " #in1 ", " #in0 "\n"\
1425 "paddw " #in0 ", %%mm6\n"
/* mm6 = accumulator; build mm7 = 0x80 bytes (sign-flip bias), see vsad16_mmx */
1429 "pxor %%mm6,%%mm6\n"
1430 "pcmpeqw %%mm7,%%mm7\n"
1431 "psllw $15, %%mm7\n"
1432 "packsswb %%mm7, %%mm7\n"
1435 "movq 8(%0),%%mm1\n"
1436 "movq 8(%1),%%mm3\n"
/* prime the "previous row" difference for the first SUM iteration */
1440 "psubb %%mm2, %%mm0\n"
1441 "psubb %%mm3, %%mm1\n"
1442 "pxor %%mm7, %%mm0\n"
1443 "pxor %%mm7, %%mm1\n"
1444 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1447 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1449 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1455 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1456 : "r" ((long)line_size) , "m" (h)
/* dst[i] = src1[i] - src2[i] for i in [0, w): 16 bytes per iteration with
 * MMX psubb, followed by a scalar tail for the remaining (w mod 16) bytes.
 * NOTE(review): the asm() wrapper, loop control, and the scalar tail-loop
 * header are elided from this excerpt; only the loop bodies are visible. */
1462 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
/* %1=src1, %2=src2, %3=dst, %0=negative-offset induction variable */
1466 "movq (%2, %0), %%mm0 \n\t"
1467 "movq (%1, %0), %%mm1 \n\t"
1468 "psubb %%mm0, %%mm1 \n\t"
1469 "movq %%mm1, (%3, %0) \n\t"
1470 "movq 8(%2, %0), %%mm0 \n\t"
1471 "movq 8(%1, %0), %%mm1 \n\t"
1472 "psubb %%mm0, %%mm1 \n\t"
1473 "movq %%mm1, 8(%3, %0) \n\t"
/* w-15 bounds the vector loop so the two movq's never overrun the buffers */
1478 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
/* scalar tail: one byte per iteration */
1481 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual: dst[i] = src2[i] - median(L, T, L+T-LT)
 * where L is the left neighbour, T the top neighbour and LT the top-left.
 * The median of three is computed branchlessly with pmaxub/pminub (MMX2).
 * Updates *left and *left_top for the caller's next row.
 * NOTE(review): the asm() wrapper, loop control, and the lines loading
 * l/lt from *left/*left_top are elided from this excerpt. */
1484 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1490 "movq -1(%1, %0), %%mm0 \n\t" // LT
1491 "movq (%1, %0), %%mm1 \n\t" // T
1492 "movq -1(%2, %0), %%mm2 \n\t" // L
1493 "movq (%2, %0), %%mm3 \n\t" // X
1494 "movq %%mm2, %%mm4 \n\t" // L
1495 "psubb %%mm0, %%mm2 \n\t"
1496 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1497 "movq %%mm4, %%mm5 \n\t" // L
1498 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1499 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
/* median = max(min(T,L), min(max(T,L), L+T-LT)) */
1500 "pminub %%mm2, %%mm4 \n\t"
1501 "pmaxub %%mm1, %%mm4 \n\t"
1502 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1503 "movq %%mm3, (%3, %0) \n\t"
1508 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
/* first column handled in scalar code with the carried-in l/lt values */
1514 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1516 *left_top= src1[w-1];
/* DIFF_PIXELS_1: load one row from p1 and p2 (mov"m" selects movd/movq width),
 * unpack-interleave so that a - t yields the per-pixel word differences in a.
 * DIFF_PIXELS_8 applies it to 8 rows (register prefix "mm" selects mm/xmm),
 * spilling register 0 to the temp memory operand mid-way because only 8
 * SIMD registers are available on 32-bit x86.
 * NOTE(review): blank/terminator lines between these macros and part of the
 * asm() wrapper are elided from this excerpt. */
1520 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1521 "mov"#m" "#p1", "#a" \n\t"\
1522 "mov"#m" "#p2", "#t" \n\t"\
1523 "punpcklbw "#a", "#t" \n\t"\
1524 "punpcklbw "#a", "#a" \n\t"\
1525 "psubw "#t", "#a" \n\t"\
1527 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1528 uint8_t *p1b=p1, *p2b=p2;\
1530 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1531 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1532 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1535 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1536 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1537 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1538 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1539 "mov"#m1" "#mm"0, %0 \n\t"\
1540 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1541 "mov"#m1" %0, "#mm"0 \n\t"\
1542 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1543 : "r"((long)stride), "r"((long)stride*3)\
1546 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
/* 4x8 uses movd + MMX registers; 8x8 uses movq + SSE2 xmm registers */
1548 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1549 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
/* 8x8 word transpose built from SBUTTERFLY interleave steps; the output
 * registers come back permuted as noted below.  Two alternative definitions:
 * the first uses xmm8 as scratch (x86_64 only), the second spills through
 * the memory operand t instead (32-bit, only xmm0-7 available).
 * NOTE(review): the #if/#else/#endif lines selecting between them are
 * elided from this excerpt. */
1552 // permutes 01234567 -> 05736421
1553 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1554 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1555 SBUTTERFLY(c,d,b,wd,dqa)\
1556 SBUTTERFLY(e,f,d,wd,dqa)\
1557 SBUTTERFLY(g,h,f,wd,dqa)\
1558 SBUTTERFLY(a,c,h,dq,dqa)\
1559 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1560 SBUTTERFLY(e,g,b,dq,dqa)\
1561 SBUTTERFLY(d,f,g,dq,dqa)\
1562 SBUTTERFLY(a,e,f,qdq,dqa)\
1563 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1564 SBUTTERFLY(h,b,d,qdq,dqa)\
1565 SBUTTERFLY(c,g,b,qdq,dqa)\
1566 "movdqa %%xmm8, "#g" \n\t"
/* 32-bit variant: no xmm8, so h is spilled to t and 16+t mid-transpose */
1568 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1569 "movdqa "#h", "#t" \n\t"\
1570 SBUTTERFLY(a,b,h,wd,dqa)\
1571 "movdqa "#h", 16"#t" \n\t"\
1572 "movdqa "#t", "#h" \n\t"\
1573 SBUTTERFLY(c,d,b,wd,dqa)\
1574 SBUTTERFLY(e,f,d,wd,dqa)\
1575 SBUTTERFLY(g,h,f,wd,dqa)\
1576 SBUTTERFLY(a,c,h,dq,dqa)\
1577 "movdqa "#h", "#t" \n\t"\
1578 "movdqa 16"#t", "#h" \n\t"\
1579 SBUTTERFLY(h,b,c,dq,dqa)\
1580 SBUTTERFLY(e,g,b,dq,dqa)\
1581 SBUTTERFLY(d,f,g,dq,dqa)\
1582 SBUTTERFLY(a,e,f,qdq,dqa)\
1583 SBUTTERFLY(h,d,e,qdq,dqa)\
1584 "movdqa "#h", 16"#t" \n\t"\
1585 "movdqa "#t", "#h" \n\t"\
1586 SBUTTERFLY(h,b,d,qdq,dqa)\
1587 SBUTTERFLY(c,g,b,qdq,dqa)\
1588 "movdqa 16"#t", "#g" \n\t"
/* LBUTTERFLY2: two interleaved butterfly steps — a += b; b = 2b - (a+b),
 * i.e. (a,b) -> (a+b, b-a) — the basic Hadamard-transform operation,
 * interleaved over two register pairs to hide instruction latency. */
1591 #define LBUTTERFLY2(a1,b1,a2,b2)\
1592 "paddw " #b1 ", " #a1 " \n\t"\
1593 "paddw " #b2 ", " #a2 " \n\t"\
1594 "paddw " #b1 ", " #b1 " \n\t"\
1595 "paddw " #b2 ", " #b2 " \n\t"\
1596 "psubw " #a1 ", " #b1 " \n\t"\
1597 "psubw " #a2 ", " #b2 " \n\t"
/* HADAMARD8: 3-stage (stride 1, 2, 4) 8-point Hadamard transform across
 * eight SIMD registers, each holding one row of word coefficients */
1599 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1600 LBUTTERFLY2(m0, m1, m2, m3)\
1601 LBUTTERFLY2(m4, m5, m6, m7)\
1602 LBUTTERFLY2(m0, m2, m1, m3)\
1603 LBUTTERFLY2(m4, m6, m5, m7)\
1604 LBUTTERFLY2(m0, m4, m1, m5)\
1605 LBUTTERFLY2(m2, m6, m3, m7)\
1607 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
/* MMABS_MMX: a = |a| via sign mask z = (a<0) ? ~0 : 0; a = (a^z) - z */
1609 #define MMABS_MMX(a,z)\
1610 "pxor " #z ", " #z " \n\t"\
1611 "pcmpgtw " #a ", " #z " \n\t"\
1612 "pxor " #z ", " #a " \n\t"\
1613 "psubw " #z ", " #a " \n\t"
/* MMABS_MMX2: a = max(a, -a) using pmaxsw — fewer instructions */
1615 #define MMABS_MMX2(a,z)\
1616 "pxor " #z ", " #z " \n\t"\
1617 "psubw " #a ", " #z " \n\t"\
1618 "pmaxsw " #z ", " #a " \n\t"
/* MMABS_SSSE3: single-instruction absolute value (z unused) */
1620 #define MMABS_SSSE3(a,z)\
1621 "pabsw " #a ", " #a " \n\t"
/* MMABS_SUM: |a| added (with unsigned saturation) into sum.
 * NOTE(review): the MMABS(a,z) invocation line of this macro appears to be
 * elided from this excerpt. */
1623 #define MMABS_SUM(a,z, sum)\
1625 "paddusw " #a ", " #sum " \n\t"
/* Sum of absolute values of the 8 xmm registers into xmm0.  The NOSPILL
 * variant uses xmm8/xmm9 as scratch (x86_64, or SSSE3 where MMABS needs no
 * scratch); the second SSE2 definition spills xmm7 through memory because
 * only 8 registers are available on 32-bit x86.
 * NOTE(review): the #if/#else/#endif lines selecting between the two
 * MMABS_SUM_8x8_SSE2 definitions are elided from this excerpt. */
1627 #define MMABS_SUM_8x8_NOSPILL\
1628 MMABS(%%xmm0, %%xmm8)\
1629 MMABS(%%xmm1, %%xmm9)\
1630 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1631 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1632 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1633 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1634 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1635 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1636 "paddusw %%xmm1, %%xmm0 \n\t"
/* x86_64: enough registers, no spill needed */
1639 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
/* 32-bit: park xmm7 at (%1) so it can serve as MMABS scratch */
1641 #define MMABS_SUM_8x8_SSE2\
1642 "movdqa %%xmm7, (%1) \n\t"\
1643 MMABS(%%xmm0, %%xmm7)\
1644 MMABS(%%xmm1, %%xmm7)\
1645 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1646 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1647 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1648 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1649 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1650 "movdqa (%1), %%xmm2 \n\t"\
1651 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1652 "paddusw %%xmm1, %%xmm0 \n\t"
/* LOAD4/STORE4: move four consecutive quadwords between registers and the
 * temp buffer at offset o(%1).  HSUM_*: horizontal unsigned-saturating sum
 * of the four words in a, result in the low word, moved to dst — one
 * variant per instruction-set level (shift/add, pshufw, SSE2 shuffles). */
1655 #define LOAD4(o, a, b, c, d)\
1656 "movq "#o"(%1), "#a" \n\t"\
1657 "movq "#o"+8(%1), "#b" \n\t"\
1658 "movq "#o"+16(%1), "#c" \n\t"\
1659 "movq "#o"+24(%1), "#d" \n\t"\
1661 #define STORE4(o, a, b, c, d)\
1662 "movq "#a", "#o"(%1) \n\t"\
1663 "movq "#b", "#o"+8(%1) \n\t"\
1664 "movq "#c", "#o"+16(%1) \n\t"\
1665 "movq "#d", "#o"+24(%1) \n\t"\
1667 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1668 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1669 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1670 #define HSUM_MMX(a, t, dst)\
1671 "movq "#a", "#t" \n\t"\
1672 "psrlq $32, "#a" \n\t"\
1673 "paddusw "#t", "#a" \n\t"\
1674 "movq "#a", "#t" \n\t"\
1675 "psrlq $16, "#a" \n\t"\
1676 "paddusw "#t", "#a" \n\t"\
1677 "movd "#a", "#dst" \n\t"\
1679 #define HSUM_MMX2(a, t, dst)\
1680 "pshufw $0x0E, "#a", "#t" \n\t"\
1681 "paddusw "#t", "#a" \n\t"\
1682 "pshufw $0x01, "#a", "#t" \n\t"\
1683 "paddusw "#t", "#a" \n\t"\
1684 "movd "#a", "#dst" \n\t"\
1686 #define HSUM_SSE2(a, t, dst)\
1687 "movhlps "#a", "#t" \n\t"\
1688 "paddusw "#t", "#a" \n\t"\
1689 "pshuflw $0x0E, "#a", "#t" \n\t"\
1690 "paddusw "#t", "#a" \n\t"\
1691 "pshuflw $0x01, "#a", "#t" \n\t"\
1692 "paddusw "#t", "#a" \n\t"\
1693 "movd "#a", "#dst" \n\t"\
/* Generator for hadamard8_diff_<cpu>: SATD of the 8x8 difference between
 * src1 and src2, done in two 4x8 halves (MMX has only 8 64-bit registers).
 * Each half: pixel diffs -> Hadamard -> transpose (TRANSPOSE4) -> Hadamard,
 * then the absolute coefficients are summed via MMABS/MMABS_SUM and HSUM,
 * with temp[] used as spill space (offsets 0/32/64/96).
 * NOTE(review): the asm() wrapper, HADAMARD48 invocation lines and the
 * function's closing lines are elided from this excerpt. */\
1695 #define HADAMARD8_DIFF_MMX(cpu) \
1696 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1697 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1702 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1707 "movq %%mm7, 96(%1) \n\t"\
1709 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1710 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1712 "movq 96(%1), %%mm7 \n\t"\
1713 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1714 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1720 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1725 "movq %%mm7, 96(%1) \n\t"\
1727 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1728 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1730 "movq 96(%1), %%mm7 \n\t"\
1731 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1732 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1733 "movq %%mm6, %%mm7 \n\t"\
1734 "movq %%mm0, %%mm6 \n\t"\
1736 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1739 "movq %%mm7, 64(%1) \n\t"\
1740 MMABS(%%mm0, %%mm7)\
1741 MMABS(%%mm1, %%mm7)\
1742 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1743 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1744 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1745 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1746 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1747 "movq 64(%1), %%mm2 \n\t"\
1748 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1749 "paddusw %%mm1, %%mm0 \n\t"\
1750 "movq %%mm0, 64(%1) \n\t"\
1752 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1753 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1756 "movq %%mm7, (%1) \n\t"\
1757 MMABS(%%mm0, %%mm7)\
1758 MMABS(%%mm1, %%mm7)\
1759 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1760 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1761 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1762 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1763 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1764 "movq (%1), %%mm2 \n\t"\
1765 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1766 "paddusw 64(%1), %%mm0 \n\t"\
1767 "paddusw %%mm1, %%mm0 \n\t"\
1769 HSUM(%%mm0, %%mm1, %0)\
1776 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* SSE2 generator for hadamard8_diff_<cpu>: the whole 8x8 block fits in the
 * eight xmm registers, so a single Hadamard -> TRANSPOSE8 -> Hadamard pass
 * suffices; note the register order in the second HADAMARD8 compensates for
 * the permutation TRANSPOSE8 leaves behind.
 * NOTE(review): the asm() wrapper, MMABS_SUM_8x8 invocation and #if/#undef
 * lines around the instantiations are elided from this excerpt. */
1778 #define HADAMARD8_DIFF_SSE2(cpu) \
1779 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1780 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1785 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1788 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1789 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1790 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1792 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1798 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* instantiate per CPU level by re-binding the MMABS/HSUM helper macros
 * before each expansion (the intervening #undef lines are elided here) */
1800 #define MMABS(a,z) MMABS_MMX(a,z)
1801 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1802 HADAMARD8_DIFF_MMX(mmx)
1806 #define MMABS(a,z) MMABS_MMX2(a,z)
1807 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1808 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1809 HADAMARD8_DIFF_MMX(mmx2)
1810 HADAMARD8_DIFF_SSE2(sse2)
1812 #undef MMABS_SUM_8x8
1816 #define MMABS(a,z) MMABS_SSSE3(a,z)
1817 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1818 HADAMARD8_DIFF_SSE2(ssse3)
1820 #undef MMABS_SUM_8x8
/* sum_abs_dctelem_<cpu>: sum of |coefficient| over a 64-element DCT block.
 * DCT_SAD4 loads four rows at offset o and accumulates their absolute
 * values into registers 0/1 via MMABS_SUM; DCT_SAD_MMX covers the block in
 * four 4-row strips of 8 bytes, DCT_SAD_SSE2 in two 4-row strips of 16. */
1823 #define DCT_SAD4(m,mm,o)\
1824 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1825 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1826 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1827 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1828 MMABS_SUM(mm##2, mm##6, mm##0)\
1829 MMABS_SUM(mm##3, mm##7, mm##1)\
1830 MMABS_SUM(mm##4, mm##6, mm##0)\
1831 MMABS_SUM(mm##5, mm##7, mm##1)\
1833 #define DCT_SAD_MMX\
1834 "pxor %%mm0, %%mm0 \n\t"\
1835 "pxor %%mm1, %%mm1 \n\t"\
1836 DCT_SAD4(q, %%mm, 0)\
1837 DCT_SAD4(q, %%mm, 8)\
1838 DCT_SAD4(q, %%mm, 64)\
1839 DCT_SAD4(q, %%mm, 72)\
1840 "paddusw %%mm1, %%mm0 \n\t"\
1841 HSUM(%%mm0, %%mm1, %0)
/* SSE2: 16-byte loads, so only two DCT_SAD4 strips are needed */
1843 #define DCT_SAD_SSE2\
1844 "pxor %%xmm0, %%xmm0 \n\t"\
1845 "pxor %%xmm1, %%xmm1 \n\t"\
1846 DCT_SAD4(dqa, %%xmm, 0)\
1847 DCT_SAD4(dqa, %%xmm, 64)\
1848 "paddusw %%xmm1, %%xmm0 \n\t"\
1849 HSUM(%%xmm0, %%xmm1, %0)
/* function-body generator; its asm() wrapper, the per-CPU #undef lines and
 * DCT_SAD_FUNC(...) invocations are elided from this excerpt */
1851 #define DCT_SAD_FUNC(cpu) \
1852 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1862 #define DCT_SAD DCT_SAD_MMX
1863 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1864 #define MMABS(a,z) MMABS_MMX(a,z)
1869 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1870 #define MMABS(a,z) MMABS_MMX2(a,z)
1875 #define DCT_SAD DCT_SAD_SSE2
1876 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1881 #define MMABS(a,z) MMABS_SSSE3(a,z)
/* Sum of squared differences between an int8 array and an int16 array:
 * 8 elements per iteration — sign-extend the bytes to words by duplicating
 * them into the high byte and arithmetic-shifting right by 8, subtract,
 * then square-and-accumulate pairs with pmaddwd into the mm4 dword pair.
 * NOTE(review): the asm() wrapper, loop control, the mm3 load feeding
 * punpckhbw, and the final movd/return are elided from this excerpt. */
1888 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
/* mm4 = dword accumulator */
1892 "pxor %%mm4, %%mm4 \n"
1895 "movq (%2,%0), %%mm2 \n"
1896 "movq (%3,%0,2), %%mm0 \n"
1897 "movq 8(%3,%0,2), %%mm1 \n"
/* duplicate the int8s into the high bytes, then psraw $8 sign-extends */
1898 "punpckhbw %%mm2, %%mm3 \n"
1899 "punpcklbw %%mm2, %%mm2 \n"
1900 "psraw $8, %%mm3 \n"
1901 "psraw $8, %%mm2 \n"
1902 "psubw %%mm3, %%mm1 \n"
1903 "psubw %%mm2, %%mm0 \n"
/* pmaddwd of a value with itself = sum of two squared words per dword */
1904 "pmaddwd %%mm1, %%mm1 \n"
1905 "pmaddwd %%mm0, %%mm0 \n"
1906 "paddd %%mm1, %%mm4 \n"
1907 "paddd %%mm0, %%mm4 \n"
/* fold the two partial dword sums into the low dword of mm4 */
1909 "movq %%mm4, %%mm3 \n"
1910 "psrlq $32, %%mm3 \n"
1911 "paddd %%mm3, %%mm4 \n"
1914 :"r"(pix1), "r"(pix2)
1919 #endif //CONFIG_ENCODERS
/* Aligned put needs no rounding tweak, so the no-rnd variants alias the
 * plain pixel-copy functions. */
1921 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1922 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
/* One output row of the MPEG-4 qpel vertical lowpass filter:
 * (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, where x1..x4 are sums of
 * symmetric tap pairs (m3+m4, in2+m5, in1+m6, in0+in7); the result is
 * packed to bytes and written via OP (put or avg).  Clobbers mm4-mm6. */
1924 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1925 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1926 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1927 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1928 "movq "#in7", " #m3 " \n\t" /* d */\
1929 "movq "#in0", %%mm5 \n\t" /* D */\
1930 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1931 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1932 "movq "#in1", %%mm5 \n\t" /* C */\
1933 "movq "#in2", %%mm6 \n\t" /* B */\
1934 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1935 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1936 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1937 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1938 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1939 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1940 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1941 "psraw $5, %%mm5 \n\t"\
1942 "packuswb %%mm5, %%mm5 \n\t"\
1943 OP(%%mm5, out, %%mm7, d)
/* QPEL_BASE: generates the MPEG-4 qpel horizontal lowpass filters for both
 * put and avg (OPNAME), parameterized on the rounder and the store op.
 * This first function is the 16-wide MMX2 version: for each row it computes
 * (20a - 6b + 3c - d + rnd) >> 5 per pixel, where a..d are symmetric tap
 * sums built with shifts/unpacks/pshufw; edge pixels are mirrored via the
 * pshufw immediates.  Processes the row in four 4-pixel groups, spilling
 * one intermediate group to the temp operand (%5).
 * NOTE(review): the asm() opening, loop label/decrement and the function's
 * closing lines are elided from this excerpt. */
1945 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1946 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1950 "pxor %%mm7, %%mm7 \n\t"\
1952 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1953 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1954 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1955 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1956 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1957 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1958 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1959 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1960 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1961 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1962 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1963 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1964 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1965 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1966 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1967 "paddw %%mm3, %%mm5 \n\t" /* b */\
1968 "paddw %%mm2, %%mm6 \n\t" /* c */\
1969 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1970 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1971 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1972 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1973 "paddw %%mm4, %%mm0 \n\t" /* a */\
1974 "paddw %%mm1, %%mm5 \n\t" /* d */\
1975 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1976 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1977 "paddw %6, %%mm6 \n\t"\
1978 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1979 "psraw $5, %%mm0 \n\t"\
1980 "movq %%mm0, %5 \n\t"\
1981 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1983 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1984 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1985 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1986 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1987 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1988 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1989 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1990 "paddw %%mm0, %%mm2 \n\t" /* b */\
1991 "paddw %%mm5, %%mm3 \n\t" /* c */\
1992 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1993 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1994 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1995 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1996 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1997 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1998 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1999 "paddw %%mm2, %%mm1 \n\t" /* a */\
2000 "paddw %%mm6, %%mm4 \n\t" /* d */\
2001 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2002 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
2003 "paddw %6, %%mm1 \n\t"\
2004 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
2005 "psraw $5, %%mm3 \n\t"\
2006 "movq %5, %%mm1 \n\t"\
2007 "packuswb %%mm3, %%mm1 \n\t"\
2008 OP_MMX2(%%mm1, (%1),%%mm4, q)\
2009 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
2011 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
2012 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
2013 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
2014 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2015 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2016 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2017 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2018 "paddw %%mm1, %%mm5 \n\t" /* b */\
2019 "paddw %%mm4, %%mm0 \n\t" /* c */\
2020 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2021 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2022 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2023 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2024 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2025 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2026 "paddw %%mm3, %%mm2 \n\t" /* d */\
2027 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2028 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2029 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2030 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2031 "paddw %%mm2, %%mm6 \n\t" /* a */\
2032 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2033 "paddw %6, %%mm0 \n\t"\
2034 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2035 "psraw $5, %%mm0 \n\t"\
2036 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2038 "paddw %%mm5, %%mm3 \n\t" /* a */\
2039 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2040 "paddw %%mm4, %%mm6 \n\t" /* b */\
2041 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2042 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2043 "paddw %%mm1, %%mm4 \n\t" /* c */\
2044 "paddw %%mm2, %%mm5 \n\t" /* d */\
2045 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2046 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2047 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2048 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2049 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2050 "paddw %6, %%mm4 \n\t"\
2051 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2052 "psraw $5, %%mm4 \n\t"\
2053 "packuswb %%mm4, %%mm0 \n\t"\
2054 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2060 : "+a"(src), "+c"(dst), "+m"(h)\
2061 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
/* 3DNow! fallback (no pshufw): filter taps computed per pixel in scalar C
 * into temp[], MMX only adds the rounder, shifts and packs.  The */\
/* asymmetric src[16] terms in the last rows implement edge mirroring. */\
2066 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2069 /* quick HACK, XXX FIXME MUST be optimized */\
2072 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2073 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2074 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2075 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2076 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2077 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2078 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2079 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2080 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2081 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2082 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2083 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2084 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2085 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2086 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2087 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2089 "movq (%0), %%mm0 \n\t"\
2090 "movq 8(%0), %%mm1 \n\t"\
2091 "paddw %2, %%mm0 \n\t"\
2092 "paddw %2, %%mm1 \n\t"\
2093 "psraw $5, %%mm0 \n\t"\
2094 "psraw $5, %%mm1 \n\t"\
2095 "packuswb %%mm1, %%mm0 \n\t"\
2096 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2097 "movq 16(%0), %%mm0 \n\t"\
2098 "movq 24(%0), %%mm1 \n\t"\
2099 "paddw %2, %%mm0 \n\t"\
2100 "paddw %2, %%mm1 \n\t"\
2101 "psraw $5, %%mm0 \n\t"\
2102 "psraw $5, %%mm1 \n\t"\
2103 "packuswb %%mm1, %%mm0 \n\t"\
2104 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2105 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/* 8-wide MMX2 horizontal lowpass: same (20a-6b+3c-d+rnd)>>5 filter as the */\
/* 16-wide version, but only two 4-pixel groups per row and the right edge */\
/* handled directly with pshufw mirroring of the FGHI tail. */\
2113 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2117 "pxor %%mm7, %%mm7 \n\t"\
2119 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2120 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2121 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2122 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2123 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2124 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2125 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2126 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2127 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2128 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2129 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2130 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2131 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2132 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2133 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2134 "paddw %%mm3, %%mm5 \n\t" /* b */\
2135 "paddw %%mm2, %%mm6 \n\t" /* c */\
2136 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2137 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2138 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2139 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2140 "paddw %%mm4, %%mm0 \n\t" /* a */\
2141 "paddw %%mm1, %%mm5 \n\t" /* d */\
2142 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2143 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2144 "paddw %6, %%mm6 \n\t"\
2145 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2146 "psraw $5, %%mm0 \n\t"\
2147 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2149 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2150 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2151 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2152 "paddw %%mm5, %%mm1 \n\t" /* a */\
2153 "paddw %%mm6, %%mm2 \n\t" /* b */\
2154 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2155 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2156 "paddw %%mm6, %%mm3 \n\t" /* c */\
2157 "paddw %%mm5, %%mm4 \n\t" /* d */\
2158 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2159 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2160 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2161 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2162 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2163 "paddw %6, %%mm1 \n\t"\
2164 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2165 "psraw $5, %%mm3 \n\t"\
2166 "packuswb %%mm3, %%mm0 \n\t"\
2167 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2173 : "+a"(src), "+c"(dst), "+m"(h)\
2174 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
/* 8-wide 3DNow! fallback: scalar C computes the taps into temp[] (edge */\
/* mirroring via the repeated src[8]/src[7]... terms), MMX rounds/packs. */\
2179 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2182 /* quick HACK, XXX FIXME MUST be optimized */\
2185 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2186 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2187 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2188 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2189 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2190 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2191 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2192 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2194 "movq (%0), %%mm0 \n\t"\
2195 "movq 8(%0), %%mm1 \n\t"\
2196 "paddw %2, %%mm0 \n\t"\
2197 "paddw %2, %%mm1 \n\t"\
2198 "psraw $5, %%mm0 \n\t"\
2199 "psraw $5, %%mm1 \n\t"\
2200 "packuswb %%mm1, %%mm0 \n\t"\
2201 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2202 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2210 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2212 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2213 uint64_t temp[17*4];\
2214 uint64_t *temp_ptr= temp;\
2219 "pxor %%mm7, %%mm7 \n\t"\
2221 "movq (%0), %%mm0 \n\t"\
2222 "movq (%0), %%mm1 \n\t"\
2223 "movq 8(%0), %%mm2 \n\t"\
2224 "movq 8(%0), %%mm3 \n\t"\
2225 "punpcklbw %%mm7, %%mm0 \n\t"\
2226 "punpckhbw %%mm7, %%mm1 \n\t"\
2227 "punpcklbw %%mm7, %%mm2 \n\t"\
2228 "punpckhbw %%mm7, %%mm3 \n\t"\
2229 "movq %%mm0, (%1) \n\t"\
2230 "movq %%mm1, 17*8(%1) \n\t"\
2231 "movq %%mm2, 2*17*8(%1) \n\t"\
2232 "movq %%mm3, 3*17*8(%1) \n\t"\
2237 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2238 : "r" ((long)srcStride)\
2245 /*FIXME reorder for speed */\
2247 /*"pxor %%mm7, %%mm7 \n\t"*/\
2249 "movq (%0), %%mm0 \n\t"\
2250 "movq 8(%0), %%mm1 \n\t"\
2251 "movq 16(%0), %%mm2 \n\t"\
2252 "movq 24(%0), %%mm3 \n\t"\
2253 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2254 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2256 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2258 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2260 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2261 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2263 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2264 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2266 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2267 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2269 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2270 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2272 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2274 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2276 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2277 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2279 "add $136, %0 \n\t"\
2284 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2285 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2290 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2291 uint64_t temp[9*2];\
2292 uint64_t *temp_ptr= temp;\
2297 "pxor %%mm7, %%mm7 \n\t"\
2299 "movq (%0), %%mm0 \n\t"\
2300 "movq (%0), %%mm1 \n\t"\
2301 "punpcklbw %%mm7, %%mm0 \n\t"\
2302 "punpckhbw %%mm7, %%mm1 \n\t"\
2303 "movq %%mm0, (%1) \n\t"\
2304 "movq %%mm1, 9*8(%1) \n\t"\
2309 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2310 : "r" ((long)srcStride)\
2317 /*FIXME reorder for speed */\
2319 /*"pxor %%mm7, %%mm7 \n\t"*/\
2321 "movq (%0), %%mm0 \n\t"\
2322 "movq 8(%0), %%mm1 \n\t"\
2323 "movq 16(%0), %%mm2 \n\t"\
2324 "movq 24(%0), %%mm3 \n\t"\
2325 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2326 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2328 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2330 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2332 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2334 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2336 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2337 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2344 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2345 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2350 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2351 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2354 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2356 uint8_t * const half= (uint8_t*)temp;\
2357 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2358 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2361 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2362 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2365 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2367 uint8_t * const half= (uint8_t*)temp;\
2368 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2369 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2372 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2374 uint8_t * const half= (uint8_t*)temp;\
2375 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2376 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2379 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2380 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2383 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2385 uint8_t * const half= (uint8_t*)temp;\
2386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2387 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2389 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2390 uint64_t half[8 + 9];\
2391 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2392 uint8_t * const halfHV= ((uint8_t*)half);\
2393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2394 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2396 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2398 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2399 uint64_t half[8 + 9];\
2400 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2401 uint8_t * const halfHV= ((uint8_t*)half);\
2402 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2403 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2404 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2405 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2407 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2408 uint64_t half[8 + 9];\
2409 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2410 uint8_t * const halfHV= ((uint8_t*)half);\
2411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2412 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2414 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2416 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2417 uint64_t half[8 + 9];\
2418 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2419 uint8_t * const halfHV= ((uint8_t*)half);\
2420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2421 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2422 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2423 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2425 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2426 uint64_t half[8 + 9];\
2427 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2428 uint8_t * const halfHV= ((uint8_t*)half);\
2429 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2430 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2431 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2433 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2434 uint64_t half[8 + 9];\
2435 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2436 uint8_t * const halfHV= ((uint8_t*)half);\
2437 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2438 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2439 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2441 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2442 uint64_t half[8 + 9];\
2443 uint8_t * const halfH= ((uint8_t*)half);\
2444 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2445 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2446 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2448 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2449 uint64_t half[8 + 9];\
2450 uint8_t * const halfH= ((uint8_t*)half);\
2451 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2452 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2453 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2455 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2457 uint8_t * const halfH= ((uint8_t*)half);\
2458 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2459 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2461 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2462 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2465 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2467 uint8_t * const half= (uint8_t*)temp;\
2468 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2469 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2472 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2473 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2476 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2478 uint8_t * const half= (uint8_t*)temp;\
2479 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2480 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2483 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2485 uint8_t * const half= (uint8_t*)temp;\
2486 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2487 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2490 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2491 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2494 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2496 uint8_t * const half= (uint8_t*)temp;\
2497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2498 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2500 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2501 uint64_t half[16*2 + 17*2];\
2502 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2503 uint8_t * const halfHV= ((uint8_t*)half);\
2504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2505 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2509 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2510 uint64_t half[16*2 + 17*2];\
2511 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2512 uint8_t * const halfHV= ((uint8_t*)half);\
2513 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2514 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2515 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2516 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2518 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2519 uint64_t half[16*2 + 17*2];\
2520 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2521 uint8_t * const halfHV= ((uint8_t*)half);\
2522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2523 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2525 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2527 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2528 uint64_t half[16*2 + 17*2];\
2529 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2530 uint8_t * const halfHV= ((uint8_t*)half);\
2531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2532 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2533 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2534 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2536 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2537 uint64_t half[16*2 + 17*2];\
2538 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2539 uint8_t * const halfHV= ((uint8_t*)half);\
2540 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2541 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2542 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2544 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2545 uint64_t half[16*2 + 17*2];\
2546 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2547 uint8_t * const halfHV= ((uint8_t*)half);\
2548 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2549 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2550 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2552 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2553 uint64_t half[17*2];\
2554 uint8_t * const halfH= ((uint8_t*)half);\
2555 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2556 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2557 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2559 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2560 uint64_t half[17*2];\
2561 uint8_t * const halfH= ((uint8_t*)half);\
2562 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2563 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2564 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2566 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2567 uint64_t half[17*2];\
2568 uint8_t * const halfH= ((uint8_t*)half);\
2569 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2570 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store-op used by the qpel templates: PUT simply writes register a to b. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Store-op used by the qpel templates: AVG loads the destination into temp,
 * averages with rounding using the 3DNow! pavgusb instruction, and stores
 * the result back. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* Same as AVG_3DNOW_OP but using the MMXEXT/SSE pavgb instruction. */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* Instantiate the MPEG-4 quarter-pel primitives: QPEL_BASE emits the shared
 * horizontal/vertical lowpass helpers (rounding and no-rounding variants),
 * QPEL_OP emits the per-position motion-compensation functions for each CPU
 * flavor (3DNow! and MMX2).  ff_pw_16/ff_pw_15 select the rounder constant. */
QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2593 /***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
2596 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2597 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2598 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2600 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2601 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2602 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2605 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2606 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2607 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2608 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2609 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2610 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2611 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2612 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2613 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2614 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2615 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2616 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2618 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2619 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2621 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2622 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2623 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2624 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2625 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2626 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the fast bilinear (2-tap) qpel functions for 16x16 and 8x8
 * blocks, in MMX2 and 3DNow! flavors. */
QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_, 8, mmx2)
QPEL_2TAP(avg_, 8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_, 8, 3dnow)
QPEL_2TAP(avg_, 8, 3dnow)
/* Do-nothing stub, handy as a placeholder function-pointer target when
 * benchmarking/debugging.  Declared with a (void) prototype: the original
 * empty parentheses form is an obsolescent K&R non-prototype declarator. */
static void just_return(void) { return; }
/* Wire the put / put_no_rnd / avg variants of one qpel slot in the
 * DSPContext to the corresponding generated implementations. */
#define SET_QPEL_FUNC(postfix1, postfix2) \
c->put_ ## postfix1 = put_ ## postfix2;\
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
c->avg_ ## postfix1 = avg_ ## postfix2;
2649 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2650 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2652 const int ix = ox>>(16+shift);
2653 const int iy = oy>>(16+shift);
2654 const int oxs = ox>>4;
2655 const int oys = oy>>4;
2656 const int dxxs = dxx>>4;
2657 const int dxys = dxy>>4;
2658 const int dyxs = dyx>>4;
2659 const int dyys = dyy>>4;
2660 const uint16_t r4[4] = {r,r,r,r};
2661 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2662 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2663 const uint64_t shift2 = 2*shift;
2664 uint8_t edge_buf[(h+1)*stride];
2667 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2668 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2669 const int dxh = dxy*(h-1);
2670 const int dyw = dyx*(w-1);
2671 if( // non-constant fullpel offset (3% of blocks)
2672 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
2673 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
2674 // uses more than 16 bits of subpel mv (only at huge resolution)
2675 || (dxx|dxy|dyx|dyy)&15 )
2677 //FIXME could still use mmx for some of the rows
2678 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2682 src += ix + iy*stride;
2683 if( (unsigned)ix >= width-w ||
2684 (unsigned)iy >= height-h )
2686 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
2691 "movd %0, %%mm6 \n\t"
2692 "pxor %%mm7, %%mm7 \n\t"
2693 "punpcklwd %%mm6, %%mm6 \n\t"
2694 "punpcklwd %%mm6, %%mm6 \n\t"
2698 for(x=0; x<w; x+=4){
2699 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2700 oxs - dxys + dxxs*(x+1),
2701 oxs - dxys + dxxs*(x+2),
2702 oxs - dxys + dxxs*(x+3) };
2703 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2704 oys - dyys + dyxs*(x+1),
2705 oys - dyys + dyxs*(x+2),
2706 oys - dyys + dyxs*(x+3) };
2710 "movq %0, %%mm4 \n\t"
2711 "movq %1, %%mm5 \n\t"
2712 "paddw %2, %%mm4 \n\t"
2713 "paddw %3, %%mm5 \n\t"
2714 "movq %%mm4, %0 \n\t"
2715 "movq %%mm5, %1 \n\t"
2716 "psrlw $12, %%mm4 \n\t"
2717 "psrlw $12, %%mm5 \n\t"
2718 : "+m"(*dx4), "+m"(*dy4)
2719 : "m"(*dxy4), "m"(*dyy4)
2723 "movq %%mm6, %%mm2 \n\t"
2724 "movq %%mm6, %%mm1 \n\t"
2725 "psubw %%mm4, %%mm2 \n\t"
2726 "psubw %%mm5, %%mm1 \n\t"
2727 "movq %%mm2, %%mm0 \n\t"
2728 "movq %%mm4, %%mm3 \n\t"
2729 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2730 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2731 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2732 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2734 "movd %4, %%mm5 \n\t"
2735 "movd %3, %%mm4 \n\t"
2736 "punpcklbw %%mm7, %%mm5 \n\t"
2737 "punpcklbw %%mm7, %%mm4 \n\t"
2738 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2739 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2741 "movd %2, %%mm5 \n\t"
2742 "movd %1, %%mm4 \n\t"
2743 "punpcklbw %%mm7, %%mm5 \n\t"
2744 "punpcklbw %%mm7, %%mm4 \n\t"
2745 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2746 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
2747 "paddw %5, %%mm1 \n\t"
2748 "paddw %%mm3, %%mm2 \n\t"
2749 "paddw %%mm1, %%mm0 \n\t"
2750 "paddw %%mm2, %%mm0 \n\t"
2752 "psrlw %6, %%mm0 \n\t"
2753 "packuswb %%mm0, %%mm0 \n\t"
2754 "movd %%mm0, %0 \n\t"
2756 : "=m"(dst[x+y*stride])
2757 : "m"(src[0]), "m"(src[1]),
2758 "m"(src[stride]), "m"(src[stride+1]),
2759 "m"(*r4), "m"(shift2)
2767 #ifdef CONFIG_ENCODERS
/* Horizontal add of the two 32-bit lanes of MMX register a into its low
 * lane, using t as scratch (the high lane of a is left as garbage). */
#define PHADDD(a, t)\
"movq "#a", "#t" \n\t"\
"psrlq $32, "#a" \n\t"\
"paddd "#t", "#a" \n\t"
2774 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2775 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2776 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
/* Rounded high-half multiply for the plain-MMX path: pmulhw plus a rounding
 * add of o and a final shift (the matching SCALE_OFFSET below compensates
 * for the extra >>1). */
#define PMULHRW(x, y, s, o)\
"pmulhw " #s ", "#x " \n\t"\
"pmulhw " #s ", "#y " \n\t"\
"paddw " #o ", "#x " \n\t"\
"paddw " #o ", "#y " \n\t"\
"psraw $1, "#x " \n\t"\
"psraw $1, "#y " \n\t"
/* Template configuration for dsputil_mmx_qns.h: plain-MMX flavor
 * (name suffix, rounder setup, and scale offset used by PMULHRW above). */
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1
2789 #include "dsputil_mmx_qns.h"
2796 #define DEF(x) x ## _3dnow
2798 #define SCALE_OFFSET 0
/* 3DNow! flavor: pmulhrw does the rounded high-half multiply natively,
 * so no extra add/shift is needed. */
#define PMULHRW(x, y, s, o)\
"pmulhrw " #s ", "#x " \n\t"\
"pmulhrw " #s ", "#y " \n\t"
2803 #include "dsputil_mmx_qns.h"
2812 #define DEF(x) x ## _ssse3
2814 #define SCALE_OFFSET -1
/* SSSE3 flavor of the horizontal 32-bit add; implemented with pshufw+paddd
 * because that is faster than phaddd on Core 2 (see inline note). */
#define PHADDD(a, t)\
"pshufw $0x0E, "#a", "#t" \n\t"\
"paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
/* SSSE3 flavor: pmulhrsw performs the rounded scaled multiply natively. */
#define PMULHRW(x, y, s, o)\
"pmulhrsw " #s ", "#x " \n\t"\
"pmulhrsw " #s ", "#y " \n\t"
2822 #include "dsputil_mmx_qns.h"
2831 #endif /* CONFIG_ENCODERS */
2833 #define PREFETCH(name, op) \
2834 static void name(void *mem, int stride, int h){\
2835 const uint8_t *p= mem;\
2837 asm volatile(#op" %0" :: "m"(*p));\
2841 PREFETCH(prefetch_mmx2, prefetcht0)
2842 PREFETCH(prefetch_3dnow, prefetch)
2845 #include "h264dsp_mmx.c"
2848 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
2850 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2851 put_pixels8_mmx(dst, src, stride, 8);
2853 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2854 avg_pixels8_mmx(dst, src, stride, 8);
2856 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2857 put_pixels16_mmx(dst, src, stride, 16);
2859 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2860 avg_pixels16_mmx(dst, src, stride, 16);
2864 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
2866 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2867 put_pixels8_mmx(dst, src, stride, 8);
2870 /* external functions, from idct_mmx.c */
2871 void ff_mmx_idct(DCTELEM *block);
2872 void ff_mmxext_idct(DCTELEM *block);
2874 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2877 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2879 ff_mmx_idct (block);
2880 put_pixels_clamped_mmx(block, dest, line_size);
2882 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2884 ff_mmx_idct (block);
2885 add_pixels_clamped_mmx(block, dest, line_size);
2887 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2889 ff_mmxext_idct (block);
2890 put_pixels_clamped_mmx(block, dest, line_size);
2892 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2894 ff_mmxext_idct (block);
2895 add_pixels_clamped_mmx(block, dest, line_size);
2898 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2900 ff_idct_xvid_mmx (block);
2901 put_pixels_clamped_mmx(block, dest, line_size);
2903 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2905 ff_idct_xvid_mmx (block);
2906 add_pixels_clamped_mmx(block, dest, line_size);
2908 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2910 ff_idct_xvid_mmx2 (block);
2911 put_pixels_clamped_mmx(block, dest, line_size);
2913 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2915 ff_idct_xvid_mmx2 (block);
2916 add_pixels_clamped_mmx(block, dest, line_size);
2919 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2922 asm volatile("pxor %%mm7, %%mm7":);
2923 for(i=0; i<blocksize; i+=2) {
2925 "movq %0, %%mm0 \n\t"
2926 "movq %1, %%mm1 \n\t"
2927 "movq %%mm0, %%mm2 \n\t"
2928 "movq %%mm1, %%mm3 \n\t"
2929 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2930 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2931 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2932 "pxor %%mm2, %%mm1 \n\t"
2933 "movq %%mm3, %%mm4 \n\t"
2934 "pand %%mm1, %%mm3 \n\t"
2935 "pandn %%mm1, %%mm4 \n\t"
2936 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2937 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2938 "movq %%mm3, %1 \n\t"
2939 "movq %%mm0, %0 \n\t"
2940 :"+m"(mag[i]), "+m"(ang[i])
2944 asm volatile("femms");
2946 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2951 "movaps %0, %%xmm5 \n\t"
2952 ::"m"(ff_pdw_80000000[0])
2954 for(i=0; i<blocksize; i+=4) {
2956 "movaps %0, %%xmm0 \n\t"
2957 "movaps %1, %%xmm1 \n\t"
2958 "xorps %%xmm2, %%xmm2 \n\t"
2959 "xorps %%xmm3, %%xmm3 \n\t"
2960 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2961 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2962 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2963 "xorps %%xmm2, %%xmm1 \n\t"
2964 "movaps %%xmm3, %%xmm4 \n\t"
2965 "andps %%xmm1, %%xmm3 \n\t"
2966 "andnps %%xmm1, %%xmm4 \n\t"
2967 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2968 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2969 "movaps %%xmm3, %1 \n\t"
2970 "movaps %%xmm0, %0 \n\t"
2971 :"+m"(mag[i]), "+m"(ang[i])
2977 #ifdef CONFIG_ENCODERS
2978 static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
2980 double c = 2.0 / (len-1.0);
2982 long i = -n2*sizeof(int32_t);
2983 long j = n2*sizeof(int32_t);
2985 "movsd %0, %%xmm7 \n\t"
2986 "movapd %1, %%xmm6 \n\t"
2987 "movapd %2, %%xmm5 \n\t"
2988 "movlhps %%xmm7, %%xmm7 \n\t"
2989 "subpd %%xmm5, %%xmm7 \n\t"
2990 "addsd %%xmm6, %%xmm7 \n\t"
2991 ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
2993 #define WELCH(MOVPD)\
2996 "movapd %%xmm7, %%xmm1 \n\t"\
2997 "mulpd %%xmm1, %%xmm1 \n\t"\
2998 "movapd %%xmm6, %%xmm0 \n\t"\
2999 "subpd %%xmm1, %%xmm0 \n\t"\
3000 "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
3001 "cvtpi2pd (%4,%0), %%xmm2 \n\t"\
3002 "cvtpi2pd (%5,%1), %%xmm3 \n\t"\
3003 "mulpd %%xmm0, %%xmm2 \n\t"\
3004 "mulpd %%xmm1, %%xmm3 \n\t"\
3005 "movapd %%xmm2, (%2,%0,2) \n\t"\
3006 MOVPD" %%xmm3, (%3,%1,2) \n\t"\
3007 "subpd %%xmm5, %%xmm7 \n\t"\
3011 :"+&r"(i), "+&r"(j)\
3012 :"r"(w_data+n2), "r"(w_data+len-2-n2),\
3013 "r"(data+n2), "r"(data+len-2-n2)\
3022 static void flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
3025 double tmp[len + lag + 2];
3026 double *data1 = tmp + lag;
3029 if((long)data1 & 15)
3032 apply_welch_window_sse2(data, len, data1);
3034 for(j=0; j<lag; j++)
3038 for(j=0; j<lag; j+=2){
3039 long i = -len*sizeof(double);
3042 "movsd %6, %%xmm0 \n\t"
3043 "movsd %6, %%xmm1 \n\t"
3044 "movsd %6, %%xmm2 \n\t"
3046 "movapd (%4,%0), %%xmm3 \n\t"
3047 "movupd -8(%5,%0), %%xmm4 \n\t"
3048 "movapd (%5,%0), %%xmm5 \n\t"
3049 "mulpd %%xmm3, %%xmm4 \n\t"
3050 "mulpd %%xmm3, %%xmm5 \n\t"
3051 "mulpd -16(%5,%0), %%xmm3 \n\t"
3052 "addpd %%xmm4, %%xmm1 \n\t"
3053 "addpd %%xmm5, %%xmm0 \n\t"
3054 "addpd %%xmm3, %%xmm2 \n\t"
3057 "movhlps %%xmm0, %%xmm3 \n\t"
3058 "movhlps %%xmm1, %%xmm4 \n\t"
3059 "movhlps %%xmm2, %%xmm5 \n\t"
3060 "addsd %%xmm3, %%xmm0 \n\t"
3061 "addsd %%xmm4, %%xmm1 \n\t"
3062 "addsd %%xmm5, %%xmm2 \n\t"
3063 "movsd %%xmm0, %1 \n\t"
3064 "movsd %%xmm1, %2 \n\t"
3065 "movsd %%xmm2, %3 \n\t"
3066 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
3067 :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
3071 "movsd %5, %%xmm0 \n\t"
3072 "movsd %5, %%xmm1 \n\t"
3074 "movapd (%3,%0), %%xmm3 \n\t"
3075 "movupd -8(%4,%0), %%xmm4 \n\t"
3076 "mulpd %%xmm3, %%xmm4 \n\t"
3077 "mulpd (%4,%0), %%xmm3 \n\t"
3078 "addpd %%xmm4, %%xmm1 \n\t"
3079 "addpd %%xmm3, %%xmm0 \n\t"
3082 "movhlps %%xmm0, %%xmm3 \n\t"
3083 "movhlps %%xmm1, %%xmm4 \n\t"
3084 "addsd %%xmm3, %%xmm0 \n\t"
3085 "addsd %%xmm4, %%xmm1 \n\t"
3086 "movsd %%xmm0, %1 \n\t"
3087 "movsd %%xmm1, %2 \n\t"
3088 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
3089 :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
3094 #endif // CONFIG_ENCODERS
3096 static void vector_fmul_3dnow(float *dst, const float *src, int len){
3100 "movq (%1,%0), %%mm0 \n\t"
3101 "movq 8(%1,%0), %%mm1 \n\t"
3102 "pfmul (%2,%0), %%mm0 \n\t"
3103 "pfmul 8(%2,%0), %%mm1 \n\t"
3104 "movq %%mm0, (%1,%0) \n\t"
3105 "movq %%mm1, 8(%1,%0) \n\t"
3114 static void vector_fmul_sse(float *dst, const float *src, int len){
3118 "movaps (%1,%0), %%xmm0 \n\t"
3119 "movaps 16(%1,%0), %%xmm1 \n\t"
3120 "mulps (%2,%0), %%xmm0 \n\t"
3121 "mulps 16(%2,%0), %%xmm1 \n\t"
3122 "movaps %%xmm0, (%1,%0) \n\t"
3123 "movaps %%xmm1, 16(%1,%0) \n\t"
3132 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
3136 "pswapd 8(%1), %%mm0 \n\t"
3137 "pswapd (%1), %%mm1 \n\t"
3138 "pfmul (%3,%0), %%mm0 \n\t"
3139 "pfmul 8(%3,%0), %%mm1 \n\t"
3140 "movq %%mm0, (%2,%0) \n\t"
3141 "movq %%mm1, 8(%2,%0) \n\t"
3145 :"+r"(i), "+r"(src1)
3146 :"r"(dst), "r"(src0)
3148 asm volatile("femms");
3150 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3154 "movaps 16(%1), %%xmm0 \n\t"
3155 "movaps (%1), %%xmm1 \n\t"
3156 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
3157 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3158 "mulps (%3,%0), %%xmm0 \n\t"
3159 "mulps 16(%3,%0), %%xmm1 \n\t"
3160 "movaps %%xmm0, (%2,%0) \n\t"
3161 "movaps %%xmm1, 16(%2,%0) \n\t"
3165 :"+r"(i), "+r"(src1)
3166 :"r"(dst), "r"(src0)
3170 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
3171 const float *src2, int src3, int len, int step){
3173 if(step == 2 && src3 == 0){
3177 "movq (%2,%0), %%mm0 \n\t"
3178 "movq 8(%2,%0), %%mm1 \n\t"
3179 "pfmul (%3,%0), %%mm0 \n\t"
3180 "pfmul 8(%3,%0), %%mm1 \n\t"
3181 "pfadd (%4,%0), %%mm0 \n\t"
3182 "pfadd 8(%4,%0), %%mm1 \n\t"
3183 "movd %%mm0, (%1) \n\t"
3184 "movd %%mm1, 16(%1) \n\t"
3185 "psrlq $32, %%mm0 \n\t"
3186 "psrlq $32, %%mm1 \n\t"
3187 "movd %%mm0, 8(%1) \n\t"
3188 "movd %%mm1, 24(%1) \n\t"
3193 :"r"(src0), "r"(src1), "r"(src2)
3197 else if(step == 1 && src3 == 0){
3200 "movq (%2,%0), %%mm0 \n\t"
3201 "movq 8(%2,%0), %%mm1 \n\t"
3202 "pfmul (%3,%0), %%mm0 \n\t"
3203 "pfmul 8(%3,%0), %%mm1 \n\t"
3204 "pfadd (%4,%0), %%mm0 \n\t"
3205 "pfadd 8(%4,%0), %%mm1 \n\t"
3206 "movq %%mm0, (%1,%0) \n\t"
3207 "movq %%mm1, 8(%1,%0) \n\t"
3211 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3216 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3217 asm volatile("femms");
3219 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3220 const float *src2, int src3, int len, int step){
3222 if(step == 2 && src3 == 0){
3226 "movaps (%2,%0), %%xmm0 \n\t"
3227 "movaps 16(%2,%0), %%xmm1 \n\t"
3228 "mulps (%3,%0), %%xmm0 \n\t"
3229 "mulps 16(%3,%0), %%xmm1 \n\t"
3230 "addps (%4,%0), %%xmm0 \n\t"
3231 "addps 16(%4,%0), %%xmm1 \n\t"
3232 "movss %%xmm0, (%1) \n\t"
3233 "movss %%xmm1, 32(%1) \n\t"
3234 "movhlps %%xmm0, %%xmm2 \n\t"
3235 "movhlps %%xmm1, %%xmm3 \n\t"
3236 "movss %%xmm2, 16(%1) \n\t"
3237 "movss %%xmm3, 48(%1) \n\t"
3238 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
3239 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
3240 "movss %%xmm0, 8(%1) \n\t"
3241 "movss %%xmm1, 40(%1) \n\t"
3242 "movhlps %%xmm0, %%xmm2 \n\t"
3243 "movhlps %%xmm1, %%xmm3 \n\t"
3244 "movss %%xmm2, 24(%1) \n\t"
3245 "movss %%xmm3, 56(%1) \n\t"
3250 :"r"(src0), "r"(src1), "r"(src2)
3254 else if(step == 1 && src3 == 0){
3257 "movaps (%2,%0), %%xmm0 \n\t"
3258 "movaps 16(%2,%0), %%xmm1 \n\t"
3259 "mulps (%3,%0), %%xmm0 \n\t"
3260 "mulps 16(%3,%0), %%xmm1 \n\t"
3261 "addps (%4,%0), %%xmm0 \n\t"
3262 "addps 16(%4,%0), %%xmm1 \n\t"
3263 "movaps %%xmm0, (%1,%0) \n\t"
3264 "movaps %%xmm1, 16(%1,%0) \n\t"
3268 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
3273 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3276 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3277 // not bit-exact: pf2id uses different rounding than C and SSE
3279 for(i=0; i<len; i+=4) {
3281 "pf2id %1, %%mm0 \n\t"
3282 "pf2id %2, %%mm1 \n\t"
3283 "packssdw %%mm1, %%mm0 \n\t"
3284 "movq %%mm0, %0 \n\t"
3286 :"m"(src[i]), "m"(src[i+2])
3289 asm volatile("femms");
3291 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3293 for(i=0; i<len; i+=4) {
3295 "cvtps2pi %1, %%mm0 \n\t"
3296 "cvtps2pi %2, %%mm1 \n\t"
3297 "packssdw %%mm1, %%mm0 \n\t"
3298 "movq %%mm0, %0 \n\t"
3300 :"m"(src[i]), "m"(src[i+2])
3303 asm volatile("emms");
3306 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
3307 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
3308 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3309 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3310 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3311 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3312 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3313 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3315 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3317 mm_flags = mm_support();
3319 if (avctx->dsp_mask) {
3320 if (avctx->dsp_mask & FF_MM_FORCE)
3321 mm_flags |= (avctx->dsp_mask & 0xffff);
3323 mm_flags &= ~(avctx->dsp_mask & 0xffff);
3327 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3328 if (mm_flags & MM_MMX)
3329 av_log(avctx, AV_LOG_INFO, " mmx");
3330 if (mm_flags & MM_MMXEXT)
3331 av_log(avctx, AV_LOG_INFO, " mmxext");
3332 if (mm_flags & MM_3DNOW)
3333 av_log(avctx, AV_LOG_INFO, " 3dnow");
3334 if (mm_flags & MM_SSE)
3335 av_log(avctx, AV_LOG_INFO, " sse");
3336 if (mm_flags & MM_SSE2)
3337 av_log(avctx, AV_LOG_INFO, " sse2");
3338 av_log(avctx, AV_LOG_INFO, "\n");
3341 if (mm_flags & MM_MMX) {
3342 const int idct_algo= avctx->idct_algo;
3344 #ifdef CONFIG_ENCODERS
3345 const int dct_algo = avctx->dct_algo;
3346 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3347 if(mm_flags & MM_SSE2){
3348 c->fdct = ff_fdct_sse2;
3349 }else if(mm_flags & MM_MMXEXT){
3350 c->fdct = ff_fdct_mmx2;
3352 c->fdct = ff_fdct_mmx;
3355 #endif //CONFIG_ENCODERS
3356 if(avctx->lowres==0){
3357 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3358 c->idct_put= ff_simple_idct_put_mmx;
3359 c->idct_add= ff_simple_idct_add_mmx;
3360 c->idct = ff_simple_idct_mmx;
3361 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3363 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
3364 if(mm_flags & MM_MMXEXT){
3365 c->idct_put= ff_libmpeg2mmx2_idct_put;
3366 c->idct_add= ff_libmpeg2mmx2_idct_add;
3367 c->idct = ff_mmxext_idct;
3369 c->idct_put= ff_libmpeg2mmx_idct_put;
3370 c->idct_add= ff_libmpeg2mmx_idct_add;
3371 c->idct = ff_mmx_idct;
3373 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3375 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
3376 idct_algo==FF_IDCT_VP3 &&
3377 avctx->codec->id!=CODEC_ID_THEORA &&
3378 !(avctx->flags & CODEC_FLAG_BITEXACT)){
3379 if(mm_flags & MM_SSE2){
3380 c->idct_put= ff_vp3_idct_put_sse2;
3381 c->idct_add= ff_vp3_idct_add_sse2;
3382 c->idct = ff_vp3_idct_sse2;
3383 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3385 ff_vp3_dsp_init_mmx();
3386 c->idct_put= ff_vp3_idct_put_mmx;
3387 c->idct_add= ff_vp3_idct_add_mmx;
3388 c->idct = ff_vp3_idct_mmx;
3389 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
3391 }else if(idct_algo==FF_IDCT_CAVS){
3392 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3393 }else if(idct_algo==FF_IDCT_XVIDMMX){
3394 if(mm_flags & MM_MMXEXT){
3395 c->idct_put= ff_idct_xvid_mmx2_put;
3396 c->idct_add= ff_idct_xvid_mmx2_add;
3397 c->idct = ff_idct_xvid_mmx2;
3399 c->idct_put= ff_idct_xvid_mmx_put;
3400 c->idct_add= ff_idct_xvid_mmx_add;
3401 c->idct = ff_idct_xvid_mmx;
3406 #ifdef CONFIG_ENCODERS
3407 c->get_pixels = get_pixels_mmx;
3408 c->diff_pixels = diff_pixels_mmx;
3409 #endif //CONFIG_ENCODERS
3410 c->put_pixels_clamped = put_pixels_clamped_mmx;
3411 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
3412 c->add_pixels_clamped = add_pixels_clamped_mmx;
3413 c->clear_blocks = clear_blocks_mmx;
3414 #ifdef CONFIG_ENCODERS
3415 c->pix_sum = pix_sum16_mmx;
3416 #endif //CONFIG_ENCODERS
3418 c->put_pixels_tab[0][0] = put_pixels16_mmx;
3419 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
3420 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
3421 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
3423 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
3424 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
3425 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
3426 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
3428 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
3429 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
3430 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
3431 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
3433 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
3434 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
3435 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
3436 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
3438 c->put_pixels_tab[1][0] = put_pixels8_mmx;
3439 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
3440 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
3441 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
3443 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
3444 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
3445 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
3446 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
3448 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
3449 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
3450 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
3451 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
3453 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
3454 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
3455 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
3456 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
3460 c->add_bytes= add_bytes_mmx;
3461 #ifdef CONFIG_ENCODERS
3462 c->diff_bytes= diff_bytes_mmx;
3463 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
3465 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
3466 c->hadamard8_diff[1]= hadamard8_diff_mmx;
3468 c->pix_norm1 = pix_norm1_mmx;
3469 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
3470 c->sse[1] = sse8_mmx;
3471 c->vsad[4]= vsad_intra16_mmx;
3473 c->nsse[0] = nsse16_mmx;
3474 c->nsse[1] = nsse8_mmx;
3475 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3476 c->vsad[0] = vsad16_mmx;
3479 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3480 c->try_8x8basis= try_8x8basis_mmx;
3482 c->add_8x8basis= add_8x8basis_mmx;
3484 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
3486 #endif //CONFIG_ENCODERS
3488 if (ENABLE_ANY_H263) {
3489 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
3490 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
3492 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
3493 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
3495 c->h264_idct_dc_add=
3496 c->h264_idct_add= ff_h264_idct_add_mmx;
3497 c->h264_idct8_dc_add=
3498 c->h264_idct8_add= ff_h264_idct8_add_mmx;
3500 if (mm_flags & MM_MMXEXT) {
3501 c->prefetch = prefetch_mmx2;
3503 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
3504 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
3506 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
3507 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
3508 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
3510 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
3511 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
3513 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
3514 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
3515 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
3517 #ifdef CONFIG_ENCODERS
3518 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
3519 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
3520 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
3521 c->vsad[4]= vsad_intra16_mmx2;
3522 #endif //CONFIG_ENCODERS
3524 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
3525 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
3527 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3528 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
3529 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
3530 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
3531 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
3532 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
3533 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3534 #ifdef CONFIG_ENCODERS
3535 c->vsad[0] = vsad16_mmx2;
3536 #endif //CONFIG_ENCODERS
3540 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
3541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
3542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
3543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
3544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
3545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
3546 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
3547 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
3548 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
3549 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
3550 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
3551 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
3552 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
3553 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
3554 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
3555 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
3556 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
3557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
3558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
3559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
3560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
3561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
3562 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
3563 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
3564 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
3565 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
3566 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
3567 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
3568 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
3569 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
3570 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
3571 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
3575 #define dspfunc(PFX, IDX, NUM) \
3576 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
3577 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
3578 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
3579 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
3580 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
3581 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
3582 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
3583 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
3584 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
3585 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
3586 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
3587 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
3588 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
3589 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
3590 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
3591 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
3593 dspfunc(put_h264_qpel, 0, 16);
3594 dspfunc(put_h264_qpel, 1, 8);
3595 dspfunc(put_h264_qpel, 2, 4);
3596 dspfunc(avg_h264_qpel, 0, 16);
3597 dspfunc(avg_h264_qpel, 1, 8);
3598 dspfunc(avg_h264_qpel, 2, 4);
3600 dspfunc(put_2tap_qpel, 0, 16);
3601 dspfunc(put_2tap_qpel, 1, 8);
3602 dspfunc(avg_2tap_qpel, 0, 16);
3603 dspfunc(avg_2tap_qpel, 1, 8);
3606 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
3607 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3608 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
3609 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
3610 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
3611 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
3612 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
3613 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
3614 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
3615 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3616 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
3618 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3619 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3620 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3621 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3622 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3623 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3624 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3625 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3627 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3628 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3629 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3630 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3631 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3632 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3633 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3634 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3636 if (ENABLE_CAVS_DECODER)
3637 ff_cavsdsp_init_mmx2(c, avctx);
3639 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
3640 ff_vc1dsp_init_mmx(c, avctx);
3642 #ifdef CONFIG_ENCODERS
3643 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3644 #endif //CONFIG_ENCODERS
3645 } else if (mm_flags & MM_3DNOW) {
3646 c->prefetch = prefetch_3dnow;
3648 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
3649 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
3651 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
3652 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
3653 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
3655 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
3656 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
3658 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
3659 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
3660 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
3662 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3663 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
3664 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
3665 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
3666 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
3667 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
3668 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
3671 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
3672 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
3673 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
3674 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
3675 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
3676 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
3677 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
3678 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
3679 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
3680 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
3681 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
3682 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
3683 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
3684 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
3685 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
3686 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
3687 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
3688 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
3689 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
3690 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
3691 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
3692 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
3693 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
3694 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
3695 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
3696 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
3697 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
3698 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
3699 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
3700 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
3701 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
3702 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
3704 #define dspfunc(PFX, IDX, NUM) \
3705 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
3706 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
3707 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
3708 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
3709 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
3710 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
3711 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
3712 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
3713 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
3714 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
3715 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
3716 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
3717 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
3718 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
3719 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
3720 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
3722 dspfunc(put_h264_qpel, 0, 16);
3723 dspfunc(put_h264_qpel, 1, 8);
3724 dspfunc(put_h264_qpel, 2, 4);
3725 dspfunc(avg_h264_qpel, 0, 16);
3726 dspfunc(avg_h264_qpel, 1, 8);
3727 dspfunc(avg_h264_qpel, 2, 4);
3729 dspfunc(put_2tap_qpel, 0, 16);
3730 dspfunc(put_2tap_qpel, 1, 8);
3731 dspfunc(avg_2tap_qpel, 0, 16);
3732 dspfunc(avg_2tap_qpel, 1, 8);
3734 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
3735 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
3738 #ifdef CONFIG_ENCODERS
3739 if(mm_flags & MM_SSE2){
3740 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3741 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3742 c->hadamard8_diff[1]= hadamard8_diff_sse2;
3743 c->flac_compute_autocorr = flac_compute_autocorr_sse2;
3747 if(mm_flags & MM_SSSE3){
3748 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3749 c->try_8x8basis= try_8x8basis_ssse3;
3751 c->add_8x8basis= add_8x8basis_ssse3;
3752 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
3753 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
3754 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
3759 #ifdef CONFIG_SNOW_DECODER
3760 if(mm_flags & MM_SSE2 & 0){
3761 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3763 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3765 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3768 if(mm_flags & MM_MMXEXT){
3769 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3771 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3774 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3778 if(mm_flags & MM_3DNOW){
3779 #ifdef CONFIG_ENCODERS
3780 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3781 c->try_8x8basis= try_8x8basis_3dnow;
3783 c->add_8x8basis= add_8x8basis_3dnow;
3784 #endif //CONFIG_ENCODERS
3785 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
3786 c->vector_fmul = vector_fmul_3dnow;
3787 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
3788 c->float_to_int16 = float_to_int16_3dnow;
3790 if(mm_flags & MM_3DNOWEXT)
3791 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
3792 if(mm_flags & MM_SSE){
3793 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3794 c->vector_fmul = vector_fmul_sse;
3795 c->float_to_int16 = float_to_int16_sse;
3796 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3797 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3799 if(mm_flags & MM_3DNOW)
3800 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
3803 #ifdef CONFIG_ENCODERS
3804 dsputil_init_pix_mmx(c, avctx);
3805 #endif //CONFIG_ENCODERS
3807 // for speed testing
3808 get_pixels = just_return;
3809 put_pixels_clamped = just_return;
3810 add_pixels_clamped = just_return;
3812 pix_abs16x16 = just_return;
3813 pix_abs16x16_x2 = just_return;
3814 pix_abs16x16_y2 = just_return;
3815 pix_abs16x16_xy2 = just_return;
3817 put_pixels_tab[0] = just_return;
3818 put_pixels_tab[1] = just_return;
3819 put_pixels_tab[2] = just_return;
3820 put_pixels_tab[3] = just_return;
3822 put_no_rnd_pixels_tab[0] = just_return;
3823 put_no_rnd_pixels_tab[1] = just_return;
3824 put_no_rnd_pixels_tab[2] = just_return;
3825 put_no_rnd_pixels_tab[3] = just_return;
3827 avg_pixels_tab[0] = just_return;
3828 avg_pixels_tab[1] = just_return;
3829 avg_pixels_tab[2] = just_return;
3830 avg_pixels_tab[3] = just_return;
3832 avg_no_rnd_pixels_tab[0] = just_return;
3833 avg_no_rnd_pixels_tab[1] = just_return;
3834 avg_no_rnd_pixels_tab[2] = just_return;
3835 avg_no_rnd_pixels_tab[3] = just_return;
3837 //av_fdct = just_return;
3838 //ff_idct = just_return;