2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
26 #include "simple_idct.h"
27 #include "mpegvideo.h"
30 #include "vp3dsp_mmx.h"
31 #include "vp3dsp_sse2.h"
/* XviD-style IDCT routines, defined elsewhere (extern asm/C). */
37 extern void ff_idct_xvid_mmx(short *block);
38 extern void ff_idct_xvid_mmx2(short *block);
/* Runtime multimedia-extension capability flags.
 * NOTE(review): initialisation is not visible in this excerpt — confirm
 * it is filled in by the CPU-detection code at init time. */
40 int mm_flags; /* multimedia extension flags */
42 /* pixel operations */
/* 8-byte (and one 16-byte) aligned constants used by the inline asm below:
 * pb = packed bytes, pw = packed 16-bit words, pdw = packed 32-bit dwords. */
43 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
44 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
45 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
47 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
50 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
51 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
52 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
53 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
54 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
55 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
56 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
57 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
58 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
60 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
61 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
62 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
63 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
64 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
65 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
66 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
/* Helper macros that load small constants into an MMX register without a
 * memory reference (pcmpeqd sets all bits, then shifts carve the pattern).
 * NOTE(review): MOVQ_BONE/MOVQ_WTWO are defined twice below — the selecting
 * #if/#else (memory-load vs. computed variant) is missing from this excerpt. */
68 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
69 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
/* regd := 0x0001000100010001 (word 1 in each lane) */
71 #define MOVQ_WONE(regd) \
73 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
74 "psrlw $15, %%" #regd ::)
/* regd := 0xFEFEFEFEFEFEFEFE (all-ones doubled per byte) */
76 #define MOVQ_BFE(regd) \
78 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
79 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-load variants of BONE/WTWO. */
82 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
83 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
85 // for shared libraries it is better to access constants this way
87 #define MOVQ_BONE(regd) \
89 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
90 "psrlw $15, %%" #regd " \n\t" \
91 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
93 #define MOVQ_WTWO(regd) \
95 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
96 "psrlw $15, %%" #regd " \n\t" \
97 "psllw $1, %%" #regd " \n\t"::)
101 // using regr as temporary and for the output result
102 // first argument is unmodified and second is trashed
103 // regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise averaging built from plain MMX ops (pre-MMX2 has no pavgb).
 * _NO_RND truncates: (a & b) + (((a ^ b) & 0xfe) >> 1).
 * The rounding variant uses (a | b) - (((a ^ b) & 0xfe) >> 1). */
104 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
105 "movq " #rega ", " #regr " \n\t"\
106 "pand " #regb ", " #regr " \n\t"\
107 "pxor " #rega ", " #regb " \n\t"\
108 "pand " #regfe "," #regb " \n\t"\
109 "psrlq $1, " #regb " \n\t"\
110 "paddb " #regb ", " #regr " \n\t"
/* Rounding average of rega/regb into regr; regb is trashed. */
112 #define PAVGB_MMX(rega, regb, regr, regfe) \
113 "movq " #rega ", " #regr " \n\t"\
114 "por " #regb ", " #regr " \n\t"\
115 "pxor " #rega ", " #regb " \n\t"\
116 "pand " #regfe "," #regb " \n\t"\
117 "psrlq $1, " #regb " \n\t"\
118 "psubb " #regb ", " #regr " \n\t"
120 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired (two-average) variants: (rega,regb)->regr and (regc,regd)->regp. */
121 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
122 "movq " #rega ", " #regr " \n\t"\
123 "movq " #regc ", " #regp " \n\t"\
124 "pand " #regb ", " #regr " \n\t"\
125 "pand " #regd ", " #regp " \n\t"\
126 "pxor " #rega ", " #regb " \n\t"\
127 "pxor " #regc ", " #regd " \n\t"\
128 "pand %%mm6, " #regb " \n\t"\
129 "pand %%mm6, " #regd " \n\t"\
130 "psrlq $1, " #regb " \n\t"\
131 "psrlq $1, " #regd " \n\t"\
132 "paddb " #regb ", " #regr " \n\t"\
133 "paddb " #regd ", " #regp " \n\t"
135 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
136 "movq " #rega ", " #regr " \n\t"\
137 "movq " #regc ", " #regp " \n\t"\
138 "por " #regb ", " #regr " \n\t"\
139 "por " #regd ", " #regp " \n\t"\
140 "pxor " #rega ", " #regb " \n\t"\
141 "pxor " #regc ", " #regd " \n\t"\
142 "pand %%mm6, " #regb " \n\t"\
143 "pand %%mm6, " #regd " \n\t"\
144 "psrlq $1, " #regd " \n\t"\
145 "psrlq $1, " #regb " \n\t"\
146 "psubb " #regb ", " #regr " \n\t"\
147 "psubb " #regd ", " #regp " \n\t"
/* Template instantiation: the headers below are included repeatedly with
 * different DEF/SET_RND/PAVGB macros to stamp out _no_rnd_/_mmx, _mmx,
 * _3dnow and _mmx2 pixel-op variants.
 * NOTE(review): the matching #undef runs between sections are missing
 * from this excerpt. */
149 /***********************************/
150 /* MMX no rounding */
151 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
152 #define SET_RND MOVQ_WONE
153 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
154 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
156 #include "dsputil_mmx_rnd.h"
162 /***********************************/
/* MMX rounding variants. */
165 #define DEF(x, y) x ## _ ## y ##_mmx
166 #define SET_RND MOVQ_WTWO
167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
168 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
170 #include "dsputil_mmx_rnd.h"
177 /***********************************/
/* 3DNow! variants use the hardware byte-average instruction. */
180 #define DEF(x) x ## _3dnow
181 #define PAVGB "pavgusb"
183 #include "dsputil_mmx_avg.h"
188 /***********************************/
191 #define DEF(x) x ## _mmx2
193 /* Introduced only in MMX2 set */
194 #define PAVGB "pavgb"
196 #include "dsputil_mmx_avg.h"
/* Interleave halves of registers a and b into a and t ("butterfly" step);
 * n = element size suffix (wd/dq), m = mov suffix (q for MMX). */
201 #define SBUTTERFLY(a,b,t,n,m)\
202 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
203 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
204 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
/* 4x4 16-bit transpose built from three butterfly passes; t is scratch. */
206 #define TRANSPOSE4(a,b,c,d,t)\
207 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
208 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
209 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
210 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
212 /***********************************/
215 #ifdef CONFIG_ENCODERS
/* Read an 8x8 block of 8-bit pixels and widen them (zero-extend via
 * punpck*bw with cleared mm7) into 16-bit DCTELEMs at 'block'.
 * Processes two pixel rows per iteration, four 16-bit quads per row. */
216 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
219 "mov $-128, %%"REG_a" \n\t"
220 "pxor %%mm7, %%mm7 \n\t"
223 "movq (%0), %%mm0 \n\t"
224 "movq (%0, %2), %%mm2 \n\t"
225 "movq %%mm0, %%mm1 \n\t"
226 "movq %%mm2, %%mm3 \n\t"
227 "punpcklbw %%mm7, %%mm0 \n\t"
228 "punpckhbw %%mm7, %%mm1 \n\t"
229 "punpcklbw %%mm7, %%mm2 \n\t"
230 "punpckhbw %%mm7, %%mm3 \n\t"
/* Store relative to block+64 with a negative index counting up to 0. */
231 "movq %%mm0, (%1, %%"REG_a") \n\t"
232 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
233 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
234 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
236 "add $32, %%"REG_a" \n\t"
239 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
/* Compute the 16-bit difference s1 - s2 of two 8x8 pixel blocks into
 * 'block' (one row of eight pixels per iteration). */
244 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
247 "pxor %%mm7, %%mm7 \n\t"
248 "mov $-128, %%"REG_a" \n\t"
251 "movq (%0), %%mm0 \n\t"
252 "movq (%1), %%mm2 \n\t"
253 "movq %%mm0, %%mm1 \n\t"
254 "movq %%mm2, %%mm3 \n\t"
255 "punpcklbw %%mm7, %%mm0 \n\t"
256 "punpckhbw %%mm7, %%mm1 \n\t"
257 "punpcklbw %%mm7, %%mm2 \n\t"
258 "punpckhbw %%mm7, %%mm3 \n\t"
259 "psubw %%mm2, %%mm0 \n\t"
260 "psubw %%mm3, %%mm1 \n\t"
261 "movq %%mm0, (%2, %%"REG_a") \n\t"
262 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
265 "add $16, %%"REG_a" \n\t"
267 : "+r" (s1), "+r" (s2)
268 : "r" (block+64), "r" ((long)stride)
272 #endif //CONFIG_ENCODERS
/* Convert 64 16-bit DCTELEMs to bytes with unsigned saturation (packuswb)
 * and store them as an 8x8 pixel block, four rows per asm statement.
 * NOTE(review): the declarations of 'pix' and 'p' are missing from this
 * excerpt — presumably pix = pixels and p walks the block. */
274 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
279 /* read the pixels */
284 "movq %3, %%mm0 \n\t"
285 "movq 8%3, %%mm1 \n\t"
286 "movq 16%3, %%mm2 \n\t"
287 "movq 24%3, %%mm3 \n\t"
288 "movq 32%3, %%mm4 \n\t"
289 "movq 40%3, %%mm5 \n\t"
290 "movq 48%3, %%mm6 \n\t"
291 "movq 56%3, %%mm7 \n\t"
292 "packuswb %%mm1, %%mm0 \n\t"
293 "packuswb %%mm3, %%mm2 \n\t"
294 "packuswb %%mm5, %%mm4 \n\t"
295 "packuswb %%mm7, %%mm6 \n\t"
296 "movq %%mm0, (%0) \n\t"
297 "movq %%mm2, (%0, %1) \n\t"
298 "movq %%mm4, (%0, %1, 2) \n\t"
299 "movq %%mm6, (%0, %2) \n\t"
300 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
305 // if here would be an exact copy of the code above
306 // compiler would generate some very strange code
/* Second half: same conversion for rows 4-7, with %3 as a register operand. */
309 "movq (%3), %%mm0 \n\t"
310 "movq 8(%3), %%mm1 \n\t"
311 "movq 16(%3), %%mm2 \n\t"
312 "movq 24(%3), %%mm3 \n\t"
313 "movq 32(%3), %%mm4 \n\t"
314 "movq 40(%3), %%mm5 \n\t"
315 "movq 48(%3), %%mm6 \n\t"
316 "movq 56(%3), %%mm7 \n\t"
317 "packuswb %%mm1, %%mm0 \n\t"
318 "packuswb %%mm3, %%mm2 \n\t"
319 "packuswb %%mm5, %%mm4 \n\t"
320 "packuswb %%mm7, %%mm6 \n\t"
321 "movq %%mm0, (%0) \n\t"
322 "movq %%mm2, (%0, %1) \n\t"
323 "movq %%mm4, (%0, %1, 2) \n\t"
324 "movq %%mm6, (%0, %2) \n\t"
325 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
/* 0x80 in every byte: bias used to map signed bytes to unsigned pixels. */
329 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
330 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
/* Pack 16-bit DCTELEMs to signed bytes (packsswb), one row of 8 per
 * iteration, and store them as pixels.  mm1 holds the 0x80 bias vector.
 * NOTE(review): the bias add and pointer advances inside the loop are
 * missing from this excerpt. */
332 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
336 movq_m2r(*vector128, mm1);
337 for (i = 0; i < 8; i++) {
338 movq_m2r(*(block), mm0);
339 packsswb_m2r(*(block + 4), mm0);
342 movq_r2m(mm0, *pixels);
/* Add 16-bit DCTELEMs to existing pixels with saturation: widen two pixel
 * rows, paddsw the coefficients, then packuswb back to bytes in place. */
347 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
353 /* read the pixels */
360 "movq (%2), %%mm0 \n\t"
361 "movq 8(%2), %%mm1 \n\t"
362 "movq 16(%2), %%mm2 \n\t"
363 "movq 24(%2), %%mm3 \n\t"
364 "movq %0, %%mm4 \n\t"
365 "movq %1, %%mm6 \n\t"
366 "movq %%mm4, %%mm5 \n\t"
367 "punpcklbw %%mm7, %%mm4 \n\t"
368 "punpckhbw %%mm7, %%mm5 \n\t"
369 "paddsw %%mm4, %%mm0 \n\t"
370 "paddsw %%mm5, %%mm1 \n\t"
371 "movq %%mm6, %%mm5 \n\t"
372 "punpcklbw %%mm7, %%mm6 \n\t"
373 "punpckhbw %%mm7, %%mm5 \n\t"
374 "paddsw %%mm6, %%mm2 \n\t"
375 "paddsw %%mm5, %%mm3 \n\t"
376 "packuswb %%mm1, %%mm0 \n\t"
377 "packuswb %%mm3, %%mm2 \n\t"
378 "movq %%mm0, %0 \n\t"
379 "movq %%mm2, %1 \n\t"
380 :"+m"(*pix), "+m"(*(pix+line_size))
/* Copy a 4-pixel-wide block (movd = 4 bytes), four rows per iteration;
 * REG_a holds 2*line_size for the double-row pointer advance. */
388 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
391 "lea (%3, %3), %%"REG_a" \n\t"
394 "movd (%1), %%mm0 \n\t"
395 "movd (%1, %3), %%mm1 \n\t"
396 "movd %%mm0, (%2) \n\t"
397 "movd %%mm1, (%2, %3) \n\t"
398 "add %%"REG_a", %1 \n\t"
399 "add %%"REG_a", %2 \n\t"
400 "movd (%1), %%mm0 \n\t"
401 "movd (%1, %3), %%mm1 \n\t"
402 "movd %%mm0, (%2) \n\t"
403 "movd %%mm1, (%2, %3) \n\t"
404 "add %%"REG_a", %1 \n\t"
405 "add %%"REG_a", %2 \n\t"
408 : "+g"(h), "+r" (pixels), "+r" (block)
409 : "r"((long)line_size)
/* Copy an 8-pixel-wide block (movq = 8 bytes), four rows per iteration. */
414 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
417 "lea (%3, %3), %%"REG_a" \n\t"
420 "movq (%1), %%mm0 \n\t"
421 "movq (%1, %3), %%mm1 \n\t"
422 "movq %%mm0, (%2) \n\t"
423 "movq %%mm1, (%2, %3) \n\t"
424 "add %%"REG_a", %1 \n\t"
425 "add %%"REG_a", %2 \n\t"
426 "movq (%1), %%mm0 \n\t"
427 "movq (%1, %3), %%mm1 \n\t"
428 "movq %%mm0, (%2) \n\t"
429 "movq %%mm1, (%2, %3) \n\t"
430 "add %%"REG_a", %1 \n\t"
431 "add %%"REG_a", %2 \n\t"
434 : "+g"(h), "+r" (pixels), "+r" (block)
435 : "r"((long)line_size)
/* Copy a 16-pixel-wide block (two movq per row), four rows per iteration. */
440 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
443 "lea (%3, %3), %%"REG_a" \n\t"
446 "movq (%1), %%mm0 \n\t"
447 "movq 8(%1), %%mm4 \n\t"
448 "movq (%1, %3), %%mm1 \n\t"
449 "movq 8(%1, %3), %%mm5 \n\t"
450 "movq %%mm0, (%2) \n\t"
451 "movq %%mm4, 8(%2) \n\t"
452 "movq %%mm1, (%2, %3) \n\t"
453 "movq %%mm5, 8(%2, %3) \n\t"
454 "add %%"REG_a", %1 \n\t"
455 "add %%"REG_a", %2 \n\t"
456 "movq (%1), %%mm0 \n\t"
457 "movq 8(%1), %%mm4 \n\t"
458 "movq (%1, %3), %%mm1 \n\t"
459 "movq 8(%1, %3), %%mm5 \n\t"
460 "movq %%mm0, (%2) \n\t"
461 "movq %%mm4, 8(%2) \n\t"
462 "movq %%mm1, (%2, %3) \n\t"
463 "movq %%mm5, 8(%2, %3) \n\t"
464 "add %%"REG_a", %1 \n\t"
465 "add %%"REG_a", %2 \n\t"
468 : "+g"(h), "+r" (pixels), "+r" (block)
469 : "r"((long)line_size)
/* Zero six 64-entry DCTELEM blocks (6*128 bytes) with movq of a cleared
 * register, 32 bytes per iteration, negative index counting up to 0. */
474 static void clear_blocks_mmx(DCTELEM *blocks)
477 "pxor %%mm7, %%mm7 \n\t"
478 "mov $-128*6, %%"REG_a" \n\t"
480 "movq %%mm7, (%0, %%"REG_a") \n\t"
481 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
482 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
483 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
484 "add $32, %%"REG_a" \n\t"
486 : : "r" (((uint8_t *)blocks)+128*6)
491 #ifdef CONFIG_ENCODERS
/* Sum all pixels of a 16-wide block: widen to words, accumulate in mm6,
 * then fold the four word lanes down to a single 16-bit result.
 * NOTE(review): the declaration of 'h' (block height) is missing from
 * this excerpt. */
492 static int pix_sum16_mmx(uint8_t * pix, int line_size){
495 long index= -line_size*h;
498 "pxor %%mm7, %%mm7 \n\t"
499 "pxor %%mm6, %%mm6 \n\t"
501 "movq (%2, %1), %%mm0 \n\t"
502 "movq (%2, %1), %%mm1 \n\t"
503 "movq 8(%2, %1), %%mm2 \n\t"
504 "movq 8(%2, %1), %%mm3 \n\t"
505 "punpcklbw %%mm7, %%mm0 \n\t"
506 "punpckhbw %%mm7, %%mm1 \n\t"
507 "punpcklbw %%mm7, %%mm2 \n\t"
508 "punpckhbw %%mm7, %%mm3 \n\t"
509 "paddw %%mm0, %%mm1 \n\t"
510 "paddw %%mm2, %%mm3 \n\t"
511 "paddw %%mm1, %%mm3 \n\t"
512 "paddw %%mm3, %%mm6 \n\t"
/* Horizontal reduction: fold dwords then words, keep low 16 bits. */
515 "movq %%mm6, %%mm5 \n\t"
516 "psrlq $32, %%mm6 \n\t"
517 "paddw %%mm5, %%mm6 \n\t"
518 "movq %%mm6, %%mm5 \n\t"
519 "psrlq $16, %%mm6 \n\t"
520 "paddw %%mm5, %%mm6 \n\t"
521 "movd %%mm6, %0 \n\t"
522 "andl $0xFFFF, %0 \n\t"
523 : "=&r" (sum), "+r" (index)
524 : "r" (pix - index), "r" ((long)line_size)
529 #endif //CONFIG_ENCODERS
/* dst[i] += src[i] for w bytes: 16 bytes per MMX iteration (paddb wraps,
 * matching byte addition), remainder handled by the scalar tail loop.
 * NOTE(review): the loop headers are missing from this excerpt. */
531 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
535 "movq (%1, %0), %%mm0 \n\t"
536 "movq (%2, %0), %%mm1 \n\t"
537 "paddb %%mm0, %%mm1 \n\t"
538 "movq %%mm1, (%2, %0) \n\t"
539 "movq 8(%1, %0), %%mm0 \n\t"
540 "movq 8(%2, %0), %%mm1 \n\t"
541 "paddb %%mm0, %%mm1 \n\t"
542 "movq %%mm1, 8(%2, %0) \n\t"
547 : "r"(src), "r"(dst), "r"((long)w-15)
/* Scalar tail for the final (w % 16) bytes. */
550 dst[i+0] += src[i+0];
/* Core of the H.263 deblocking filter, shared by the horizontal and
 * vertical variants.  Operands: %0..%3 are the four 8-pixel rows around
 * the block edge, %4 = 2*strength, %5 = ff_pb_FC mask.  On exit mm3/mm4
 * hold the filtered middle rows and mm5/mm6 the filtered outer rows. */
553 #define H263_LOOP_FILTER \
554 "pxor %%mm7, %%mm7 \n\t"\
555 "movq %0, %%mm0 \n\t"\
556 "movq %0, %%mm1 \n\t"\
557 "movq %3, %%mm2 \n\t"\
558 "movq %3, %%mm3 \n\t"\
559 "punpcklbw %%mm7, %%mm0 \n\t"\
560 "punpckhbw %%mm7, %%mm1 \n\t"\
561 "punpcklbw %%mm7, %%mm2 \n\t"\
562 "punpckhbw %%mm7, %%mm3 \n\t"\
563 "psubw %%mm2, %%mm0 \n\t"\
564 "psubw %%mm3, %%mm1 \n\t"\
565 "movq %1, %%mm2 \n\t"\
566 "movq %1, %%mm3 \n\t"\
567 "movq %2, %%mm4 \n\t"\
568 "movq %2, %%mm5 \n\t"\
569 "punpcklbw %%mm7, %%mm2 \n\t"\
570 "punpckhbw %%mm7, %%mm3 \n\t"\
571 "punpcklbw %%mm7, %%mm4 \n\t"\
572 "punpckhbw %%mm7, %%mm5 \n\t"\
573 "psubw %%mm2, %%mm4 \n\t"\
574 "psubw %%mm3, %%mm5 \n\t"\
575 "psllw $2, %%mm4 \n\t"\
576 "psllw $2, %%mm5 \n\t"\
577 "paddw %%mm0, %%mm4 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "pxor %%mm6, %%mm6 \n\t"\
580 "pcmpgtw %%mm4, %%mm6 \n\t"\
581 "pcmpgtw %%mm5, %%mm7 \n\t"\
582 "pxor %%mm6, %%mm4 \n\t"\
583 "pxor %%mm7, %%mm5 \n\t"\
584 "psubw %%mm6, %%mm4 \n\t"\
585 "psubw %%mm7, %%mm5 \n\t"\
586 "psrlw $3, %%mm4 \n\t"\
587 "psrlw $3, %%mm5 \n\t"\
588 "packuswb %%mm5, %%mm4 \n\t"\
589 "packsswb %%mm7, %%mm6 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"\
591 "movd %4, %%mm2 \n\t"\
592 "punpcklbw %%mm2, %%mm2 \n\t"\
593 "punpcklbw %%mm2, %%mm2 \n\t"\
594 "punpcklbw %%mm2, %%mm2 \n\t"\
595 "psubusb %%mm4, %%mm2 \n\t"\
596 "movq %%mm2, %%mm3 \n\t"\
597 "psubusb %%mm4, %%mm3 \n\t"\
598 "psubb %%mm3, %%mm2 \n\t"\
599 "movq %1, %%mm3 \n\t"\
600 "movq %2, %%mm4 \n\t"\
601 "pxor %%mm6, %%mm3 \n\t"\
602 "pxor %%mm6, %%mm4 \n\t"\
603 "paddusb %%mm2, %%mm3 \n\t"\
604 "psubusb %%mm2, %%mm4 \n\t"\
605 "pxor %%mm6, %%mm3 \n\t"\
606 "pxor %%mm6, %%mm4 \n\t"\
607 "paddusb %%mm2, %%mm2 \n\t"\
608 "packsswb %%mm1, %%mm0 \n\t"\
609 "pcmpgtb %%mm0, %%mm7 \n\t"\
610 "pxor %%mm7, %%mm0 \n\t"\
611 "psubb %%mm7, %%mm0 \n\t"\
612 "movq %%mm0, %%mm1 \n\t"\
613 "psubusb %%mm2, %%mm0 \n\t"\
614 "psubb %%mm0, %%mm1 \n\t"\
615 "pand %5, %%mm1 \n\t"\
616 "psrlw $2, %%mm1 \n\t"\
617 "pxor %%mm7, %%mm1 \n\t"\
618 "psubb %%mm7, %%mm1 \n\t"\
619 "movq %0, %%mm5 \n\t"\
620 "movq %3, %%mm6 \n\t"\
621 "psubb %%mm1, %%mm5 \n\t"\
622 "paddb %%mm1, %%mm6 \n\t"
/* Vertical H.263 deblocking: filter the horizontal edge at 'src', i.e. the
 * rows src-2*stride .. src+1*stride, using H263_LOOP_FILTER (invocation
 * sampled out of this excerpt) and write the results back. */
624 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
625 if(ENABLE_ANY_H263) {
626 const int strength= ff_h263_loop_filter_strength[qscale];
632 "movq %%mm3, %1 \n\t"
633 "movq %%mm4, %2 \n\t"
634 "movq %%mm5, %0 \n\t"
635 "movq %%mm6, %3 \n\t"
636 : "+m" (*(uint64_t*)(src - 2*stride)),
637 "+m" (*(uint64_t*)(src - 1*stride)),
638 "+m" (*(uint64_t*)(src + 0*stride)),
639 "+m" (*(uint64_t*)(src + 1*stride))
640 : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose a 4x4 byte tile from src (src_stride) to dst (dst_stride)
 * using byte/word unpacks; each row is loaded/stored as one 32-bit movd. */
645 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
646 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
647 "movd %4, %%mm0 \n\t"
648 "movd %5, %%mm1 \n\t"
649 "movd %6, %%mm2 \n\t"
650 "movd %7, %%mm3 \n\t"
651 "punpcklbw %%mm1, %%mm0 \n\t"
652 "punpcklbw %%mm3, %%mm2 \n\t"
653 "movq %%mm0, %%mm1 \n\t"
654 "punpcklwd %%mm2, %%mm0 \n\t"
655 "punpckhwd %%mm2, %%mm1 \n\t"
656 "movd %%mm0, %0 \n\t"
657 "punpckhdq %%mm0, %%mm0 \n\t"
658 "movd %%mm0, %1 \n\t"
659 "movd %%mm1, %2 \n\t"
660 "punpckhdq %%mm1, %%mm1 \n\t"
661 "movd %%mm1, %3 \n\t"
663 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
664 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
665 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
666 "=m" (*(uint32_t*)(dst + 3*dst_stride))
667 : "m" (*(uint32_t*)(src + 0*src_stride)),
668 "m" (*(uint32_t*)(src + 1*src_stride)),
669 "m" (*(uint32_t*)(src + 2*src_stride)),
670 "m" (*(uint32_t*)(src + 3*src_stride))
/* Horizontal H.263 deblocking: transpose the 4 columns around the vertical
 * edge into 'temp', run H263_LOOP_FILTER on the rows, then transpose the
 * filtered registers (mm3..mm6) back into the image via movd stores. */
674 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
675 if(ENABLE_ANY_H263) {
676 const int strength= ff_h263_loop_filter_strength[qscale];
677 uint64_t temp[4] __attribute__ ((aligned(8)));
678 uint8_t *btemp= (uint8_t*)temp;
682 transpose4x4(btemp , src , 8, stride);
683 transpose4x4(btemp+4, src + 4*stride, 8, stride);
685 H263_LOOP_FILTER // 5 3 4 6
691 : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose filtered rows (mm5,mm3,mm4,mm6) back to columns. */
695 "movq %%mm5, %%mm1 \n\t"
696 "movq %%mm4, %%mm0 \n\t"
697 "punpcklbw %%mm3, %%mm5 \n\t"
698 "punpcklbw %%mm6, %%mm4 \n\t"
699 "punpckhbw %%mm3, %%mm1 \n\t"
700 "punpckhbw %%mm6, %%mm0 \n\t"
701 "movq %%mm5, %%mm3 \n\t"
702 "movq %%mm1, %%mm6 \n\t"
703 "punpcklwd %%mm4, %%mm5 \n\t"
704 "punpcklwd %%mm0, %%mm1 \n\t"
705 "punpckhwd %%mm4, %%mm3 \n\t"
706 "punpckhwd %%mm0, %%mm6 \n\t"
707 "movd %%mm5, (%0) \n\t"
708 "punpckhdq %%mm5, %%mm5 \n\t"
709 "movd %%mm5, (%0,%2) \n\t"
710 "movd %%mm3, (%0,%2,2) \n\t"
711 "punpckhdq %%mm3, %%mm3 \n\t"
712 "movd %%mm3, (%0,%3) \n\t"
713 "movd %%mm1, (%1) \n\t"
714 "punpckhdq %%mm1, %%mm1 \n\t"
715 "movd %%mm1, (%1,%2) \n\t"
716 "movd %%mm6, (%1,%2,2) \n\t"
717 "punpckhdq %%mm6, %%mm6 \n\t"
718 "movd %%mm6, (%1,%3) \n\t"
720 "r" (src + 4*stride),
721 "r" ((long) stride ),
722 "r" ((long)(3*stride))
727 #ifdef CONFIG_ENCODERS
/* Sum of squared pixel values for a 16-wide block: widen to words,
 * pmaddwd to squared-dword pairs, accumulate in mm7, fold at the end. */
728 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
735 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
736 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
738 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
740 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
741 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
743 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
744 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
745 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
747 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
748 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
750 "pmaddwd %%mm3,%%mm3\n"
751 "pmaddwd %%mm4,%%mm4\n"
753 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
754 pix2^2+pix3^2+pix6^2+pix7^2) */
755 "paddd %%mm3,%%mm4\n"
756 "paddd %%mm2,%%mm7\n"
759 "paddd %%mm4,%%mm7\n"
/* Fold the two dword lanes of the accumulator into one. */
764 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
765 "paddd %%mm7,%%mm1\n"
767 : "+r" (pix), "+r"(tmp) : "r" ((long)line_size) : "%ecx" );
/* Sum of squared errors between two 8-wide blocks, two rows per iteration:
 * absolute byte differences via dual psubusb+por, then square with
 * pmaddwd and accumulate dwords in mm7. */
771 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
776 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
777 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
779 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
780 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
781 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
782 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
784 /* todo: mm1-mm2, mm3-mm4 */
785 /* algo: subtract mm1 from mm2 with saturation and vice versa */
786 /* OR the results to get absolute difference */
789 "psubusb %%mm2,%%mm1\n"
790 "psubusb %%mm4,%%mm3\n"
791 "psubusb %%mm5,%%mm2\n"
792 "psubusb %%mm6,%%mm4\n"
797 /* now convert to 16-bit vectors so we can square them */
801 "punpckhbw %%mm0,%%mm2\n"
802 "punpckhbw %%mm0,%%mm4\n"
803 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
804 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
806 "pmaddwd %%mm2,%%mm2\n"
807 "pmaddwd %%mm4,%%mm4\n"
808 "pmaddwd %%mm1,%%mm1\n"
809 "pmaddwd %%mm3,%%mm3\n"
811 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
812 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
814 "paddd %%mm2,%%mm1\n"
815 "paddd %%mm4,%%mm3\n"
816 "paddd %%mm1,%%mm7\n"
817 "paddd %%mm3,%%mm7\n"
823 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
824 "paddd %%mm7,%%mm1\n"
826 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
827 : "r" ((long)line_size) , "m" (h)
/* Sum of squared errors between two 16-wide blocks, one row per iteration;
 * same abs-diff/pmaddwd scheme as sse8_mmx but over 16 bytes per row. */
832 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
836 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
837 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
839 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
840 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
841 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
842 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
844 /* todo: mm1-mm2, mm3-mm4 */
845 /* algo: subtract mm1 from mm2 with saturation and vice versa */
846 /* OR the results to get absolute difference */
849 "psubusb %%mm2,%%mm1\n"
850 "psubusb %%mm4,%%mm3\n"
851 "psubusb %%mm5,%%mm2\n"
852 "psubusb %%mm6,%%mm4\n"
857 /* now convert to 16-bit vectors so we can square them */
861 "punpckhbw %%mm0,%%mm2\n"
862 "punpckhbw %%mm0,%%mm4\n"
863 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
864 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
866 "pmaddwd %%mm2,%%mm2\n"
867 "pmaddwd %%mm4,%%mm4\n"
868 "pmaddwd %%mm1,%%mm1\n"
869 "pmaddwd %%mm3,%%mm3\n"
874 "paddd %%mm2,%%mm1\n"
875 "paddd %%mm4,%%mm3\n"
876 "paddd %%mm1,%%mm7\n"
877 "paddd %%mm3,%%mm7\n"
883 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
884 "paddd %%mm7,%%mm1\n"
886 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
887 : "r" ((long)line_size) , "m" (h)
/* SSE2 version of sse16: two 16-byte rows per iteration using unaligned
 * XMM loads; final reduction folds qword then dword lanes with psrldq.
 * (Inline comments say "mm" but refer to the corresponding xmm regs.) */
892 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
896 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
897 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
899 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
900 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
901 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
902 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
904 /* todo: mm1-mm2, mm3-mm4 */
905 /* algo: subtract mm1 from mm2 with saturation and vice versa */
906 /* OR the results to get absolute difference */
907 "movdqa %%xmm1,%%xmm5\n"
908 "movdqa %%xmm3,%%xmm6\n"
909 "psubusb %%xmm2,%%xmm1\n"
910 "psubusb %%xmm4,%%xmm3\n"
911 "psubusb %%xmm5,%%xmm2\n"
912 "psubusb %%xmm6,%%xmm4\n"
914 "por %%xmm1,%%xmm2\n"
915 "por %%xmm3,%%xmm4\n"
917 /* now convert to 16-bit vectors so we can square them */
918 "movdqa %%xmm2,%%xmm1\n"
919 "movdqa %%xmm4,%%xmm3\n"
921 "punpckhbw %%xmm0,%%xmm2\n"
922 "punpckhbw %%xmm0,%%xmm4\n"
923 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
924 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
926 "pmaddwd %%xmm2,%%xmm2\n"
927 "pmaddwd %%xmm4,%%xmm4\n"
928 "pmaddwd %%xmm1,%%xmm1\n"
929 "pmaddwd %%xmm3,%%xmm3\n"
931 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
932 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
934 "paddd %%xmm2,%%xmm1\n"
935 "paddd %%xmm4,%%xmm3\n"
936 "paddd %%xmm1,%%xmm7\n"
937 "paddd %%xmm3,%%xmm7\n"
942 "movdqa %%xmm7,%%xmm1\n"
943 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
944 "paddd %%xmm1,%%xmm7\n"
945 "movdqa %%xmm7,%%xmm1\n"
946 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
947 "paddd %%xmm1,%%xmm7\n"
949 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
950 : "r" ((long)line_size));
/* High-frequency "noise" metric for an 8-wide block: sums absolute values
 * of second-order differences (horizontal neighbour diff, then vertical
 * diff of those), accumulating 16-bit sums in mm6 and widening to dwords
 * at the end.  Used by the nsse* comparison functions. */
954 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
962 "movq %%mm0, %%mm1\n"
966 "movq %%mm0, %%mm2\n"
967 "movq %%mm1, %%mm3\n"
968 "punpcklbw %%mm7,%%mm0\n"
969 "punpcklbw %%mm7,%%mm1\n"
970 "punpckhbw %%mm7,%%mm2\n"
971 "punpckhbw %%mm7,%%mm3\n"
972 "psubw %%mm1, %%mm0\n"
973 "psubw %%mm3, %%mm2\n"
/* Second row: same horizontal diff, then vertical diff and abs-sum. */
978 "movq %%mm4, %%mm1\n"
982 "movq %%mm4, %%mm5\n"
983 "movq %%mm1, %%mm3\n"
984 "punpcklbw %%mm7,%%mm4\n"
985 "punpcklbw %%mm7,%%mm1\n"
986 "punpckhbw %%mm7,%%mm5\n"
987 "punpckhbw %%mm7,%%mm3\n"
988 "psubw %%mm1, %%mm4\n"
989 "psubw %%mm3, %%mm5\n"
990 "psubw %%mm4, %%mm0\n"
991 "psubw %%mm5, %%mm2\n"
992 "pxor %%mm3, %%mm3\n"
993 "pxor %%mm1, %%mm1\n"
/* Absolute value: sign mask via pcmpgtw, then xor+sub. */
994 "pcmpgtw %%mm0, %%mm3\n\t"
995 "pcmpgtw %%mm2, %%mm1\n\t"
996 "pxor %%mm3, %%mm0\n"
997 "pxor %%mm1, %%mm2\n"
998 "psubw %%mm3, %%mm0\n"
999 "psubw %%mm1, %%mm2\n"
1000 "paddw %%mm0, %%mm2\n"
1001 "paddw %%mm2, %%mm6\n"
/* Main loop body, alternating which register pair holds the previous row. */
1007 "movq %%mm0, %%mm1\n"
1011 "movq %%mm0, %%mm2\n"
1012 "movq %%mm1, %%mm3\n"
1013 "punpcklbw %%mm7,%%mm0\n"
1014 "punpcklbw %%mm7,%%mm1\n"
1015 "punpckhbw %%mm7,%%mm2\n"
1016 "punpckhbw %%mm7,%%mm3\n"
1017 "psubw %%mm1, %%mm0\n"
1018 "psubw %%mm3, %%mm2\n"
1019 "psubw %%mm0, %%mm4\n"
1020 "psubw %%mm2, %%mm5\n"
1021 "pxor %%mm3, %%mm3\n"
1022 "pxor %%mm1, %%mm1\n"
1023 "pcmpgtw %%mm4, %%mm3\n\t"
1024 "pcmpgtw %%mm5, %%mm1\n\t"
1025 "pxor %%mm3, %%mm4\n"
1026 "pxor %%mm1, %%mm5\n"
1027 "psubw %%mm3, %%mm4\n"
1028 "psubw %%mm1, %%mm5\n"
1029 "paddw %%mm4, %%mm5\n"
1030 "paddw %%mm5, %%mm6\n"
1035 "movq %%mm4, %%mm1\n"
1039 "movq %%mm4, %%mm5\n"
1040 "movq %%mm1, %%mm3\n"
1041 "punpcklbw %%mm7,%%mm4\n"
1042 "punpcklbw %%mm7,%%mm1\n"
1043 "punpckhbw %%mm7,%%mm5\n"
1044 "punpckhbw %%mm7,%%mm3\n"
1045 "psubw %%mm1, %%mm4\n"
1046 "psubw %%mm3, %%mm5\n"
1047 "psubw %%mm4, %%mm0\n"
1048 "psubw %%mm5, %%mm2\n"
1049 "pxor %%mm3, %%mm3\n"
1050 "pxor %%mm1, %%mm1\n"
1051 "pcmpgtw %%mm0, %%mm3\n\t"
1052 "pcmpgtw %%mm2, %%mm1\n\t"
1053 "pxor %%mm3, %%mm0\n"
1054 "pxor %%mm1, %%mm2\n"
1055 "psubw %%mm3, %%mm0\n"
1056 "psubw %%mm1, %%mm2\n"
1057 "paddw %%mm0, %%mm2\n"
1058 "paddw %%mm2, %%mm6\n"
/* Widen the word accumulator to dwords and fold to a scalar. */
1064 "movq %%mm6, %%mm0\n"
1065 "punpcklwd %%mm7,%%mm0\n"
1066 "punpckhwd %%mm7,%%mm6\n"
1067 "paddd %%mm0, %%mm6\n"
1069 "movq %%mm6,%%mm0\n"
1070 "psrlq $32, %%mm6\n"
1071 "paddd %%mm6,%%mm0\n"
1073 : "+r" (pix1), "=r"(tmp)
1074 : "r" ((long)line_size) , "g" (h-2)
/* 16-wide high-frequency noise metric: handles the left 8 columns here
 * (horizontal diff taken against the pixel at offset +1) and delegates
 * the right half to hf_noise8_mmx(pix+8, ...) at the end. */
1079 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1081 uint8_t * pix= pix1;
1084 "pxor %%mm7,%%mm7\n"
1085 "pxor %%mm6,%%mm6\n"
1088 "movq 1(%0),%%mm1\n"
1089 "movq %%mm0, %%mm2\n"
1090 "movq %%mm1, %%mm3\n"
1091 "punpcklbw %%mm7,%%mm0\n"
1092 "punpcklbw %%mm7,%%mm1\n"
1093 "punpckhbw %%mm7,%%mm2\n"
1094 "punpckhbw %%mm7,%%mm3\n"
1095 "psubw %%mm1, %%mm0\n"
1096 "psubw %%mm3, %%mm2\n"
/* Second row: horizontal diff, then vertical diff and abs-sum into mm6. */
1101 "movq 1(%0),%%mm1\n"
1102 "movq %%mm4, %%mm5\n"
1103 "movq %%mm1, %%mm3\n"
1104 "punpcklbw %%mm7,%%mm4\n"
1105 "punpcklbw %%mm7,%%mm1\n"
1106 "punpckhbw %%mm7,%%mm5\n"
1107 "punpckhbw %%mm7,%%mm3\n"
1108 "psubw %%mm1, %%mm4\n"
1109 "psubw %%mm3, %%mm5\n"
1110 "psubw %%mm4, %%mm0\n"
1111 "psubw %%mm5, %%mm2\n"
1112 "pxor %%mm3, %%mm3\n"
1113 "pxor %%mm1, %%mm1\n"
1114 "pcmpgtw %%mm0, %%mm3\n\t"
1115 "pcmpgtw %%mm2, %%mm1\n\t"
1116 "pxor %%mm3, %%mm0\n"
1117 "pxor %%mm1, %%mm2\n"
1118 "psubw %%mm3, %%mm0\n"
1119 "psubw %%mm1, %%mm2\n"
1120 "paddw %%mm0, %%mm2\n"
1121 "paddw %%mm2, %%mm6\n"
/* Main loop, alternating register pairs between consecutive rows. */
1127 "movq 1(%0),%%mm1\n"
1128 "movq %%mm0, %%mm2\n"
1129 "movq %%mm1, %%mm3\n"
1130 "punpcklbw %%mm7,%%mm0\n"
1131 "punpcklbw %%mm7,%%mm1\n"
1132 "punpckhbw %%mm7,%%mm2\n"
1133 "punpckhbw %%mm7,%%mm3\n"
1134 "psubw %%mm1, %%mm0\n"
1135 "psubw %%mm3, %%mm2\n"
1136 "psubw %%mm0, %%mm4\n"
1137 "psubw %%mm2, %%mm5\n"
1138 "pxor %%mm3, %%mm3\n"
1139 "pxor %%mm1, %%mm1\n"
1140 "pcmpgtw %%mm4, %%mm3\n\t"
1141 "pcmpgtw %%mm5, %%mm1\n\t"
1142 "pxor %%mm3, %%mm4\n"
1143 "pxor %%mm1, %%mm5\n"
1144 "psubw %%mm3, %%mm4\n"
1145 "psubw %%mm1, %%mm5\n"
1146 "paddw %%mm4, %%mm5\n"
1147 "paddw %%mm5, %%mm6\n"
1152 "movq 1(%0),%%mm1\n"
1153 "movq %%mm4, %%mm5\n"
1154 "movq %%mm1, %%mm3\n"
1155 "punpcklbw %%mm7,%%mm4\n"
1156 "punpcklbw %%mm7,%%mm1\n"
1157 "punpckhbw %%mm7,%%mm5\n"
1158 "punpckhbw %%mm7,%%mm3\n"
1159 "psubw %%mm1, %%mm4\n"
1160 "psubw %%mm3, %%mm5\n"
1161 "psubw %%mm4, %%mm0\n"
1162 "psubw %%mm5, %%mm2\n"
1163 "pxor %%mm3, %%mm3\n"
1164 "pxor %%mm1, %%mm1\n"
1165 "pcmpgtw %%mm0, %%mm3\n\t"
1166 "pcmpgtw %%mm2, %%mm1\n\t"
1167 "pxor %%mm3, %%mm0\n"
1168 "pxor %%mm1, %%mm2\n"
1169 "psubw %%mm3, %%mm0\n"
1170 "psubw %%mm1, %%mm2\n"
1171 "paddw %%mm0, %%mm2\n"
1172 "paddw %%mm2, %%mm6\n"
/* Widen word sums to dwords and fold the accumulator to a scalar. */
1178 "movq %%mm6, %%mm0\n"
1179 "punpcklwd %%mm7,%%mm0\n"
1180 "punpckhwd %%mm7,%%mm6\n"
1181 "paddd %%mm0, %%mm6\n"
1183 "movq %%mm6,%%mm0\n"
1184 "psrlq $32, %%mm6\n"
1185 "paddd %%mm6,%%mm0\n"
1187 : "+r" (pix1), "=r"(tmp)
1188 : "r" ((long)line_size) , "g" (h-2)
/* Right half of the 16-wide block. */
1190 return tmp + hf_noise8_mmx(pix+8, line_size, h);
/* Noise-preserving SSE (16-wide): plain SSE plus a weighted penalty for
 * the difference in high-frequency noise between the two blocks.
 * Weight comes from avctx->nsse_weight when a context is available. */
1193 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1194 MpegEncContext *c = p;
1197 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1198 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1199 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1201 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1202 else return score1 + FFABS(score2)*8;
/* Noise-preserving SSE, 8-wide variant of nsse16_mmx. */
1205 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1206 MpegEncContext *c = p;
1207 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1208 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1210 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1211 else return score1 + FFABS(score2)*8;
/* Vertical SAD of a 16-wide block against itself (intra): sums absolute
 * differences between vertically adjacent rows.  SUM() computes the
 * per-row abs-diff (dual psubusb + por) and accumulates words in mm6.
 * Requires 8-byte alignment of pix and line_size (asserted). */
1214 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1217 assert( (((int)pix) & 7) == 0);
1218 assert((line_size &7) ==0);
1220 #define SUM(in0, in1, out0, out1) \
1221 "movq (%0), %%mm2\n"\
1222 "movq 8(%0), %%mm3\n"\
1224 "movq %%mm2, " #out0 "\n"\
1225 "movq %%mm3, " #out1 "\n"\
1226 "psubusb " #in0 ", %%mm2\n"\
1227 "psubusb " #in1 ", %%mm3\n"\
1228 "psubusb " #out0 ", " #in0 "\n"\
1229 "psubusb " #out1 ", " #in1 "\n"\
1230 "por %%mm2, " #in0 "\n"\
1231 "por %%mm3, " #in1 "\n"\
1232 "movq " #in0 ", %%mm2\n"\
1233 "movq " #in1 ", %%mm3\n"\
1234 "punpcklbw %%mm7, " #in0 "\n"\
1235 "punpcklbw %%mm7, " #in1 "\n"\
1236 "punpckhbw %%mm7, %%mm2\n"\
1237 "punpckhbw %%mm7, %%mm3\n"\
1238 "paddw " #in1 ", " #in0 "\n"\
1239 "paddw %%mm3, %%mm2\n"\
1240 "paddw %%mm2, " #in0 "\n"\
1241 "paddw " #in0 ", %%mm6\n"
/* Loop body alternates which register pair holds the previous row. */
1246 "pxor %%mm6,%%mm6\n"
1247 "pxor %%mm7,%%mm7\n"
1249 "movq 8(%0),%%mm1\n"
1252 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1255 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1257 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Fold the word accumulator; only the low 16 bits are the result. */
1262 "movq %%mm6,%%mm0\n"
1263 "psrlq $32, %%mm6\n"
1264 "paddw %%mm6,%%mm0\n"
1265 "movq %%mm0,%%mm6\n"
1266 "psrlq $16, %%mm0\n"
1267 "paddw %%mm6,%%mm0\n"
1269 : "+r" (pix), "=r"(tmp)
1270 : "r" ((long)line_size) , "m" (h)
1272 return tmp & 0xFFFF;
/* MMX2 version of vsad_intra16: SUM() uses the psadbw instruction to get
 * the row abs-diff sum directly, accumulating in mm6.
 * NOTE(review): the #undef SUM between this and the previous definition
 * is missing from this excerpt. */
1276 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1279 assert( (((int)pix) & 7) == 0);
1280 assert((line_size &7) ==0);
1282 #define SUM(in0, in1, out0, out1) \
1283 "movq (%0), " #out0 "\n"\
1284 "movq 8(%0), " #out1 "\n"\
1286 "psadbw " #out0 ", " #in0 "\n"\
1287 "psadbw " #out1 ", " #in1 "\n"\
1288 "paddw " #in1 ", " #in0 "\n"\
1289 "paddw " #in0 ", %%mm6\n"
1293 "pxor %%mm6,%%mm6\n"
1294 "pxor %%mm7,%%mm7\n"
1296 "movq 8(%0),%%mm1\n"
1299 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1302 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1304 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1310 : "+r" (pix), "=r"(tmp)
1311 : "r" ((long)line_size) , "m" (h)
/* Inter vertical SAD: same vertical-gradient metric as vsad_intra16, but on
 * the difference signal pix1 - pix2. Byte differences are signed, so each
 * one is biased by 0x80 (XOR with mm7) into unsigned range before the
 * unsigned-saturating abs-diff trick. Plain-MMX version. */
1317 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1320 assert( (((int)pix1) & 7) == 0);
1321 assert( (((int)pix2) & 7) == 0);
1322 assert((line_size &7) ==0);
/* SUM: in0/in1 = previous (biased) difference row. Compute the current
 * biased difference row in mm2/mm3, take the byte-wise absolute difference
 * against the previous row, widen to words and accumulate into mm6;
 * out0/out1 keep the current row for the next iteration. */
1324 #define SUM(in0, in1, out0, out1) \
1325 "movq (%0),%%mm2\n"\
1326 "movq (%1)," #out0 "\n"\
1327 "movq 8(%0),%%mm3\n"\
1328 "movq 8(%1)," #out1 "\n"\
1331 "psubb " #out0 ", %%mm2\n"\
1332 "psubb " #out1 ", %%mm3\n"\
1333 "pxor %%mm7, %%mm2\n"\
1334 "pxor %%mm7, %%mm3\n"\
1335 "movq %%mm2, " #out0 "\n"\
1336 "movq %%mm3, " #out1 "\n"\
1337 "psubusb " #in0 ", %%mm2\n"\
1338 "psubusb " #in1 ", %%mm3\n"\
1339 "psubusb " #out0 ", " #in0 "\n"\
1340 "psubusb " #out1 ", " #in1 "\n"\
1341 "por %%mm2, " #in0 "\n"\
1342 "por %%mm3, " #in1 "\n"\
1343 "movq " #in0 ", %%mm2\n"\
1344 "movq " #in1 ", %%mm3\n"\
1345 "punpcklbw %%mm7, " #in0 "\n"\
1346 "punpcklbw %%mm7, " #in1 "\n"\
1347 "punpckhbw %%mm7, %%mm2\n"\
1348 "punpckhbw %%mm7, %%mm3\n"\
1349 "paddw " #in1 ", " #in0 "\n"\
1350 "paddw %%mm3, %%mm2\n"\
1351 "paddw %%mm2, " #in0 "\n"\
1352 "paddw " #in0 ", %%mm6\n"
/* pcmpeqw/psllw 15/packsswb builds 0x80 in every byte of mm7 — the
 * signed->unsigned bias. NOTE: mm7 is NOT zero here, unlike the intra
 * version, so the punpck*bw uses above interleave with the bias bytes;
 * the final "& 0x7FFF" mask compensates. */
1357 "pxor %%mm6,%%mm6\n"
1358 "pcmpeqw %%mm7,%%mm7\n"
1359 "psllw $15, %%mm7\n"
1360 "packsswb %%mm7, %%mm7\n"
1363 "movq 8(%0),%%mm1\n"
1364 "movq 8(%1),%%mm3\n"
/* prologue: first biased difference row (the matching low-quadword loads
 * are elided in this listing) */
1368 "psubb %%mm2, %%mm0\n"
1369 "psubb %%mm3, %%mm1\n"
1370 "pxor %%mm7, %%mm0\n"
1371 "pxor %%mm7, %%mm1\n"
1372 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1375 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1377 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* horizontal reduction of the word partial sums */
1382 "movq %%mm6,%%mm0\n"
1383 "psrlq $32, %%mm6\n"
1384 "paddw %%mm6,%%mm0\n"
1385 "movq %%mm0,%%mm6\n"
1386 "psrlq $16, %%mm0\n"
1387 "paddw %%mm6,%%mm0\n"
1389 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1390 : "r" ((long)line_size) , "m" (h)
1392 return tmp & 0x7FFF;
/* Inter vertical SAD, MMX2 version: same 0x80-bias trick as vsad16_mmx but
 * with psadbw doing the absolute-difference-and-sum. Loop/return lines are
 * elided in this listing. */
1396 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1399 assert( (((int)pix1) & 7) == 0);
1400 assert( (((int)pix2) & 7) == 0);
1401 assert((line_size &7) ==0);
/* SUM: in0/in1 = previous biased difference row; compute the current row in
 * out0/out1 and accumulate psadbw(prev, cur) into mm6. */
1403 #define SUM(in0, in1, out0, out1) \
1404 "movq (%0)," #out0 "\n"\
1405 "movq (%1),%%mm2\n"\
1406 "movq 8(%0)," #out1 "\n"\
1407 "movq 8(%1),%%mm3\n"\
1410 "psubb %%mm2, " #out0 "\n"\
1411 "psubb %%mm3, " #out1 "\n"\
1412 "pxor %%mm7, " #out0 "\n"\
1413 "pxor %%mm7, " #out1 "\n"\
1414 "psadbw " #out0 ", " #in0 "\n"\
1415 "psadbw " #out1 ", " #in1 "\n"\
1416 "paddw " #in1 ", " #in0 "\n"\
1417 "paddw " #in0 ", %%mm6\n"
/* mm7 = 0x80 repeated in every byte (signed->unsigned bias) */
1421 "pxor %%mm6,%%mm6\n"
1422 "pcmpeqw %%mm7,%%mm7\n"
1423 "psllw $15, %%mm7\n"
1424 "packsswb %%mm7, %%mm7\n"
1427 "movq 8(%0),%%mm1\n"
1428 "movq 8(%1),%%mm3\n"
1432 "psubb %%mm2, %%mm0\n"
1433 "psubb %%mm3, %%mm1\n"
1434 "pxor %%mm7, %%mm0\n"
1435 "pxor %%mm7, %%mm1\n"
1436 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1439 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1441 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1447 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1448 : "r" ((long)line_size) , "m" (h)
/* dst[i] = src1[i] - src2[i], byte-wise: 16 bytes per MMX iteration, with a
 * scalar tail loop for the remaining (w % 16) bytes. The asm loop header
 * and tail-loop control lines are elided in this listing. */
1454 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1458 "movq (%2, %0), %%mm0 \n\t"
1459 "movq (%1, %0), %%mm1 \n\t"
1460 "psubb %%mm0, %%mm1 \n\t"
1461 "movq %%mm1, (%3, %0) \n\t"
1462 "movq 8(%2, %0), %%mm0 \n\t"
1463 "movq 8(%1, %0), %%mm1 \n\t"
1464 "psubb %%mm0, %%mm1 \n\t"
1465 "movq %%mm1, 8(%3, %0) \n\t"
/* w-15 bound: stop the 16-byte loop before running off the end */
1470 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1473 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual: dst = src2 - median(L, T, L+T-LT),
 * where T and LT come from the previous row (src1) and L is the pixel to
 * the left in the current row (src2). pmaxub/pminub clamp L+T-LT into
 * [min(L,T), max(L,T)], which equals the median of the three values. */
1476 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1482 "movq -1(%1, %0), %%mm0 \n\t" // LT
1483 "movq (%1, %0), %%mm1 \n\t" // T
1484 "movq -1(%2, %0), %%mm2 \n\t" // L
1485 "movq (%2, %0), %%mm3 \n\t" // X
1486 "movq %%mm2, %%mm4 \n\t" // L
1487 "psubb %%mm0, %%mm2 \n\t"
1488 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1489 "movq %%mm4, %%mm5 \n\t" // L
1490 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1491 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1492 "pminub %%mm2, %%mm4 \n\t"
1493 "pmaxub %%mm1, %%mm4 \n\t"
1494 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1495 "movq %%mm3, (%3, %0) \n\t"
1500 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
/* First pixel handled in scalar code using the caller-carried left/left_top
 * state (l/lt declarations elided in this listing); left_top is updated
 * for the next row. */
1506 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1508 *left_top= src1[w-1];
/* DIFF_PIXELS_1: a = word-wise (p1 - p2) for one row (t is scratch).
 * Both unpacks put a p1 byte in the high half of each word, so the shared
 * high bytes cancel in the psubw and the result is the exact word
 * difference p1[i] - p2[i]. m selects movd (4 px) or movq/movdqa (8 px). */
1512 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1513 "mov"#m" "#p1", "#a" \n\t"\
1514 "mov"#m" "#p2", "#t" \n\t"\
1515 "punpcklbw "#a", "#t" \n\t"\
1516 "punpcklbw "#a", "#a" \n\t"\
1517 "psubw "#t", "#a" \n\t"\
/* DIFF_PIXELS_8: word differences of 8 rows of p1/p2 into registers
 * mm##0..mm##7. With only 8 vector registers available, row 0 is spilled
 * to the memory slot %0 while row 7 is computed, then reloaded.
 * %3 = stride, %4 = 3*stride; the pointer-advance lines between row groups
 * are elided in this listing. */
1519 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1520 uint8_t *p1b=p1, *p2b=p2;\
1522 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1523 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1524 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1527 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1528 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1529 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1530 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1531 "mov"#m1" "#mm"0, %0 \n\t"\
1532 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1533 "mov"#m1" %0, "#mm"0 \n\t"\
1534 : "=m"(temp), "+r"(p1b), "+r"(p2b)\
1535 : "r"((long)stride), "r"((long)stride*3)\
1539 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1540 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
/* In-register transpose of an 8x8 word matrix held in xmm registers, built
 * from interleave (SBUTTERFLY) stages. This variant uses %%xmm8 as the
 * scratch register — presumably guarded by an x86_64 #ifdef that is elided
 * in this listing (xmm8 only exists in 64-bit mode); the memory-spilling
 * alternative follows below. The t argument is unused here. */
1543 // permutes 01234567 -> 05736421
1544 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1545 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1546 SBUTTERFLY(c,d,b,wd,dqa)\
1547 SBUTTERFLY(e,f,d,wd,dqa)\
1548 SBUTTERFLY(g,h,f,wd,dqa)\
1549 SBUTTERFLY(a,c,h,dq,dqa)\
1550 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1551 SBUTTERFLY(e,g,b,dq,dqa)\
1552 SBUTTERFLY(d,f,g,dq,dqa)\
1553 SBUTTERFLY(a,e,f,qdq,dqa)\
1554 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1555 SBUTTERFLY(h,b,d,qdq,dqa)\
1556 SBUTTERFLY(c,g,b,qdq,dqa)\
1557 "movdqa %%xmm8, "#g" \n\t"
/* TRANSPOSE8, spill variant (no ninth register available): identical
 * transpose network, but uses the two 16-byte memory slots at "t" and
 * 16+"t" as scratch instead of %%xmm8. Same 05736421 output permutation
 * as the register-only variant above. */
1559 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1560 "movdqa "#h", "#t" \n\t"\
1561 SBUTTERFLY(a,b,h,wd,dqa)\
1562 "movdqa "#h", 16"#t" \n\t"\
1563 "movdqa "#t", "#h" \n\t"\
1564 SBUTTERFLY(c,d,b,wd,dqa)\
1565 SBUTTERFLY(e,f,d,wd,dqa)\
1566 SBUTTERFLY(g,h,f,wd,dqa)\
1567 SBUTTERFLY(a,c,h,dq,dqa)\
1568 "movdqa "#h", "#t" \n\t"\
1569 "movdqa 16"#t", "#h" \n\t"\
1570 SBUTTERFLY(h,b,c,dq,dqa)\
1571 SBUTTERFLY(e,g,b,dq,dqa)\
1572 SBUTTERFLY(d,f,g,dq,dqa)\
1573 SBUTTERFLY(a,e,f,qdq,dqa)\
1574 SBUTTERFLY(h,d,e,qdq,dqa)\
1575 "movdqa "#h", 16"#t" \n\t"\
1576 "movdqa "#t", "#h" \n\t"\
1577 SBUTTERFLY(h,b,d,qdq,dqa)\
1578 SBUTTERFLY(c,g,b,qdq,dqa)\
1579 "movdqa 16"#t", "#g" \n\t"
/* LBUTTERFLY2: two parallel butterflies, (a,b) -> (a+b, b-a).
 * The b result is formed as 2b - (a+b) to keep the data dependencies short. */
1582 #define LBUTTERFLY2(a1,b1,a2,b2)\
1583 "paddw " #b1 ", " #a1 " \n\t"\
1584 "paddw " #b2 ", " #a2 " \n\t"\
1585 "paddw " #b1 ", " #b1 " \n\t"\
1586 "paddw " #b2 ", " #b2 " \n\t"\
1587 "psubw " #a1 ", " #b1 " \n\t"\
1588 "psubw " #a2 ", " #b2 " \n\t"
/* HADAMARD8: three butterfly stages over 8 word vectors — an 8-point
 * Hadamard transform applied to each column (outputs land in a permuted
 * sign/order, which the callers account for). */
1590 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1591 LBUTTERFLY2(m0, m1, m2, m3)\
1592 LBUTTERFLY2(m4, m5, m6, m7)\
1593 LBUTTERFLY2(m0, m2, m1, m3)\
1594 LBUTTERFLY2(m4, m6, m5, m7)\
1595 LBUTTERFLY2(m0, m4, m1, m5)\
1596 LBUTTERFLY2(m2, m6, m3, m7)\
1598 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
/* Word-wise absolute value, one flavour per CPU level; z is scratch.
 * MMX: z = sign mask (0 > a), then a = (a ^ z) - z (two's-complement abs).
 * MMX2: a = max(a, -a) via pmaxsw. SSSE3: single pabsw (z unused). */
1600 #define MMABS_MMX(a,z)\
1601 "pxor " #z ", " #z " \n\t"\
1602 "pcmpgtw " #a ", " #z " \n\t"\
1603 "pxor " #z ", " #a " \n\t"\
1604 "psubw " #z ", " #a " \n\t"
1606 #define MMABS_MMX2(a,z)\
1607 "pxor " #z ", " #z " \n\t"\
1608 "psubw " #a ", " #z " \n\t"\
1609 "pmaxsw " #z ", " #a " \n\t"
1611 #define MMABS_SSSE3(a,z)\
1612 "pabsw " #a ", " #a " \n\t"
/* MMABS_SUM: saturating-add |a| into sum. NOTE(review): the MMABS(a,z)
 * invocation line of this macro is elided in this listing; only the final
 * accumulate is visible. */
1614 #define MMABS_SUM(a,z, sum)\
1616 "paddusw " #a ", " #sum " \n\t"
/* Sum of |value| over xmm0..xmm7 into xmm0 (saturating word adds).
 * NOSPILL variant uses xmm8/xmm9 as scratch — presumably x86_64-only
 * (#ifdef elided in this listing). */
1618 #define MMABS_SUM_8x8_NOSPILL\
1619 MMABS(%%xmm0, %%xmm8)\
1620 MMABS(%%xmm1, %%xmm9)\
1621 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1622 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1623 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1624 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1625 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1626 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1627 "paddusw %%xmm1, %%xmm0 \n\t"
1630 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
/* Register-starved (32-bit) definition — hidden #else branch in this
 * listing: spill xmm7 to the temp buffer at (%1), use it as the scratch
 * register, then reload and fold the spilled value. */
1632 #define MMABS_SUM_8x8_SSE2\
1633 "movdqa %%xmm7, (%1) \n\t"\
1634 MMABS(%%xmm0, %%xmm7)\
1635 MMABS(%%xmm1, %%xmm7)\
1636 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1637 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1638 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1639 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1640 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1641 "movdqa (%1), %%xmm2 \n\t"\
1642 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1643 "paddusw %%xmm1, %%xmm0 \n\t"
/* LOAD4/STORE4: move four consecutive quadwords between registers a..d and
 * the temp buffer (%1) starting at byte offset o. */
1646 #define LOAD4(o, a, b, c, d)\
1647 "movq "#o"(%1), "#a" \n\t"\
1648 "movq "#o"+8(%1), "#b" \n\t"\
1649 "movq "#o"+16(%1), "#c" \n\t"\
1650 "movq "#o"+24(%1), "#d" \n\t"\
1652 #define STORE4(o, a, b, c, d)\
1653 "movq "#a", "#o"(%1) \n\t"\
1654 "movq "#b", "#o"+8(%1) \n\t"\
1655 "movq "#c", "#o"+16(%1) \n\t"\
1656 "movq "#d", "#o"+24(%1) \n\t"\
1658 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1659  * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1660  * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* HSUM: horizontal sum of the four (MMX) / eight (SSE2) words in "a" into
 * the 32-bit GPR "dst", using saturating adds (see FIXME above); t is
 * scratch. MMX folds with shifts, MMX2 with pshufw, SSE2 with
 * movhlps + pshuflw. */
1661 #define HSUM_MMX(a, t, dst)\
1662 "movq "#a", "#t" \n\t"\
1663 "psrlq $32, "#a" \n\t"\
1664 "paddusw "#t", "#a" \n\t"\
1665 "movq "#a", "#t" \n\t"\
1666 "psrlq $16, "#a" \n\t"\
1667 "paddusw "#t", "#a" \n\t"\
1668 "movd "#a", "#dst" \n\t"\
1670 #define HSUM_MMX2(a, t, dst)\
1671 "pshufw $0x0E, "#a", "#t" \n\t"\
1672 "paddusw "#t", "#a" \n\t"\
1673 "pshufw $0x01, "#a", "#t" \n\t"\
1674 "paddusw "#t", "#a" \n\t"\
1675 "movd "#a", "#dst" \n\t"\
1677 #define HSUM_SSE2(a, t, dst)\
1678 "movhlps "#a", "#t" \n\t"\
1679 "paddusw "#t", "#a" \n\t"\
1680 "pshuflw $0x0E, "#a", "#t" \n\t"\
1681 "paddusw "#t", "#a" \n\t"\
1682 "pshuflw $0x01, "#a", "#t" \n\t"\
1683 "paddusw "#t", "#a" \n\t"\
1684 "movd "#a", "#dst" \n\t"\
/* 8x8 SATD: sum of absolute values of the 2-D Hadamard transform of
 * src1 - src2, processed as two 4x8 column halves with MMX (8 registers).
 * temp[] byte layout: 0 and 32 hold transposed halves, 64 a partial result,
 * 96 a one-register spill slot. Several HADAMARD48 invocations and the
 * closing constraint/return lines are elided in this listing. */
1686 #define HADAMARD8_DIFF_MMX(cpu) \
1687 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1688 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
/* left 4 columns: diff, transform, transpose in 4x4 quarters */\
1693 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1698 "movq %%mm7, 96(%1) \n\t"\
1700 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1701 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1703 "movq 96(%1), %%mm7 \n\t"\
1704 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1705 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
/* right 4 columns */\
1711 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1716 "movq %%mm7, 96(%1) \n\t"\
1718 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1719 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1721 "movq 96(%1), %%mm7 \n\t"\
1722 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1723 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1724 "movq %%mm6, %%mm7 \n\t"\
1725 "movq %%mm0, %%mm6 \n\t"\
/* second-pass transform + abs-sum of the half spilled at offset 64 */\
1727 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1730 "movq %%mm7, 64(%1) \n\t"\
1731 MMABS(%%mm0, %%mm7)\
1732 MMABS(%%mm1, %%mm7)\
1733 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1734 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1735 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1736 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1737 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1738 "movq 64(%1), %%mm2 \n\t"\
1739 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1740 "paddusw %%mm1, %%mm0 \n\t"\
1741 "movq %%mm0, 64(%1) \n\t"\
/* same for the halves at offsets 0 and 32, then fold both partials */\
1743 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1744 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1747 "movq %%mm7, (%1) \n\t"\
1748 MMABS(%%mm0, %%mm7)\
1749 MMABS(%%mm1, %%mm7)\
1750 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1751 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1752 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1753 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1754 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1755 "movq (%1), %%mm2 \n\t"\
1756 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1757 "paddusw 64(%1), %%mm0 \n\t"\
1758 "paddusw %%mm1, %%mm0 \n\t"\
1760 HSUM(%%mm0, %%mm1, %0)\
1767 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* 8x8 SATD, SSE2: whole block in xmm registers — diff, Hadamard, transpose,
 * Hadamard again, then sum of absolute values. The register order of the
 * second HADAMARD8 call matches TRANSPOSE8's 05736421 output permutation.
 * The MMABS_SUM_8x8 invocation and return lines are elided in this listing. */
1769 #define HADAMARD8_DIFF_SSE2(cpu) \
1770 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1771 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1776 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1779 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1780 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1781 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1783 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1789 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* Instantiate hadamard8_diff for each CPU flavour by rebinding the MMABS /
 * HSUM / MMABS_SUM_8x8 helper macros before each expansion. The matching
 * #undef lines between instantiations are elided in this listing. */
1791 #define MMABS(a,z) MMABS_MMX(a,z)
1792 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1793 HADAMARD8_DIFF_MMX(mmx)
1797 #define MMABS(a,z) MMABS_MMX2(a,z)
1798 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1799 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1800 HADAMARD8_DIFF_MMX(mmx2)
1801 HADAMARD8_DIFF_SSE2(sse2)
1803 #undef MMABS_SUM_8x8
1807 #define MMABS(a,z) MMABS_SSSE3(a,z)
1808 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1809 HADAMARD8_DIFF_SSE2(ssse3)
1811 #undef MMABS_SUM_8x8
/* DCT_SAD4: saturating-add the absolute values of four vectors of DCT
 * coefficients at byte offset o in the block (%1) into the two running
 * accumulators mm##0/mm##1 (two accumulators hide the paddusw latency).
 * DCT_SAD_MMX covers the 64 words as 16 quadwords, DCT_SAD_SSE2 as eight
 * octwords; both finish with a saturating horizontal sum into %0. */
1814 #define DCT_SAD4(m,mm,o)\
1815 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1816 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1817 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1818 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1819 MMABS_SUM(mm##2, mm##6, mm##0)\
1820 MMABS_SUM(mm##3, mm##7, mm##1)\
1821 MMABS_SUM(mm##4, mm##6, mm##0)\
1822 MMABS_SUM(mm##5, mm##7, mm##1)\
1824 #define DCT_SAD_MMX\
1825 "pxor %%mm0, %%mm0 \n\t"\
1826 "pxor %%mm1, %%mm1 \n\t"\
1827 DCT_SAD4(q, %%mm, 0)\
1828 DCT_SAD4(q, %%mm, 8)\
1829 DCT_SAD4(q, %%mm, 64)\
1830 DCT_SAD4(q, %%mm, 72)\
1831 "paddusw %%mm1, %%mm0 \n\t"\
1832 HSUM(%%mm0, %%mm1, %0)
1834 #define DCT_SAD_SSE2\
1835 "pxor %%xmm0, %%xmm0 \n\t"\
1836 "pxor %%xmm1, %%xmm1 \n\t"\
1837 DCT_SAD4(dqa, %%xmm, 0)\
1838 DCT_SAD4(dqa, %%xmm, 64)\
1839 "paddusw %%xmm1, %%xmm0 \n\t"\
1840 HSUM(%%xmm0, %%xmm1, %0)
/* sum_abs_dctelem_<cpu>: sum of |block[i]| over all 64 DCT coefficients.
 * The macro's function body (asm statement wrapping DCT_SAD) and the
 * #undef / DCT_SAD_FUNC(...) instantiation lines between the #define
 * groups below are elided in this listing; each group rebinds DCT_SAD,
 * HSUM and MMABS for one CPU flavour (mmx / mmx2 / sse2 / ssse3). */
1842 #define DCT_SAD_FUNC(cpu) \
1843 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1853 #define DCT_SAD DCT_SAD_MMX
1854 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1855 #define MMABS(a,z) MMABS_MMX(a,z)
1860 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1861 #define MMABS(a,z) MMABS_MMX2(a,z)
1866 #define DCT_SAD DCT_SAD_SSE2
1867 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1872 #define MMABS(a,z) MMABS_SSSE3(a,z)
/* Sum of squared differences between a signed-byte array (pix1) and a
 * signed-word array (pix2), 8 elements per iteration; dword sums accumulate
 * in mm4 via pmaddwd (which squares and pair-sums the word differences).
 * Loop label/counter and the final movd/return lines are elided here. */
1879 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1883 "pxor %%mm4, %%mm4 \n"
1886 "movq (%2,%0), %%mm2 \n"
1887 "movq (%3,%0,2), %%mm0 \n"
1888 "movq 8(%3,%0,2), %%mm1 \n"
/* punpckhbw + psraw $8 sign-extends the high four pix1 bytes into mm3's
 * words (mm3's stale contents land in the low bytes, which the arithmetic
 * shift discards); punpcklbw + psraw $8 does the low four in place. */
1889 "punpckhbw %%mm2, %%mm3 \n"
1890 "punpcklbw %%mm2, %%mm2 \n"
1891 "psraw $8, %%mm3 \n"
1892 "psraw $8, %%mm2 \n"
1893 "psubw %%mm3, %%mm1 \n"
1894 "psubw %%mm2, %%mm0 \n"
1895 "pmaddwd %%mm1, %%mm1 \n"
1896 "pmaddwd %%mm0, %%mm0 \n"
1897 "paddd %%mm1, %%mm4 \n"
1898 "paddd %%mm0, %%mm4 \n"
/* fold the two dword partial sums into the low dword of mm4 */
1900 "movq %%mm4, %%mm3 \n"
1901 "psrlq $32, %%mm3 \n"
1902 "paddd %%mm3, %%mm4 \n"
1905 :"r"(pix1), "r"(pix2)
1910 #endif //CONFIG_ENCODERS
/* Full-pel copies involve no averaging, so the "no rounding" variants are
 * identical to the plain put versions. */
1912 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1913 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
/* One packed output row of the MPEG-4 qpel vertical lowpass filter:
 *   out = OP((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5)
 * where the tap sums x1..x4 are built from the register inputs m3..m6 and
 * the memory rows in0/in1/in2/in7. Clobbers mm4-mm6; mm7 is scratch for
 * OP. The pw_20/pw_3 parameters are unused — the constants are referenced
 * directly via MANGLE. */
1915 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1916 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1917 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1918 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1919 "movq "#in7", " #m3 " \n\t" /* d */\
1920 "movq "#in0", %%mm5 \n\t" /* D */\
1921 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1922 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1923 "movq "#in1", %%mm5 \n\t" /* C */\
1924 "movq "#in2", %%mm6 \n\t" /* B */\
1925 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1926 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1927 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1928 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1929 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1930 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1931 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1932 "psraw $5, %%mm5 \n\t"\
1933 "packuswb %%mm5, %%mm5 \n\t"\
1934 OP(%%mm5, out, %%mm7, d)
/* QPEL_BASE: generates the horizontal MPEG-4 qpel lowpass filters for one
 * rounding mode (put/avg x rnd/no_rnd selected by the OP/ROUNDER args).
 * This first function filters a 16-wide row with the (20,-6,3,-1)/32
 * kernel entirely in registers, in three chunks per row, using pshufw to
 * mirror the left/right edge pixels. The row-loop control and some
 * constraint lines are elided in this listing; %5 is a temp quadword slot,
 * %6 the rounder. */
1936 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1937 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1941 "pxor %%mm7, %%mm7 \n\t"\
/* chunk 1: output pixels 0-3 from input bytes A..H */\
1943 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1944 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1945 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1946 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1947 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1948 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1949 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1950 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1951 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1952 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1953 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1954 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1955 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1956 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1957 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1958 "paddw %%mm3, %%mm5 \n\t" /* b */\
1959 "paddw %%mm2, %%mm6 \n\t" /* c */\
1960 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1961 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1962 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1963 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1964 "paddw %%mm4, %%mm0 \n\t" /* a */\
1965 "paddw %%mm1, %%mm5 \n\t" /* d */\
1966 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1967 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1968 "paddw %6, %%mm6 \n\t"\
1969 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1970 "psraw $5, %%mm0 \n\t"\
1971 "movq %%mm0, %5 \n\t"\
1972 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
/* chunk 2: output pixels 4-7 from input bytes F..M */\
1974 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1975 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1976 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1977 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1978 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1979 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1980 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1981 "paddw %%mm0, %%mm2 \n\t" /* b */\
1982 "paddw %%mm5, %%mm3 \n\t" /* c */\
1983 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1984 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1985 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1986 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1987 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1988 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1989 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1990 "paddw %%mm2, %%mm1 \n\t" /* a */\
1991 "paddw %%mm6, %%mm4 \n\t" /* d */\
1992 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1993 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1994 "paddw %6, %%mm1 \n\t"\
1995 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1996 "psraw $5, %%mm3 \n\t"\
1997 "movq %5, %%mm1 \n\t"\
1998 "packuswb %%mm3, %%mm1 \n\t"\
1999 OP_MMX2(%%mm1, (%1),%%mm4, q)\
2000 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
/* chunk 3: output pixels 8-11 from input bytes J..Q */\
2002 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
2003 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
2004 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
2005 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2006 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2007 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2008 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2009 "paddw %%mm1, %%mm5 \n\t" /* b */\
2010 "paddw %%mm4, %%mm0 \n\t" /* c */\
2011 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2012 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2013 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2014 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2015 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2016 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2017 "paddw %%mm3, %%mm2 \n\t" /* d */\
2018 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2019 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2020 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2021 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2022 "paddw %%mm2, %%mm6 \n\t" /* a */\
2023 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2024 "paddw %6, %%mm0 \n\t"\
2025 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2026 "psraw $5, %%mm0 \n\t"\
2027 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
/* chunk 4: output pixels 12-15; pshufw mirrors the right edge */\
2029 "paddw %%mm5, %%mm3 \n\t" /* a */\
2030 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2031 "paddw %%mm4, %%mm6 \n\t" /* b */\
2032 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2033 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2034 "paddw %%mm1, %%mm4 \n\t" /* c */\
2035 "paddw %%mm2, %%mm5 \n\t" /* d */\
2036 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2037 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2038 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2039 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2040 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2041 "paddw %6, %%mm4 \n\t"\
2042 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2043 "psraw $5, %%mm4 \n\t"\
2044 "packuswb %%mm4, %%mm0 \n\t"\
2045 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2051 : "+a"(src), "+c"(dst), "+m"(h)\
2052 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
/* 3DNow fallback (no pshufw available): evaluate the (20,-6,3,-1) filter
 * taps in scalar C into temp[0..15] — the last three entries mirror the
 * right edge (src[16]) — then round, shift and pack with MMX. Loop control
 * and the temp[] declaration are elided in this listing. */
2057 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2060 /* quick HACK, XXX FIXME MUST be optimized */\
2063 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2064 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2065 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2066 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2067 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2068 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2069 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2070 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2071 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2072 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2073 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2074 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2075 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2076 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2077 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2078 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
/* add rounder (%2), >>5, clamp/pack, emit 16 pixels in two quadwords */\
2080 "movq (%0), %%mm0 \n\t"\
2081 "movq 8(%0), %%mm1 \n\t"\
2082 "paddw %2, %%mm0 \n\t"\
2083 "paddw %2, %%mm1 \n\t"\
2084 "psraw $5, %%mm0 \n\t"\
2085 "psraw $5, %%mm1 \n\t"\
2086 "packuswb %%mm1, %%mm0 \n\t"\
2087 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2088 "movq 16(%0), %%mm0 \n\t"\
2089 "movq 24(%0), %%mm1 \n\t"\
2090 "paddw %2, %%mm0 \n\t"\
2091 "paddw %2, %%mm1 \n\t"\
2092 "psraw $5, %%mm0 \n\t"\
2093 "psraw $5, %%mm1 \n\t"\
2094 "packuswb %%mm1, %%mm0 \n\t"\
2095 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2096 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/* 8-wide version of the horizontal qpel lowpass, MMX2: same (20,-6,3,-1)/32
 * kernel, two 4-pixel chunks per row; pshufw mirrors both edges.
 * Row-loop control lines are elided in this listing; %6 is the rounder. */
2104 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2108 "pxor %%mm7, %%mm7 \n\t"\
/* chunk 1: output pixels 0-3 from input bytes A..H */\
2110 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2111 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2112 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2113 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2114 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2115 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2116 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2117 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2118 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2119 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2120 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2121 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2122 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2123 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2124 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2125 "paddw %%mm3, %%mm5 \n\t" /* b */\
2126 "paddw %%mm2, %%mm6 \n\t" /* c */\
2127 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2128 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2129 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2130 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2131 "paddw %%mm4, %%mm0 \n\t" /* a */\
2132 "paddw %%mm1, %%mm5 \n\t" /* d */\
2133 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2134 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2135 "paddw %6, %%mm6 \n\t"\
2136 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2137 "psraw $5, %%mm0 \n\t"\
2138 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
/* chunk 2: output pixels 4-7; pshufw mirrors the right edge (I) */\
2140 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2141 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2142 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2143 "paddw %%mm5, %%mm1 \n\t" /* a */\
2144 "paddw %%mm6, %%mm2 \n\t" /* b */\
2145 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2146 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2147 "paddw %%mm6, %%mm3 \n\t" /* c */\
2148 "paddw %%mm5, %%mm4 \n\t" /* d */\
2149 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2150 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2151 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2152 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2153 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2154 "paddw %6, %%mm1 \n\t"\
2155 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2156 "psraw $5, %%mm3 \n\t"\
2157 "packuswb %%mm3, %%mm0 \n\t"\
2158 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2164 : "+a"(src), "+c"(dst), "+m"(h)\
2165 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
/* 8-wide 3DNow fallback: scalar C computes the 8 filter taps into temp[]
 * (right-edge taps mirror src[8]), then MMX rounds, shifts and packs.
 * Loop control and the temp[] declaration are elided in this listing. */
2170 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2173 /* quick HACK, XXX FIXME MUST be optimized */\
2176 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2177 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2178 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2179 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2180 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2181 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2182 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2183 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
/* add rounder (%2), >>5, clamp/pack, emit the 8 output pixels */\
2185 "movq (%0), %%mm0 \n\t"\
2186 "movq 8(%0), %%mm1 \n\t"\
2187 "paddw %2, %%mm0 \n\t"\
2188 "paddw %2, %%mm1 \n\t"\
2189 "psraw $5, %%mm0 \n\t"\
2190 "psraw $5, %%mm1 \n\t"\
2191 "packuswb %%mm1, %%mm0 \n\t"\
2192 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2193 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2201 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2203 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204 uint64_t temp[17*4];\
2205 uint64_t *temp_ptr= temp;\
2210 "pxor %%mm7, %%mm7 \n\t"\
2212 "movq (%0), %%mm0 \n\t"\
2213 "movq (%0), %%mm1 \n\t"\
2214 "movq 8(%0), %%mm2 \n\t"\
2215 "movq 8(%0), %%mm3 \n\t"\
2216 "punpcklbw %%mm7, %%mm0 \n\t"\
2217 "punpckhbw %%mm7, %%mm1 \n\t"\
2218 "punpcklbw %%mm7, %%mm2 \n\t"\
2219 "punpckhbw %%mm7, %%mm3 \n\t"\
2220 "movq %%mm0, (%1) \n\t"\
2221 "movq %%mm1, 17*8(%1) \n\t"\
2222 "movq %%mm2, 2*17*8(%1) \n\t"\
2223 "movq %%mm3, 3*17*8(%1) \n\t"\
2228 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2229 : "r" ((long)srcStride)\
2236 /*FIXME reorder for speed */\
2238 /*"pxor %%mm7, %%mm7 \n\t"*/\
2240 "movq (%0), %%mm0 \n\t"\
2241 "movq 8(%0), %%mm1 \n\t"\
2242 "movq 16(%0), %%mm2 \n\t"\
2243 "movq 24(%0), %%mm3 \n\t"\
2244 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2245 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2247 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2249 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2251 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2252 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2254 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2255 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2261 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2263 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2270 "add $136, %0 \n\t"\
2275 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2276 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2281 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282 uint64_t temp[9*2];\
2283 uint64_t *temp_ptr= temp;\
2288 "pxor %%mm7, %%mm7 \n\t"\
2290 "movq (%0), %%mm0 \n\t"\
2291 "movq (%0), %%mm1 \n\t"\
2292 "punpcklbw %%mm7, %%mm0 \n\t"\
2293 "punpckhbw %%mm7, %%mm1 \n\t"\
2294 "movq %%mm0, (%1) \n\t"\
2295 "movq %%mm1, 9*8(%1) \n\t"\
2300 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2301 : "r" ((long)srcStride)\
2308 /*FIXME reorder for speed */\
2310 /*"pxor %%mm7, %%mm7 \n\t"*/\
2312 "movq (%0), %%mm0 \n\t"\
2313 "movq 8(%0), %%mm1 \n\t"\
2314 "movq 16(%0), %%mm2 \n\t"\
2315 "movq 24(%0), %%mm3 \n\t"\
2316 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2317 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2319 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2321 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2323 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2325 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2327 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2328 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2335 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2336 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2341 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2342 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2345 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2347 uint8_t * const half= (uint8_t*)temp;\
2348 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2349 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2352 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2353 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2356 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2358 uint8_t * const half= (uint8_t*)temp;\
2359 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2360 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2363 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2365 uint8_t * const half= (uint8_t*)temp;\
2366 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2367 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2370 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2371 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2374 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2376 uint8_t * const half= (uint8_t*)temp;\
2377 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2378 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2380 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2381 uint64_t half[8 + 9];\
2382 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2383 uint8_t * const halfHV= ((uint8_t*)half);\
2384 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2385 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2387 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2389 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2390 uint64_t half[8 + 9];\
2391 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2392 uint8_t * const halfHV= ((uint8_t*)half);\
2393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2394 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2396 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2398 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2399 uint64_t half[8 + 9];\
2400 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2401 uint8_t * const halfHV= ((uint8_t*)half);\
2402 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2403 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2404 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2405 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2407 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2408 uint64_t half[8 + 9];\
2409 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2410 uint8_t * const halfHV= ((uint8_t*)half);\
2411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2412 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2414 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2416 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2417 uint64_t half[8 + 9];\
2418 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2419 uint8_t * const halfHV= ((uint8_t*)half);\
2420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2421 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2422 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2424 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2425 uint64_t half[8 + 9];\
2426 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2427 uint8_t * const halfHV= ((uint8_t*)half);\
2428 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2429 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2430 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2432 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2433 uint64_t half[8 + 9];\
2434 uint8_t * const halfH= ((uint8_t*)half);\
2435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2436 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2437 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2439 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2440 uint64_t half[8 + 9];\
2441 uint8_t * const halfH= ((uint8_t*)half);\
2442 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2443 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2444 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2446 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2448 uint8_t * const halfH= ((uint8_t*)half);\
2449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2452 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2453 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2456 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2458 uint8_t * const half= (uint8_t*)temp;\
2459 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2460 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2463 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2464 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2467 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2469 uint8_t * const half= (uint8_t*)temp;\
2470 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2471 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2474 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2476 uint8_t * const half= (uint8_t*)temp;\
2477 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2478 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2481 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2482 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2485 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2487 uint8_t * const half= (uint8_t*)temp;\
2488 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2489 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2491 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2492 uint64_t half[16*2 + 17*2];\
2493 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2494 uint8_t * const halfHV= ((uint8_t*)half);\
2495 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2496 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2498 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2500 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2501 uint64_t half[16*2 + 17*2];\
2502 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2503 uint8_t * const halfHV= ((uint8_t*)half);\
2504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2505 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2509 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2510 uint64_t half[16*2 + 17*2];\
2511 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2512 uint8_t * const halfHV= ((uint8_t*)half);\
2513 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2514 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2515 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2516 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2518 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2519 uint64_t half[16*2 + 17*2];\
2520 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2521 uint8_t * const halfHV= ((uint8_t*)half);\
2522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2523 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2525 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2527 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2528 uint64_t half[16*2 + 17*2];\
2529 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2530 uint8_t * const halfHV= ((uint8_t*)half);\
2531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2532 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2533 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2535 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2536 uint64_t half[16*2 + 17*2];\
2537 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2538 uint8_t * const halfHV= ((uint8_t*)half);\
2539 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2540 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2541 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2543 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2544 uint64_t half[17*2];\
2545 uint8_t * const halfH= ((uint8_t*)half);\
2546 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2547 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2548 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2550 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2551 uint64_t half[17*2];\
2552 uint8_t * const halfH= ((uint8_t*)half);\
2553 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2554 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2555 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2557 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2558 uint64_t half[17*2];\
2559 uint8_t * const halfH= ((uint8_t*)half);\
2560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2564 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" /* plain store of a into b */
2565 #define AVG_3DNOW_OP(a,b,temp, size) /* load b, byte-average with a via 3DNow! pavgusb, store back */ \
2566 "mov" #size " " #b ", " #temp " \n\t"\
2567 "pavgusb " #temp ", " #a " \n\t"\
2568 "mov" #size " " #a ", " #b " \n\t"
2569 #define AVG_MMX2_OP(a,b,temp, size) /* load b, byte-average with a via MMX2 pavgb, store back */ \
2570 "mov" #size " " #b ", " #temp " \n\t"\
2571 "pavgb " #temp ", " #a " \n\t"\
2572 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the MPEG-4 quarter-pel function families. QPEL_BASE emits the
 * shared lowpass kernels; QPEL_OP emits the per-CPU mc wrappers. The rounder
 * (ff_pw_16 vs ff_pw_15) selects rounding vs no-rounding variants, and the
 * OP macro selects plain store (put) vs averaging store (avg). */
2574 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2575 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2576 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2577 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2578 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2579 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2580 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2581 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2582 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2584 /***********************************/
2585 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* Bilinear ("2-tap") qpel approximation — per the comment above, not
 * compliant with any spec; enabled only via -lavdopts fast. QPEL_2TAP_XY
 * maps a sub-pixel position onto an existing halfpel routine; QPEL_2TAP_L3
 * maps it onto a 3-point linear combination (offsets S0/S1/S2); QPEL_2TAP
 * stamps out the full mc00..mc33 set for one size/CPU combination. */
2587 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2589 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2591 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2592 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2593 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2596 #define QPEL_2TAP(OPNAME, SIZE, MMX) /* emit all sub-pixel positions for one size/CPU */ \
2597 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2598 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2599 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2600 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2601 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2602 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2603 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2604 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2605 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2606 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2607 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2609 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2610 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2612 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2613 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2614 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2615 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2616 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2617 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2618 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2619 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2621 QPEL_2TAP(put_, 16, mmx2)
2622 QPEL_2TAP(avg_, 16, mmx2)
2623 QPEL_2TAP(put_, 8, mmx2)
2624 QPEL_2TAP(avg_, 8, mmx2)
2625 QPEL_2TAP(put_, 16, 3dnow)
2626 QPEL_2TAP(avg_, 16, 3dnow)
2627 QPEL_2TAP(put_, 8, 3dnow)
2628 QPEL_2TAP(avg_, 8, 3dnow)
/* Intentionally-empty placeholder used to stub out DSP function pointers
 * (e.g. for benchmarking/disabling a code path). Declared with (void):
 * an empty () parameter list is a pre-ANSI declaration with unchecked
 * arguments, whereas (void) is a proper prototype. */
static void just_return(void) { return; }
/* Wire one qpel slot of the DSPContext: assigns the put, put_no_rnd and avg
 * variants of a function family in one statement-like macro invocation. */
2635 #define SET_QPEL_FUNC(postfix1, postfix2) \
2636 c->put_ ## postfix1 = put_ ## postfix2;\
2637 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2638 c->avg_ ## postfix1 = avg_ ## postfix2;
/* Global motion compensation, MMX version.
 * dst/src: destination/source pixels; stride: line stride for both;
 * h: block height; ox/oy: start offset (fixed point, see shift);
 * dxx/dxy/dyx/dyy: per-pixel offset deltas; shift: fixed-point precision;
 * r: rounder added before the final shift; width/height: picture size.
 * Falls back to the C version (ff_gmc_c) when the fullpel offset is not
 * constant over the block or when more than 16 bits of sub-pel motion
 * vector are in use; otherwise does 2x2 bilinear interpolation with MMX,
 * 4 output pixels per inner asm block. */
2640 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2641 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2643 const int ix = ox>>(16+shift);
2644 const int iy = oy>>(16+shift);
2645 const int oxs = ox>>4;
2646 const int oys = oy>>4;
2647 const int dxxs = dxx>>4;
2648 const int dxys = dxy>>4;
2649 const int dyxs = dyx>>4;
2650 const int dyys = dyy>>4;
2651 const uint16_t r4[4] = {r,r,r,r};
2652 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2653 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2654 const uint64_t shift2 = 2*shift;
2655 uint8_t edge_buf[(h+1)*stride];
/* Total offset drift across the block; used to detect whether the fullpel
 * part of the offset stays constant (cheap common case). */
2658 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2659 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2660 const int dxh = dxy*(h-1);
2661 const int dyw = dyx*(w-1);
2662 if( // non-constant fullpel offset (3% of blocks)
2663 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
2664 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
2665 // uses more than 16 bits of subpel mv (only at huge resolution)
2666 || (dxx|dxy|dyx|dyy)&15 )
2668 //FIXME could still use mmx for some of the rows
2669 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2673 src += ix + iy*stride;
/* If the (w+1)x(h+1) read window leaves the picture, copy through the
 * edge-emulation buffer instead of reading out of bounds. */
2674 if( (unsigned)ix >= width-w ||
2675 (unsigned)iy >= height-h )
2677 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* mm6 = broadcast 16-bit scale constant, mm7 = 0 (for byte unpacking). */
2682 "movd %0, %%mm6 \n\t"
2683 "pxor %%mm7, %%mm7 \n\t"
2684 "punpcklwd %%mm6, %%mm6 \n\t"
2685 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process 4 output pixels at a time; dx4/dy4 hold the sub-pel x/y
 * coordinates of the 4 pixels, updated incrementally in the asm below. */
2689 for(x=0; x<w; x+=4){
2690 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2691 oxs - dxys + dxxs*(x+1),
2692 oxs - dxys + dxxs*(x+2),
2693 oxs - dxys + dxxs*(x+3) };
2694 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2695 oys - dyys + dyxs*(x+1),
2696 oys - dyys + dyxs*(x+2),
2697 oys - dyys + dyxs*(x+3) };
/* Advance dx4/dy4 by one row and extract the fractional parts. */
2701 "movq %0, %%mm4 \n\t"
2702 "movq %1, %%mm5 \n\t"
2703 "paddw %2, %%mm4 \n\t"
2704 "paddw %3, %%mm5 \n\t"
2705 "movq %%mm4, %0 \n\t"
2706 "movq %%mm5, %1 \n\t"
2707 "psrlw $12, %%mm4 \n\t"
2708 "psrlw $12, %%mm5 \n\t"
2709 : "+m"(*dx4), "+m"(*dy4)
2710 : "m"(*dxy4), "m"(*dyy4)
/* Bilinear weights: (s-dx)*(s-dy), dx*dy, (s-dx)*dy, dx*(s-dy). */
2714 "movq %%mm6, %%mm2 \n\t"
2715 "movq %%mm6, %%mm1 \n\t"
2716 "psubw %%mm4, %%mm2 \n\t"
2717 "psubw %%mm5, %%mm1 \n\t"
2718 "movq %%mm2, %%mm0 \n\t"
2719 "movq %%mm4, %%mm3 \n\t"
2720 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2721 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2722 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2723 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2725 "movd %4, %%mm5 \n\t"
2726 "movd %3, %%mm4 \n\t"
2727 "punpcklbw %%mm7, %%mm5 \n\t"
2728 "punpcklbw %%mm7, %%mm4 \n\t"
2729 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2730 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2732 "movd %2, %%mm5 \n\t"
2733 "movd %1, %%mm4 \n\t"
2734 "punpcklbw %%mm7, %%mm5 \n\t"
2735 "punpcklbw %%mm7, %%mm4 \n\t"
2736 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2737 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
/* Accumulate the 4 weighted taps, add rounder r4, shift down and pack. */
2738 "paddw %5, %%mm1 \n\t"
2739 "paddw %%mm3, %%mm2 \n\t"
2740 "paddw %%mm1, %%mm0 \n\t"
2741 "paddw %%mm2, %%mm0 \n\t"
2743 "psrlw %6, %%mm0 \n\t"
2744 "packuswb %%mm0, %%mm0 \n\t"
2745 "movd %%mm0, %0 \n\t"
2747 : "=m"(dst[x+y*stride])
2748 : "m"(src[0]), "m"(src[1]),
2749 "m"(src[stride]), "m"(src[stride+1]),
2750 "m"(*r4), "m"(shift2)
2758 #ifdef CONFIG_ENCODERS
/* Horizontal add of the two 32-bit dwords in an MMX register
 * (a += a>>32, result in low dword; t is a scratch register). */
2760 #define PHADDD(a, t)\
2761 "movq "#a", "#t" \n\t"\
2762 "psrlq $32, "#a" \n\t"\
2763 "paddd "#t", "#a" \n\t"
/* Rounding behavior of the multiply-high variants used below: */
2765 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2766 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2767 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
/* Plain-MMX rounding multiply-high: pmulhw has no built-in rounding, so
 * emulate it by adding the rounder o and shifting right by 1. */
2769 #define PMULHRW(x, y, s, o)\
2770 "pmulhw " #s ", "#x " \n\t"\
2771 "pmulhw " #s ", "#y " \n\t"\
2772 "paddw " #o ", "#x " \n\t"\
2773 "paddw " #o ", "#y " \n\t"\
2774 "psraw $1, "#x " \n\t"\
2775 "psraw $1, "#y " \n\t"
/* First template instantiation: plain MMX (_mmx suffix). */
2776 #define DEF(x) x ## _mmx
2777 #define SET_RND MOVQ_WONE
2778 #define SCALE_OFFSET 1
2780 #include "dsputil_mmx_qns.h"
/* Second instantiation: 3DNow! (_3dnow), using hardware pmulhrw. */
2787 #define DEF(x) x ## _3dnow
2789 #define SCALE_OFFSET 0
2790 #define PMULHRW(x, y, s, o)\
2791 "pmulhrw " #s ", "#x " \n\t"\
2792 "pmulhrw " #s ", "#y " \n\t"
2794 #include "dsputil_mmx_qns.h"
/* Third instantiation: SSSE3 (_ssse3), using pmulhrsw and a pshufw-based
 * horizontal add. */
2803 #define DEF(x) x ## _ssse3
2805 #define SCALE_OFFSET -1
2806 #define PHADDD(a, t)\
2807 "pshufw $0x0E, "#a", "#t" \n\t"\
2808 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
2809 #define PMULHRW(x, y, s, o)\
2810 "pmulhrsw " #s ", "#x " \n\t"\
2811 "pmulhrsw " #s ", "#y " \n\t"
2813 #include "dsputil_mmx_qns.h"
2822 #endif /* CONFIG_ENCODERS */
/* Generate a prefetch helper named `name` that issues the given prefetch
 * instruction (`op`) for a strided region of memory. */
2824 #define PREFETCH(name, op) \
2825 static void name(void *mem, int stride, int h){\
2826 const uint8_t *p= mem;\
2828 asm volatile(#op" %0" :: "m"(*p));\
2832 PREFETCH(prefetch_mmx2, prefetcht0) /* SSE-style prefetcht0 */
2833 PREFETCH(prefetch_3dnow, prefetch) /* 3DNow! prefetch */
2836 #include "h264dsp_mmx.c"
2839 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
/* CAVS full-pel (mc00) copy/average wrappers: the qpel "no motion" cases
 * are plain pixel copies, so forward to the generic MMX pixel routines. */
2841 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2842 put_pixels8_mmx(dst, src, stride, 8);
2844 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2845 avg_pixels8_mmx(dst, src, stride, 8);
2847 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2848 put_pixels16_mmx(dst, src, stride, 16);
2850 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2851 avg_pixels16_mmx(dst, src, stride, 16);
2854 /* external functions, from idct_mmx.c */
2855 void ff_mmx_idct(DCTELEM *block);
2856 void ff_mmxext_idct(DCTELEM *block);
2858 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */
/* IDCT glue: each wrapper runs an external IDCT in-place on `block`, then
 * either stores (put_pixels_clamped) or adds (add_pixels_clamped) the
 * clamped result into the destination picture. */
2861 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2863 ff_mmx_idct (block);
2864 put_pixels_clamped_mmx(block, dest, line_size);
2866 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2868 ff_mmx_idct (block);
2869 add_pixels_clamped_mmx(block, dest, line_size);
/* MMXEXT variants of the libmpeg2 IDCT glue. */
2871 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2873 ff_mmxext_idct (block);
2874 put_pixels_clamped_mmx(block, dest, line_size);
2876 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2878 ff_mmxext_idct (block);
2879 add_pixels_clamped_mmx(block, dest, line_size);
/* XviD IDCT glue (MMX and MMX2 kernels). */
2882 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2884 ff_idct_xvid_mmx (block);
2885 put_pixels_clamped_mmx(block, dest, line_size);
2887 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2889 ff_idct_xvid_mmx (block);
2890 add_pixels_clamped_mmx(block, dest, line_size);
2892 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2894 ff_idct_xvid_mmx2 (block);
2895 put_pixels_clamped_mmx(block, dest, line_size);
2897 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2899 ff_idct_xvid_mmx2 (block);
2900 add_pixels_clamped_mmx(block, dest, line_size);
/* Vorbis inverse channel coupling, 3DNow! version: converts the
 * magnitude/angle channel pair back to two residue channels in place,
 * 2 floats per iteration (mm7 is kept zero for the sign compares).
 * Ends with femms to leave the FPU usable again. */
2903 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2906 asm volatile("pxor %%mm7, %%mm7":);
2907 for(i=0; i<blocksize; i+=2) {
2909 "movq %0, %%mm0 \n\t"
2910 "movq %1, %%mm1 \n\t"
2911 "movq %%mm0, %%mm2 \n\t"
2912 "movq %%mm1, %%mm3 \n\t"
2913 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2914 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2915 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2916 "pxor %%mm2, %%mm1 \n\t"
2917 "movq %%mm3, %%mm4 \n\t"
2918 "pand %%mm1, %%mm3 \n\t"
2919 "pandn %%mm1, %%mm4 \n\t"
2920 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2921 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2922 "movq %%mm3, %1 \n\t"
2923 "movq %%mm0, %0 \n\t"
2924 :"+m"(mag[i]), "+m"(ang[i])
2928 asm volatile("femms");
/* Vorbis inverse channel coupling, SSE version: same transform as the
 * 3DNow! routine but 4 floats per iteration; xmm5 holds the sign-bit
 * mask (0x80000000 per lane) loaded from ff_pdw_80000000. */
2930 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2935 "movaps %0, %%xmm5 \n\t"
2936 ::"m"(ff_pdw_80000000[0])
2938 for(i=0; i<blocksize; i+=4) {
2940 "movaps %0, %%xmm0 \n\t"
2941 "movaps %1, %%xmm1 \n\t"
2942 "xorps %%xmm2, %%xmm2 \n\t"
2943 "xorps %%xmm3, %%xmm3 \n\t"
2944 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2945 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2946 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2947 "xorps %%xmm2, %%xmm1 \n\t"
2948 "movaps %%xmm3, %%xmm4 \n\t"
2949 "andps %%xmm1, %%xmm3 \n\t"
2950 "andnps %%xmm1, %%xmm4 \n\t"
2951 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2952 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2953 "movaps %%xmm3, %1 \n\t"
2954 "movaps %%xmm0, %0 \n\t"
2955 :"+m"(mag[i]), "+m"(ang[i])
/* Element-wise float multiply dst[i] *= src[i], 3DNow! version
 * (4 floats per iteration via two 8-byte pfmul ops). */
2961 static void vector_fmul_3dnow(float *dst, const float *src, int len){
2965 "movq (%1,%0), %%mm0 \n\t"
2966 "movq 8(%1,%0), %%mm1 \n\t"
2967 "pfmul (%2,%0), %%mm0 \n\t"
2968 "pfmul 8(%2,%0), %%mm1 \n\t"
2969 "movq %%mm0, (%1,%0) \n\t"
2970 "movq %%mm1, 8(%1,%0) \n\t"
/* Element-wise float multiply dst[i] *= src[i], SSE version
 * (8 floats per iteration via two aligned 16-byte mulps ops). */
2979 static void vector_fmul_sse(float *dst, const float *src, int len){
2983 "movaps (%1,%0), %%xmm0 \n\t"
2984 "movaps 16(%1,%0), %%xmm1 \n\t"
2985 "mulps (%2,%0), %%xmm0 \n\t"
2986 "mulps 16(%2,%0), %%xmm1 \n\t"
2987 "movaps %%xmm0, (%1,%0) \n\t"
2988 "movaps %%xmm1, 16(%1,%0) \n\t"
/* dst[i] = src0[i] * src1[len-1-i] — 3DNow!ext version; pswapd reverses
 * the float pair inside each quadword of the backwards-walked src1.
 * Ends with femms to restore FPU state. */
2997 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
3001 "pswapd 8(%1), %%mm0 \n\t"
3002 "pswapd (%1), %%mm1 \n\t"
3003 "pfmul (%3,%0), %%mm0 \n\t"
3004 "pfmul 8(%3,%0), %%mm1 \n\t"
3005 "movq %%mm0, (%2,%0) \n\t"
3006 "movq %%mm1, 8(%2,%0) \n\t"
3010 :"+r"(i), "+r"(src1)
3011 :"r"(dst), "r"(src0)
3013 asm volatile("femms");
/* dst[i] = src0[i] * src1[len-1-i] — SSE version; shufps $0x1b reverses
 * the 4 floats of each xmm loaded from the backwards-walked src1. */
3015 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3019 "movaps 16(%1), %%xmm0 \n\t"
3020 "movaps (%1), %%xmm1 \n\t"
3021 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
3022 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3023 "mulps (%3,%0), %%xmm0 \n\t"
3024 "mulps 16(%3,%0), %%xmm1 \n\t"
3025 "movaps %%xmm0, (%2,%0) \n\t"
3026 "movaps %%xmm1, 16(%2,%0) \n\t"
3030 :"+r"(i), "+r"(src1)
3031 :"r"(dst), "r"(src0)
/* dst[i*step] = src0[i]*src1[i] + src2[i] (+src3) — 3DNow! version.
 * Only the common cases step==2/src3==0 (interleaved store via movd)
 * and step==1/src3==0 (contiguous store) are vectorized; anything else
 * falls through to the C reference ff_vector_fmul_add_add_c.
 * femms at the end restores the FPU for subsequent x87 code. */
3035 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
3036 const float *src2, int src3, int len, int step){
3038 if(step == 2 && src3 == 0){
3042 "movq (%2,%0), %%mm0 \n\t"
3043 "movq 8(%2,%0), %%mm1 \n\t"
3044 "pfmul (%3,%0), %%mm0 \n\t"
3045 "pfmul 8(%3,%0), %%mm1 \n\t"
3046 "pfadd (%4,%0), %%mm0 \n\t"
3047 "pfadd 8(%4,%0), %%mm1 \n\t"
3048 "movd %%mm0, (%1) \n\t"
3049 "movd %%mm1, 16(%1) \n\t"
3050 "psrlq $32, %%mm0 \n\t"
3051 "psrlq $32, %%mm1 \n\t"
3052 "movd %%mm0, 8(%1) \n\t"
3053 "movd %%mm1, 24(%1) \n\t"
3058 :"r"(src0), "r"(src1), "r"(src2)
3062 else if(step == 1 && src3 == 0){
3065 "movq (%2,%0), %%mm0 \n\t"
3066 "movq 8(%2,%0), %%mm1 \n\t"
3067 "pfmul (%3,%0), %%mm0 \n\t"
3068 "pfmul 8(%3,%0), %%mm1 \n\t"
3069 "pfadd (%4,%0), %%mm0 \n\t"
3070 "pfadd 8(%4,%0), %%mm1 \n\t"
3071 "movq %%mm0, (%1,%0) \n\t"
3072 "movq %%mm1, 8(%1,%0) \n\t"
3076 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* Fallback for unsupported step/src3 combinations. */
3081 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3082 asm volatile("femms");
/* dst[i*step] = src0[i]*src1[i] + src2[i] (+src3) — SSE version.
 * step==2/src3==0 scatters lanes via movss/movhlps/shufps; step==1/src3==0
 * stores contiguously; other parameter combinations use the C fallback. */
3084 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3085 const float *src2, int src3, int len, int step){
3087 if(step == 2 && src3 == 0){
3091 "movaps (%2,%0), %%xmm0 \n\t"
3092 "movaps 16(%2,%0), %%xmm1 \n\t"
3093 "mulps (%3,%0), %%xmm0 \n\t"
3094 "mulps 16(%3,%0), %%xmm1 \n\t"
3095 "addps (%4,%0), %%xmm0 \n\t"
3096 "addps 16(%4,%0), %%xmm1 \n\t"
3097 "movss %%xmm0, (%1) \n\t"
3098 "movss %%xmm1, 32(%1) \n\t"
3099 "movhlps %%xmm0, %%xmm2 \n\t"
3100 "movhlps %%xmm1, %%xmm3 \n\t"
3101 "movss %%xmm2, 16(%1) \n\t"
3102 "movss %%xmm3, 48(%1) \n\t"
3103 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
3104 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
3105 "movss %%xmm0, 8(%1) \n\t"
3106 "movss %%xmm1, 40(%1) \n\t"
3107 "movhlps %%xmm0, %%xmm2 \n\t"
3108 "movhlps %%xmm1, %%xmm3 \n\t"
3109 "movss %%xmm2, 24(%1) \n\t"
3110 "movss %%xmm3, 56(%1) \n\t"
3115 :"r"(src0), "r"(src1), "r"(src2)
3119 else if(step == 1 && src3 == 0){
3122 "movaps (%2,%0), %%xmm0 \n\t"
3123 "movaps 16(%2,%0), %%xmm1 \n\t"
3124 "mulps (%3,%0), %%xmm0 \n\t"
3125 "mulps 16(%3,%0), %%xmm1 \n\t"
3126 "addps (%4,%0), %%xmm0 \n\t"
3127 "addps 16(%4,%0), %%xmm1 \n\t"
3128 "movaps %%xmm0, (%1,%0) \n\t"
3129 "movaps %%xmm1, 16(%1,%0) \n\t"
3133 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* Fallback for unsupported step/src3 combinations. */
3138 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
/* Convert floats to int16 with saturation (packssdw), 3DNow! version:
 * 4 samples per iteration via two pf2id conversions; femms at the end. */
3141 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3142 // not bit-exact: pf2id uses different rounding than C and SSE
3144 for(i=0; i<len; i+=4) {
3146 "pf2id %1, %%mm0 \n\t"
3147 "pf2id %2, %%mm1 \n\t"
3148 "packssdw %%mm1, %%mm0 \n\t"
3149 "movq %%mm0, %0 \n\t"
3151 :"m"(src[i]), "m"(src[i+2])
3154 asm volatile("femms");
/* Convert floats to int16 with saturation, SSE version: cvtps2pi writes
 * to MMX registers, so the routine ends with emms to release them. */
3156 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3158 for(i=0; i<len; i+=4) {
3160 "cvtps2pi %1, %%mm0 \n\t"
3161 "cvtps2pi %2, %%mm1 \n\t"
3162 "packssdw %%mm1, %%mm0 \n\t"
3163 "movq %%mm0, %0 \n\t"
3165 :"m"(src[i]), "m"(src[i+2])
3168 asm volatile("emms");
3171 #ifdef CONFIG_SNOW_DECODER
3172 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
3173 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
3174 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3175 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3176 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3177 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3178 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3179 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3182 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3184 mm_flags = mm_support();
3186 if (avctx->dsp_mask) {
3187 if (avctx->dsp_mask & FF_MM_FORCE)
3188 mm_flags |= (avctx->dsp_mask & 0xffff);
3190 mm_flags &= ~(avctx->dsp_mask & 0xffff);
3194 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3195 if (mm_flags & MM_MMX)
3196 av_log(avctx, AV_LOG_INFO, " mmx");
3197 if (mm_flags & MM_MMXEXT)
3198 av_log(avctx, AV_LOG_INFO, " mmxext");
3199 if (mm_flags & MM_3DNOW)
3200 av_log(avctx, AV_LOG_INFO, " 3dnow");
3201 if (mm_flags & MM_SSE)
3202 av_log(avctx, AV_LOG_INFO, " sse");
3203 if (mm_flags & MM_SSE2)
3204 av_log(avctx, AV_LOG_INFO, " sse2");
3205 av_log(avctx, AV_LOG_INFO, "\n");
3208 if (mm_flags & MM_MMX) {
3209 const int idct_algo= avctx->idct_algo;
3211 #ifdef CONFIG_ENCODERS
3212 const int dct_algo = avctx->dct_algo;
3213 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3214 if(mm_flags & MM_SSE2){
3215 c->fdct = ff_fdct_sse2;
3216 }else if(mm_flags & MM_MMXEXT){
3217 c->fdct = ff_fdct_mmx2;
3219 c->fdct = ff_fdct_mmx;
3222 #endif //CONFIG_ENCODERS
3223 if(avctx->lowres==0){
3224 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3225 c->idct_put= ff_simple_idct_put_mmx;
3226 c->idct_add= ff_simple_idct_add_mmx;
3227 c->idct = ff_simple_idct_mmx;
3228 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3230 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
3231 if(mm_flags & MM_MMXEXT){
3232 c->idct_put= ff_libmpeg2mmx2_idct_put;
3233 c->idct_add= ff_libmpeg2mmx2_idct_add;
3234 c->idct = ff_mmxext_idct;
3236 c->idct_put= ff_libmpeg2mmx_idct_put;
3237 c->idct_add= ff_libmpeg2mmx_idct_add;
3238 c->idct = ff_mmx_idct;
3240 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3242 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
3243 idct_algo==FF_IDCT_VP3 &&
3244 avctx->codec->id!=CODEC_ID_THEORA &&
3245 !(avctx->flags & CODEC_FLAG_BITEXACT)){
3246 if(mm_flags & MM_SSE2){
3247 c->idct_put= ff_vp3_idct_put_sse2;
3248 c->idct_add= ff_vp3_idct_add_sse2;
3249 c->idct = ff_vp3_idct_sse2;
3250 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3252 ff_vp3_dsp_init_mmx();
3253 c->idct_put= ff_vp3_idct_put_mmx;
3254 c->idct_add= ff_vp3_idct_add_mmx;
3255 c->idct = ff_vp3_idct_mmx;
3256 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
3258 }else if(idct_algo==FF_IDCT_CAVS){
3259 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3260 }else if(idct_algo==FF_IDCT_XVIDMMX){
3261 if(mm_flags & MM_MMXEXT){
3262 c->idct_put= ff_idct_xvid_mmx2_put;
3263 c->idct_add= ff_idct_xvid_mmx2_add;
3264 c->idct = ff_idct_xvid_mmx2;
3266 c->idct_put= ff_idct_xvid_mmx_put;
3267 c->idct_add= ff_idct_xvid_mmx_add;
3268 c->idct = ff_idct_xvid_mmx;
3273 #ifdef CONFIG_ENCODERS
3274 c->get_pixels = get_pixels_mmx;
3275 c->diff_pixels = diff_pixels_mmx;
3276 #endif //CONFIG_ENCODERS
3277 c->put_pixels_clamped = put_pixels_clamped_mmx;
3278 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
3279 c->add_pixels_clamped = add_pixels_clamped_mmx;
3280 c->clear_blocks = clear_blocks_mmx;
3281 #ifdef CONFIG_ENCODERS
3282 c->pix_sum = pix_sum16_mmx;
3283 #endif //CONFIG_ENCODERS
3285 c->put_pixels_tab[0][0] = put_pixels16_mmx;
3286 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
3287 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
3288 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
3290 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
3291 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
3292 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
3293 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
3295 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
3296 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
3297 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
3298 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
3300 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
3301 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
3302 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
3303 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
3305 c->put_pixels_tab[1][0] = put_pixels8_mmx;
3306 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
3307 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
3308 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
3310 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
3311 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
3312 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
3313 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
3315 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
3316 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
3317 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
3318 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
3320 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
3321 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
3322 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
3323 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
3327 c->add_bytes= add_bytes_mmx;
3328 #ifdef CONFIG_ENCODERS
3329 c->diff_bytes= diff_bytes_mmx;
3330 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
3332 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
3333 c->hadamard8_diff[1]= hadamard8_diff_mmx;
3335 c->pix_norm1 = pix_norm1_mmx;
3336 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
3337 c->sse[1] = sse8_mmx;
3338 c->vsad[4]= vsad_intra16_mmx;
3340 c->nsse[0] = nsse16_mmx;
3341 c->nsse[1] = nsse8_mmx;
3342 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3343 c->vsad[0] = vsad16_mmx;
3346 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3347 c->try_8x8basis= try_8x8basis_mmx;
3349 c->add_8x8basis= add_8x8basis_mmx;
3351 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
3353 #endif //CONFIG_ENCODERS
3355 if (ENABLE_ANY_H263) {
3356 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
3357 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
3359 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
3360 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
3362 c->h264_idct_dc_add=
3363 c->h264_idct_add= ff_h264_idct_add_mmx;
3364 c->h264_idct8_dc_add=
3365 c->h264_idct8_add= ff_h264_idct8_add_mmx;
3367 if (mm_flags & MM_MMXEXT) {
3368 c->prefetch = prefetch_mmx2;
3370 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
3371 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
3373 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
3374 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
3375 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
3377 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
3378 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
3380 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
3381 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
3382 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
3384 #ifdef CONFIG_ENCODERS
3385 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
3386 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
3387 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
3388 c->vsad[4]= vsad_intra16_mmx2;
3389 #endif //CONFIG_ENCODERS
3391 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
3392 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
3394 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3395 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
3396 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
3397 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
3398 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
3399 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
3400 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3401 #ifdef CONFIG_ENCODERS
3402 c->vsad[0] = vsad16_mmx2;
3403 #endif //CONFIG_ENCODERS
3407 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
3408 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
3409 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
3410 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
3411 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
3412 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
3413 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
3414 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
3415 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
3416 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
3417 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
3418 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
3419 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
3420 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
3421 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
3422 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
3423 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
3424 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
3425 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
3426 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
3427 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
3428 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
3429 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
3430 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
3431 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
3432 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
3433 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
3434 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
3435 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
3436 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
3437 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
3438 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
/* Wire up all 16 quarter-pel motion-compensation entries of one
 * pixels_tab row to the matching MMX2 routines.  The second index
 * (0..15) encodes the sub-pel phase as mcXY, where X is the horizontal
 * and Y the vertical quarter-pel offset (mc00 = full-pel .. mc33).
 * PFX selects the table family (e.g. put_h264_qpel / avg_2tap_qpel),
 * IDX the table row (block-size class), and NUM the block width used
 * in the generated function names via token pasting. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
3460 dspfunc(put_h264_qpel, 0, 16);
3461 dspfunc(put_h264_qpel, 1, 8);
3462 dspfunc(put_h264_qpel, 2, 4);
3463 dspfunc(avg_h264_qpel, 0, 16);
3464 dspfunc(avg_h264_qpel, 1, 8);
3465 dspfunc(avg_h264_qpel, 2, 4);
3467 dspfunc(put_2tap_qpel, 0, 16);
3468 dspfunc(put_2tap_qpel, 1, 8);
3469 dspfunc(avg_2tap_qpel, 0, 16);
3470 dspfunc(avg_2tap_qpel, 1, 8);
3473 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
3474 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3475 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
3476 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
3477 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
3478 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
3479 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
3480 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
3481 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
3482 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3483 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
3485 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3486 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3487 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3488 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3489 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3490 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3491 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3492 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3494 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3495 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3496 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3497 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3498 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3499 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3500 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3501 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3503 #ifdef CONFIG_CAVS_DECODER
3504 ff_cavsdsp_init_mmx2(c, avctx);
3507 #ifdef CONFIG_ENCODERS
3508 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3509 #endif //CONFIG_ENCODERS
3510 } else if (mm_flags & MM_3DNOW) {
3511 c->prefetch = prefetch_3dnow;
3513 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
3514 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
3516 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
3517 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
3518 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
3520 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
3521 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
3523 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
3524 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
3525 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
3527 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3528 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
3529 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
3530 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
3531 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
3532 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
3533 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
3536 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
3537 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
3538 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
3539 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
3540 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
3541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
3542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
3543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
3544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
3545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
3546 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
3547 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
3548 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
3549 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
3550 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
3551 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
3552 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
3553 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
3554 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
3555 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
3556 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
3557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
3558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
3559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
3560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
3561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
3562 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
3563 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
3564 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
3565 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
3566 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
3567 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
/* Same pattern as the MMX2 variant above this one in the file, but for
 * the 3DNow code path: fill all 16 quarter-pel phases (mc00..mc33) of
 * one pixels_tab row with the PFX##NUM##_mcXY_3dnow functions.
 * PFX = table family, IDX = block-size row, NUM = block width pasted
 * into the symbol name. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
3587 dspfunc(put_h264_qpel, 0, 16);
3588 dspfunc(put_h264_qpel, 1, 8);
3589 dspfunc(put_h264_qpel, 2, 4);
3590 dspfunc(avg_h264_qpel, 0, 16);
3591 dspfunc(avg_h264_qpel, 1, 8);
3592 dspfunc(avg_h264_qpel, 2, 4);
3594 dspfunc(put_2tap_qpel, 0, 16);
3595 dspfunc(put_2tap_qpel, 1, 8);
3596 dspfunc(avg_2tap_qpel, 0, 16);
3597 dspfunc(avg_2tap_qpel, 1, 8);
3599 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
3600 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
3603 #ifdef CONFIG_ENCODERS
3604 if(mm_flags & MM_SSE2){
3605 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3606 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3607 c->hadamard8_diff[1]= hadamard8_diff_sse2;
3611 if(mm_flags & MM_SSSE3){
3612 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3613 c->try_8x8basis= try_8x8basis_ssse3;
3615 c->add_8x8basis= add_8x8basis_ssse3;
3616 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
3617 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
3618 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
3623 #ifdef CONFIG_SNOW_DECODER
3624 if(mm_flags & MM_SSE2 & 0){
3625 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3627 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3629 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3632 if(mm_flags & MM_MMXEXT){
3633 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3635 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3638 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3642 if(mm_flags & MM_3DNOW){
3643 #ifdef CONFIG_ENCODERS
3644 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3645 c->try_8x8basis= try_8x8basis_3dnow;
3647 c->add_8x8basis= add_8x8basis_3dnow;
3648 #endif //CONFIG_ENCODERS
3649 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
3650 c->vector_fmul = vector_fmul_3dnow;
3651 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
3652 c->float_to_int16 = float_to_int16_3dnow;
3654 if(mm_flags & MM_3DNOWEXT)
3655 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
3656 if(mm_flags & MM_SSE){
3657 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3658 c->vector_fmul = vector_fmul_sse;
3659 c->float_to_int16 = float_to_int16_sse;
3660 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3661 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3663 if(mm_flags & MM_3DNOW)
3664 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
3667 #ifdef CONFIG_ENCODERS
3668 dsputil_init_pix_mmx(c, avctx);
3669 #endif //CONFIG_ENCODERS
3671 // for speed testing
3672 get_pixels = just_return;
3673 put_pixels_clamped = just_return;
3674 add_pixels_clamped = just_return;
3676 pix_abs16x16 = just_return;
3677 pix_abs16x16_x2 = just_return;
3678 pix_abs16x16_y2 = just_return;
3679 pix_abs16x16_xy2 = just_return;
3681 put_pixels_tab[0] = just_return;
3682 put_pixels_tab[1] = just_return;
3683 put_pixels_tab[2] = just_return;
3684 put_pixels_tab[3] = just_return;
3686 put_no_rnd_pixels_tab[0] = just_return;
3687 put_no_rnd_pixels_tab[1] = just_return;
3688 put_no_rnd_pixels_tab[2] = just_return;
3689 put_no_rnd_pixels_tab[3] = just_return;
3691 avg_pixels_tab[0] = just_return;
3692 avg_pixels_tab[1] = just_return;
3693 avg_pixels_tab[2] = just_return;
3694 avg_pixels_tab[3] = just_return;
3696 avg_no_rnd_pixels_tab[0] = just_return;
3697 avg_no_rnd_pixels_tab[1] = just_return;
3698 avg_no_rnd_pixels_tab[2] = just_return;
3699 avg_no_rnd_pixels_tab[3] = just_return;
3701 //av_fdct = just_return;
3702 //ff_idct = just_return;