2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
26 #include "simple_idct.h"
27 #include "mpegvideo.h"
30 #include "vp3dsp_mmx.h"
31 #include "vp3dsp_sse2.h"
36 extern void ff_idct_xvid_mmx(short *block);
37 extern void ff_idct_xvid_mmx2(short *block);
/* NOTE(review): declared here, presumably filled in by the dsputil init code —
 * the initialization is not visible in this excerpt. */
39 int mm_flags; /* multimedia extension flags */
41 /* pixel operations */
/* 8-byte-aligned packed constants used as "m" operands by the inline asm below.
 * Naming convention (visible from the values): ff_pw_N = four 16-bit words of N,
 * ff_pb_XX = eight bytes of 0xXX; mm_bone/mm_wone/mm_wtwo are byte-1 / word-1 / word-2. */
42 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
43 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
44 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
/* two 64-bit halves = one 16-byte-aligned vector of four dwords 0x80000000 (sign-bit mask) */
46 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
47 {0x8000000080000000ULL, 0x8000000080000000ULL};
49 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
50 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
51 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
52 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
53 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
54 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
55 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
56 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
57 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
59 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
60 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
61 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
62 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
63 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
64 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
65 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
/* Helper macros that load common constants into an MMX register named by #regd. */
67 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
68 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
/* all-ones then shift: regd = 0x0001000100010001 without a memory load */
70 #define MOVQ_WONE(regd) \
72 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
73 "psrlw $15, %%" #regd ::)
/* regd = 0xFEFEFEFEFEFEFEFE (byte 0xFF added to itself) */
75 #define MOVQ_BFE(regd) \
77 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
78 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* NOTE(review): MOVQ_BONE/MOVQ_WTWO are defined twice below — presumably the two
 * definitions sit on opposite sides of a #if/#else (PIC vs. non-PIC) that is missing
 * from this excerpt; the comment on the original line 84 supports that reading.
 * First variant: load the constant from memory. */
81 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
82 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
84 // for shared library it's better to use this way for accessing constants
/* Second variant: synthesize the constant in-register (no relocation needed). */
86 #define MOVQ_BONE(regd) \
88 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
89 "psrlw $15, %%" #regd " \n\t" \
90 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
92 #define MOVQ_WTWO(regd) \
94 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
95 "psrlw $15, %%" #regd " \n\t" \
96 "psllw $1, %%" #regd " \n\t"::)
/* Software byte-wise average macros (pre-SSE "pavgb" emulation).
 * PAVGB_MMX_NO_RND: (a & b) + (((a ^ b) & 0xFE) >> 1)  -> round-down average
 * PAVGB_MMX:        (a | b) - (((a ^ b) & 0xFE) >> 1)  -> round-up average */
100 // using regr as temporary and for the output result
101 // first argument is unmodified and second is trashed
102 // regfe is supposed to contain 0xfefefefefefefefe
103 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
104 "movq " #rega ", " #regr " \n\t"\
105 "pand " #regb ", " #regr " \n\t"\
106 "pxor " #rega ", " #regb " \n\t"\
107 "pand " #regfe "," #regb " \n\t"\
108 "psrlq $1, " #regb " \n\t"\
109 "paddb " #regb ", " #regr " \n\t"
111 #define PAVGB_MMX(rega, regb, regr, regfe) \
112 "movq " #rega ", " #regr " \n\t"\
113 "por " #regb ", " #regr " \n\t"\
114 "pxor " #rega ", " #regb " \n\t"\
115 "pand " #regfe "," #regb " \n\t"\
116 "psrlq $1, " #regb " \n\t"\
117 "psubb " #regb ", " #regr " \n\t"
/* Paired variants: compute two averages at once (a,b -> regr and c,d -> regp). */
119 // mm6 is supposed to contain 0xfefefefefefefefe
120 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
121 "movq " #rega ", " #regr " \n\t"\
122 "movq " #regc ", " #regp " \n\t"\
123 "pand " #regb ", " #regr " \n\t"\
124 "pand " #regd ", " #regp " \n\t"\
125 "pxor " #rega ", " #regb " \n\t"\
126 "pxor " #regc ", " #regd " \n\t"\
127 "pand %%mm6, " #regb " \n\t"\
128 "pand %%mm6, " #regd " \n\t"\
129 "psrlq $1, " #regb " \n\t"\
130 "psrlq $1, " #regd " \n\t"\
131 "paddb " #regb ", " #regr " \n\t"\
132 "paddb " #regd ", " #regp " \n\t"
134 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
135 "movq " #rega ", " #regr " \n\t"\
136 "movq " #regc ", " #regp " \n\t"\
137 "por " #regb ", " #regr " \n\t"\
138 "por " #regd ", " #regp " \n\t"\
139 "pxor " #rega ", " #regb " \n\t"\
140 "pxor " #regc ", " #regd " \n\t"\
141 "pand %%mm6, " #regb " \n\t"\
142 "pand %%mm6, " #regd " \n\t"\
143 "psrlq $1, " #regd " \n\t"\
144 "psrlq $1, " #regb " \n\t"\
145 "psubb " #regb ", " #regr " \n\t"\
146 "psubb " #regd ", " #regp " \n\t"
/* Template instantiation: the shared headers below are included repeatedly with
 * different DEF/SET_RND/PAVGB(P) macro bindings, generating the _no_rnd_/_mmx,
 * _mmx, _3dnow and _mmx2 pixel-op variants from one source.
 * NOTE(review): the matching #undef lines appear to be missing from this excerpt
 * (original line numbering jumps between the sections). */
148 /***********************************/
149 /* MMX no rounding */
150 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
151 #define SET_RND MOVQ_WONE
152 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
153 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
155 #include "dsputil_mmx_rnd.h"
161 /***********************************/
164 #define DEF(x, y) x ## _ ## y ##_mmx
165 #define SET_RND MOVQ_WTWO
166 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
167 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
169 #include "dsputil_mmx_rnd.h"
176 /***********************************/
179 #define DEF(x) x ## _3dnow
180 /* for Athlons PAVGUSB is preferred */
181 #define PAVGB "pavgusb"
183 #include "dsputil_mmx_avg.h"
188 /***********************************/
191 #define DEF(x) x ## _mmx2
193 /* Introduced only in MMX2 set */
194 #define PAVGB "pavgb"
196 #include "dsputil_mmx_avg.h"
/* SBUTTERFLY: interleave low (n = wd/dq selects word/dword granularity) and high
 * halves of a and b into a and t; m selects the mov size suffix (q = 64-bit). */
201 #define SBUTTERFLY(a,b,t,n,m)\
202 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
203 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
204 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
/* 4x4 16-bit transpose built from two word-level and two dword-level butterflies. */
206 #define TRANSPOSE4(a,b,c,d,t)\
207 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
208 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
209 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
210 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
212 /***********************************/
215 #ifdef CONFIG_ENCODERS
/* Widen 8-bit pixels to 16-bit DCTELEMs: two rows per iteration are loaded,
 * zero-extended via punpck with mm7 == 0, and stored 32 bytes at a time into
 * block (indexed from block+64 with a negative offset in REG_a).
 * NOTE(review): extraction gap — the asm volatile(...) opener, loop label and
 * closing statements are missing from this excerpt; asm kept byte-identical. */
216 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
219 "mov $-128, %%"REG_a" \n\t"
220 "pxor %%mm7, %%mm7 \n\t"
223 "movq (%0), %%mm0 \n\t"
224 "movq (%0, %2), %%mm2 \n\t"
225 "movq %%mm0, %%mm1 \n\t"
226 "movq %%mm2, %%mm3 \n\t"
227 "punpcklbw %%mm7, %%mm0 \n\t"
228 "punpckhbw %%mm7, %%mm1 \n\t"
229 "punpcklbw %%mm7, %%mm2 \n\t"
230 "punpckhbw %%mm7, %%mm3 \n\t"
231 "movq %%mm0, (%1, %%"REG_a") \n\t"
232 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
233 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
234 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
236 "add $32, %%"REG_a" \n\t"
239 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
/* block[i] = s1[i] - s2[i], widened to 16-bit: 8 pixels of each source are
 * zero-extended (mm7 == 0) and subtracted with psubw, 16 bytes stored per loop.
 * NOTE(review): extraction gap — asm opener, loop label and closing statements
 * are missing from this excerpt; visible code kept byte-identical. */
244 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
247 "pxor %%mm7, %%mm7 \n\t"
248 "mov $-128, %%"REG_a" \n\t"
251 "movq (%0), %%mm0 \n\t"
252 "movq (%1), %%mm2 \n\t"
253 "movq %%mm0, %%mm1 \n\t"
254 "movq %%mm2, %%mm3 \n\t"
255 "punpcklbw %%mm7, %%mm0 \n\t"
256 "punpckhbw %%mm7, %%mm1 \n\t"
257 "punpcklbw %%mm7, %%mm2 \n\t"
258 "punpckhbw %%mm7, %%mm3 \n\t"
259 "psubw %%mm2, %%mm0 \n\t"
260 "psubw %%mm3, %%mm1 \n\t"
261 "movq %%mm0, (%2, %%"REG_a") \n\t"
262 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
265 "add $16, %%"REG_a" \n\t"
267 : "+r" (s1), "+r" (s2)
268 : "r" (block+64), "r" ((long)stride)
272 #endif //CONFIG_ENCODERS
/* Store a 16-bit DCTELEM block as 8-bit pixels: packuswb clamps each signed
 * word to [0,255] while packing pairs of registers, four 8-byte rows are
 * written per asm statement (row 3 uses the precomputed line_size*3 operand).
 * NOTE(review): extraction gap — the local `pix`/`p` declarations, asm openers,
 * and the code between the two asm statements are missing from this excerpt. */
274 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
279 /* read the pixels */
284 "movq %3, %%mm0 \n\t"
285 "movq 8%3, %%mm1 \n\t"
286 "movq 16%3, %%mm2 \n\t"
287 "movq 24%3, %%mm3 \n\t"
288 "movq 32%3, %%mm4 \n\t"
289 "movq 40%3, %%mm5 \n\t"
290 "movq 48%3, %%mm6 \n\t"
291 "movq 56%3, %%mm7 \n\t"
292 "packuswb %%mm1, %%mm0 \n\t"
293 "packuswb %%mm3, %%mm2 \n\t"
294 "packuswb %%mm5, %%mm4 \n\t"
295 "packuswb %%mm7, %%mm6 \n\t"
296 "movq %%mm0, (%0) \n\t"
297 "movq %%mm2, (%0, %1) \n\t"
298 "movq %%mm4, (%0, %1, 2) \n\t"
299 "movq %%mm6, (%0, %2) \n\t"
300 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
/* Second half: same packing, but the block pointer is passed in a register
 * ("r"(p)) rather than as a memory operand, to keep the compiler from
 * generating poor code for an exact duplicate of the statement above. */
305 // if here would be an exact copy of the code above
306 // compiler would generate some very strange code
309 "movq (%3), %%mm0 \n\t"
310 "movq 8(%3), %%mm1 \n\t"
311 "movq 16(%3), %%mm2 \n\t"
312 "movq 24(%3), %%mm3 \n\t"
313 "movq 32(%3), %%mm4 \n\t"
314 "movq 40(%3), %%mm5 \n\t"
315 "movq 48(%3), %%mm6 \n\t"
316 "movq 56(%3), %%mm7 \n\t"
317 "packuswb %%mm1, %%mm0 \n\t"
318 "packuswb %%mm3, %%mm2 \n\t"
319 "packuswb %%mm5, %%mm4 \n\t"
320 "packuswb %%mm7, %%mm6 \n\t"
321 "movq %%mm0, (%0) \n\t"
322 "movq %%mm2, (%0, %1) \n\t"
323 "movq %%mm4, (%0, %1, 2) \n\t"
324 "movq %%mm6, (%0, %2) \n\t"
325 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
/* Eight bytes of 0x80: bias used to map signed bytes to unsigned pixel range. */
329 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
330 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
/* Per row: pack 8 DCTELEMs to signed bytes (packsswb) and store; mm1 holds the
 * 0x80 bias vector — presumably added before the store, but the paddb and the
 * pointer-advance statements are missing from this excerpt (numbering gap). */
332 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
336 movq_m2r(*vector128, mm1);
337 for (i = 0; i < 8; i++) {
338 movq_m2r(*(block), mm0);
339 packsswb_m2r(*(block + 4), mm0);
342 movq_r2m(mm0, *pixels);
/* pixels += block with clamping: two 8-pixel rows are zero-extended to words
 * (mm7 presumably zeroed in code missing from this excerpt), added to the
 * DCTELEMs with signed saturation (paddsw), then re-packed with unsigned
 * saturation (packuswb) and stored back in place via the "+m" operands. */
347 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
353 /* read the pixels */
360 "movq (%2), %%mm0 \n\t"
361 "movq 8(%2), %%mm1 \n\t"
362 "movq 16(%2), %%mm2 \n\t"
363 "movq 24(%2), %%mm3 \n\t"
364 "movq %0, %%mm4 \n\t"
365 "movq %1, %%mm6 \n\t"
366 "movq %%mm4, %%mm5 \n\t"
367 "punpcklbw %%mm7, %%mm4 \n\t"
368 "punpckhbw %%mm7, %%mm5 \n\t"
369 "paddsw %%mm4, %%mm0 \n\t"
370 "paddsw %%mm5, %%mm1 \n\t"
371 "movq %%mm6, %%mm5 \n\t"
372 "punpcklbw %%mm7, %%mm6 \n\t"
373 "punpckhbw %%mm7, %%mm5 \n\t"
374 "paddsw %%mm6, %%mm2 \n\t"
375 "paddsw %%mm5, %%mm3 \n\t"
376 "packuswb %%mm1, %%mm0 \n\t"
377 "packuswb %%mm3, %%mm2 \n\t"
378 "movq %%mm0, %0 \n\t"
379 "movq %%mm2, %1 \n\t"
380 :"+m"(*pix), "+m"(*(pix+line_size))
/* Copy a 4-wide block: four rows (4 bytes each via movd) per loop iteration,
 * advancing both pointers by 2*line_size (kept in REG_a) after every row pair.
 * NOTE(review): asm opener, loop label and decrement/branch on h are missing
 * from this excerpt; visible code kept byte-identical. */
388 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
391 "lea (%3, %3), %%"REG_a" \n\t"
394 "movd (%1), %%mm0 \n\t"
395 "movd (%1, %3), %%mm1 \n\t"
396 "movd %%mm0, (%2) \n\t"
397 "movd %%mm1, (%2, %3) \n\t"
398 "add %%"REG_a", %1 \n\t"
399 "add %%"REG_a", %2 \n\t"
400 "movd (%1), %%mm0 \n\t"
401 "movd (%1, %3), %%mm1 \n\t"
402 "movd %%mm0, (%2) \n\t"
403 "movd %%mm1, (%2, %3) \n\t"
404 "add %%"REG_a", %1 \n\t"
405 "add %%"REG_a", %2 \n\t"
408 : "+g"(h), "+r" (pixels), "+r" (block)
409 : "r"((long)line_size)
/* Copy an 8-wide block: four rows (8 bytes each via movq) per loop iteration;
 * REG_a caches 2*line_size for the pointer advances.
 * NOTE(review): asm opener, loop label and the h countdown/branch are missing
 * from this excerpt; visible code kept byte-identical. */
414 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
417 "lea (%3, %3), %%"REG_a" \n\t"
420 "movq (%1), %%mm0 \n\t"
421 "movq (%1, %3), %%mm1 \n\t"
422 "movq %%mm0, (%2) \n\t"
423 "movq %%mm1, (%2, %3) \n\t"
424 "add %%"REG_a", %1 \n\t"
425 "add %%"REG_a", %2 \n\t"
426 "movq (%1), %%mm0 \n\t"
427 "movq (%1, %3), %%mm1 \n\t"
428 "movq %%mm0, (%2) \n\t"
429 "movq %%mm1, (%2, %3) \n\t"
430 "add %%"REG_a", %1 \n\t"
431 "add %%"REG_a", %2 \n\t"
434 : "+g"(h), "+r" (pixels), "+r" (block)
435 : "r"((long)line_size)
/* Copy a 16-wide block: each row is two movq loads/stores (offsets 0 and 8);
 * four rows handled per loop iteration with REG_a = 2*line_size.
 * NOTE(review): asm opener, loop label and the h countdown/branch are missing
 * from this excerpt; visible code kept byte-identical. */
440 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
443 "lea (%3, %3), %%"REG_a" \n\t"
446 "movq (%1), %%mm0 \n\t"
447 "movq 8(%1), %%mm4 \n\t"
448 "movq (%1, %3), %%mm1 \n\t"
449 "movq 8(%1, %3), %%mm5 \n\t"
450 "movq %%mm0, (%2) \n\t"
451 "movq %%mm4, 8(%2) \n\t"
452 "movq %%mm1, (%2, %3) \n\t"
453 "movq %%mm5, 8(%2, %3) \n\t"
454 "add %%"REG_a", %1 \n\t"
455 "add %%"REG_a", %2 \n\t"
456 "movq (%1), %%mm0 \n\t"
457 "movq 8(%1), %%mm4 \n\t"
458 "movq (%1, %3), %%mm1 \n\t"
459 "movq 8(%1, %3), %%mm5 \n\t"
460 "movq %%mm0, (%2) \n\t"
461 "movq %%mm4, 8(%2) \n\t"
462 "movq %%mm1, (%2, %3) \n\t"
463 "movq %%mm5, 8(%2, %3) \n\t"
464 "add %%"REG_a", %1 \n\t"
465 "add %%"REG_a", %2 \n\t"
468 : "+g"(h), "+r" (pixels), "+r" (block)
469 : "r"((long)line_size)
/* Zero six 64-element DCTELEM blocks (6*128 bytes): mm7 = 0 is stored 32 bytes
 * per iteration, indexing backwards from blocks+768 with REG_a in [-768, 0).
 * NOTE(review): asm opener, loop label and branch are missing from this excerpt. */
474 static void clear_blocks_mmx(DCTELEM *blocks)
477 "pxor %%mm7, %%mm7 \n\t"
478 "mov $-128*6, %%"REG_a" \n\t"
480 "movq %%mm7, (%0, %%"REG_a") \n\t"
481 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
482 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
483 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
484 "add $32, %%"REG_a" \n\t"
486 : : "r" (((uint8_t *)blocks)+128*6)
491 #ifdef CONFIG_ENCODERS
/* Sum of all pixels in a 16-wide block: per row, 16 bytes are zero-extended to
 * words and accumulated into mm6; afterwards mm6 is folded by shifting (32 then
 * 16 bits) and the result masked to 16 bits. Negative index walks from
 * pix - index upward. NOTE(review): the declaration of h and the loop label /
 * branch are missing from this excerpt; visible code kept byte-identical. */
492 static int pix_sum16_mmx(uint8_t * pix, int line_size){
495 long index= -line_size*h;
498 "pxor %%mm7, %%mm7 \n\t"
499 "pxor %%mm6, %%mm6 \n\t"
501 "movq (%2, %1), %%mm0 \n\t"
502 "movq (%2, %1), %%mm1 \n\t"
503 "movq 8(%2, %1), %%mm2 \n\t"
504 "movq 8(%2, %1), %%mm3 \n\t"
505 "punpcklbw %%mm7, %%mm0 \n\t"
506 "punpckhbw %%mm7, %%mm1 \n\t"
507 "punpcklbw %%mm7, %%mm2 \n\t"
508 "punpckhbw %%mm7, %%mm3 \n\t"
509 "paddw %%mm0, %%mm1 \n\t"
510 "paddw %%mm2, %%mm3 \n\t"
511 "paddw %%mm1, %%mm3 \n\t"
512 "paddw %%mm3, %%mm6 \n\t"
515 "movq %%mm6, %%mm5 \n\t"
516 "psrlq $32, %%mm6 \n\t"
517 "paddw %%mm5, %%mm6 \n\t"
518 "movq %%mm6, %%mm5 \n\t"
519 "psrlq $16, %%mm6 \n\t"
520 "paddw %%mm5, %%mm6 \n\t"
521 "movd %%mm6, %0 \n\t"
522 "andl $0xFFFF, %0 \n\t"
523 : "=&r" (sum), "+r" (index)
524 : "r" (pix - index), "r" ((long)line_size)
529 #endif //CONFIG_ENCODERS
/* dst[i] += src[i] byte-wise, 16 bytes per iteration using wrap-around paddb
 * (no saturation); the scalar tail loop below handles the last w%16 bytes.
 * NOTE(review): asm opener, loop label/branch and the tail for-loop header are
 * missing from this excerpt; visible code kept byte-identical. */
531 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
535 "movq (%1, %0), %%mm0 \n\t"
536 "movq (%2, %0), %%mm1 \n\t"
537 "paddb %%mm0, %%mm1 \n\t"
538 "movq %%mm1, (%2, %0) \n\t"
539 "movq 8(%1, %0), %%mm0 \n\t"
540 "movq 8(%2, %0), %%mm1 \n\t"
541 "paddb %%mm0, %%mm1 \n\t"
542 "movq %%mm1, 8(%2, %0) \n\t"
547 : "r"(src), "r"(dst), "r"((long)w-15)
550 dst[i+0] += src[i+0];
/* Shared H.263 deblocking-filter core, used by both the vertical and the
 * horizontal loop filter below. Operand contract (see the callers):
 *   %0..%3 = the four 8-pixel lines around the edge, %4 = 2*strength,
 *   %5 = ff_pb_FC. On exit mm5/mm3/mm4/mm6 hold the filtered lines for
 *   %0/%1/%2/%3 respectively; the caller stores them back. */
553 #define H263_LOOP_FILTER \
554 "pxor %%mm7, %%mm7 \n\t"\
555 "movq %0, %%mm0 \n\t"\
556 "movq %0, %%mm1 \n\t"\
557 "movq %3, %%mm2 \n\t"\
558 "movq %3, %%mm3 \n\t"\
559 "punpcklbw %%mm7, %%mm0 \n\t"\
560 "punpckhbw %%mm7, %%mm1 \n\t"\
561 "punpcklbw %%mm7, %%mm2 \n\t"\
562 "punpckhbw %%mm7, %%mm3 \n\t"\
563 "psubw %%mm2, %%mm0 \n\t"\
564 "psubw %%mm3, %%mm1 \n\t"\
565 "movq %1, %%mm2 \n\t"\
566 "movq %1, %%mm3 \n\t"\
567 "movq %2, %%mm4 \n\t"\
568 "movq %2, %%mm5 \n\t"\
569 "punpcklbw %%mm7, %%mm2 \n\t"\
570 "punpckhbw %%mm7, %%mm3 \n\t"\
571 "punpcklbw %%mm7, %%mm4 \n\t"\
572 "punpckhbw %%mm7, %%mm5 \n\t"\
573 "psubw %%mm2, %%mm4 \n\t"\
574 "psubw %%mm3, %%mm5 \n\t"\
575 "psllw $2, %%mm4 \n\t"\
576 "psllw $2, %%mm5 \n\t"\
577 "paddw %%mm0, %%mm4 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "pxor %%mm6, %%mm6 \n\t"\
580 "pcmpgtw %%mm4, %%mm6 \n\t"\
581 "pcmpgtw %%mm5, %%mm7 \n\t"\
582 "pxor %%mm6, %%mm4 \n\t"\
583 "pxor %%mm7, %%mm5 \n\t"\
584 "psubw %%mm6, %%mm4 \n\t"\
585 "psubw %%mm7, %%mm5 \n\t"\
586 "psrlw $3, %%mm4 \n\t"\
587 "psrlw $3, %%mm5 \n\t"\
588 "packuswb %%mm5, %%mm4 \n\t"\
589 "packsswb %%mm7, %%mm6 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"\
591 "movd %4, %%mm2 \n\t"\
592 "punpcklbw %%mm2, %%mm2 \n\t"\
593 "punpcklbw %%mm2, %%mm2 \n\t"\
594 "punpcklbw %%mm2, %%mm2 \n\t"\
595 "psubusb %%mm4, %%mm2 \n\t"\
596 "movq %%mm2, %%mm3 \n\t"\
597 "psubusb %%mm4, %%mm3 \n\t"\
598 "psubb %%mm3, %%mm2 \n\t"\
599 "movq %1, %%mm3 \n\t"\
600 "movq %2, %%mm4 \n\t"\
601 "pxor %%mm6, %%mm3 \n\t"\
602 "pxor %%mm6, %%mm4 \n\t"\
603 "paddusb %%mm2, %%mm3 \n\t"\
604 "psubusb %%mm2, %%mm4 \n\t"\
605 "pxor %%mm6, %%mm3 \n\t"\
606 "pxor %%mm6, %%mm4 \n\t"\
607 "paddusb %%mm2, %%mm2 \n\t"\
608 "packsswb %%mm1, %%mm0 \n\t"\
609 "pcmpgtb %%mm0, %%mm7 \n\t"\
610 "pxor %%mm7, %%mm0 \n\t"\
611 "psubb %%mm7, %%mm0 \n\t"\
612 "movq %%mm0, %%mm1 \n\t"\
613 "psubusb %%mm2, %%mm0 \n\t"\
614 "psubb %%mm0, %%mm1 \n\t"\
615 "pand %5, %%mm1 \n\t"\
616 "psrlw $2, %%mm1 \n\t"\
617 "pxor %%mm7, %%mm1 \n\t"\
618 "psubb %%mm7, %%mm1 \n\t"\
619 "movq %0, %%mm5 \n\t"\
620 "movq %3, %%mm6 \n\t"\
621 "psubb %%mm1, %%mm5 \n\t"\
622 "paddb %%mm1, %%mm6 \n\t"
/* H.263 vertical edge deblocking: runs H263_LOOP_FILTER on the four lines
 * around src (src-2*stride .. src+1*stride, as the "+m" operands) and stores
 * the filtered registers mm3/mm4/mm5/mm6 back to those lines.
 * NOTE(review): the asm opener and the H263_LOOP_FILTER invocation line are
 * missing from this excerpt; visible code kept byte-identical. */
624 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
625 const int strength= ff_h263_loop_filter_strength[qscale];
631 "movq %%mm3, %1 \n\t"
632 "movq %%mm4, %2 \n\t"
633 "movq %%mm5, %0 \n\t"
634 "movq %%mm6, %3 \n\t"
635 : "+m" (*(uint64_t*)(src - 2*stride)),
636 "+m" (*(uint64_t*)(src - 1*stride)),
637 "+m" (*(uint64_t*)(src + 0*stride)),
638 "+m" (*(uint64_t*)(src + 1*stride))
639 : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose a 4x4 block of bytes from src (src_stride) to dst (dst_stride)
 * using punpck interleaves; each output row is a 32-bit movd store. */
643 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
644 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
645 "movd %4, %%mm0 \n\t"
646 "movd %5, %%mm1 \n\t"
647 "movd %6, %%mm2 \n\t"
648 "movd %7, %%mm3 \n\t"
649 "punpcklbw %%mm1, %%mm0 \n\t"
650 "punpcklbw %%mm3, %%mm2 \n\t"
651 "movq %%mm0, %%mm1 \n\t"
652 "punpcklwd %%mm2, %%mm0 \n\t"
653 "punpckhwd %%mm2, %%mm1 \n\t"
654 "movd %%mm0, %0 \n\t"
655 "punpckhdq %%mm0, %%mm0 \n\t"
656 "movd %%mm0, %1 \n\t"
657 "movd %%mm1, %2 \n\t"
658 "punpckhdq %%mm1, %%mm1 \n\t"
659 "movd %%mm1, %3 \n\t"
661 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
662 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
663 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
664 "=m" (*(uint32_t*)(dst + 3*dst_stride))
665 : "m" (*(uint32_t*)(src + 0*src_stride)),
666 "m" (*(uint32_t*)(src + 1*src_stride)),
667 "m" (*(uint32_t*)(src + 2*src_stride)),
668 "m" (*(uint32_t*)(src + 3*src_stride))
/* H.263 horizontal edge deblocking: transposes the 8x4 column data into the
 * aligned temp buffer, runs H263_LOOP_FILTER on it, then transposes the
 * filtered registers (mm5/mm3/mm4/mm6, per the "5 3 4 6" note) back into the
 * image via movd stores at stride offsets.
 * NOTE(review): the asm openers, the temp-buffer operand list and some glue
 * statements are missing from this excerpt; visible code kept byte-identical. */
672 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
673 const int strength= ff_h263_loop_filter_strength[qscale];
674 uint64_t temp[4] __attribute__ ((aligned(8)));
675 uint8_t *btemp= (uint8_t*)temp;
679 transpose4x4(btemp , src , 8, stride);
680 transpose4x4(btemp+4, src + 4*stride, 8, stride);
682 H263_LOOP_FILTER // 5 3 4 6
688 : "g" (2*strength), "m"(ff_pb_FC)
692 "movq %%mm5, %%mm1 \n\t"
693 "movq %%mm4, %%mm0 \n\t"
694 "punpcklbw %%mm3, %%mm5 \n\t"
695 "punpcklbw %%mm6, %%mm4 \n\t"
696 "punpckhbw %%mm3, %%mm1 \n\t"
697 "punpckhbw %%mm6, %%mm0 \n\t"
698 "movq %%mm5, %%mm3 \n\t"
699 "movq %%mm1, %%mm6 \n\t"
700 "punpcklwd %%mm4, %%mm5 \n\t"
701 "punpcklwd %%mm0, %%mm1 \n\t"
702 "punpckhwd %%mm4, %%mm3 \n\t"
703 "punpckhwd %%mm0, %%mm6 \n\t"
704 "movd %%mm5, (%0) \n\t"
705 "punpckhdq %%mm5, %%mm5 \n\t"
706 "movd %%mm5, (%0,%2) \n\t"
707 "movd %%mm3, (%0,%2,2) \n\t"
708 "punpckhdq %%mm3, %%mm3 \n\t"
709 "movd %%mm3, (%0,%3) \n\t"
710 "movd %%mm1, (%1) \n\t"
711 "punpckhdq %%mm1, %%mm1 \n\t"
712 "movd %%mm1, (%1,%2) \n\t"
713 "movd %%mm6, (%1,%2,2) \n\t"
714 "punpckhdq %%mm6, %%mm6 \n\t"
715 "movd %%mm6, (%1,%3) \n\t"
717 "r" (src + 4*stride),
718 "r" ((long) stride ),
719 "r" ((long)(3*stride))
723 #ifdef CONFIG_ENCODERS
/* Sum of squared pixel values over a 16-wide block: each row's bytes are
 * zero-extended, squared pairwise with pmaddwd, and accumulated in mm7; the
 * two dwords of mm7 are folded at the end. NOTE(review): asm opener, loop
 * control (the %%ecx clobber suggests a counted loop) and the final movd to
 * tmp are missing from this excerpt; visible code kept byte-identical. */
724 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
731 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
732 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
734 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
736 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
737 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
739 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
740 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
741 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
743 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
744 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
746 "pmaddwd %%mm3,%%mm3\n"
747 "pmaddwd %%mm4,%%mm4\n"
749 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
750 pix2^2+pix3^2+pix6^2+pix7^2) */
751 "paddd %%mm3,%%mm4\n"
752 "paddd %%mm2,%%mm7\n"
755 "paddd %%mm4,%%mm7\n"
760 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
761 "paddd %%mm7,%%mm1\n"
763 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
/* Sum of squared errors over an 8-wide block, two rows per iteration:
 * |pix1 - pix2| is formed via the saturating-subtract-both-ways + OR trick,
 * squared with pmaddwd, and accumulated in mm7; the dword halves are folded
 * at the end. NOTE(review): asm opener, loop control on h and the final store
 * to tmp are missing from this excerpt; visible code kept byte-identical. */
767 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
772 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
773 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
775 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
776 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
777 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
778 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
780 /* todo: mm1-mm2, mm3-mm4 */
781 /* algo: subtract mm1 from mm2 with saturation and vice versa */
782 /* OR the results to get absolute difference */
785 "psubusb %%mm2,%%mm1\n"
786 "psubusb %%mm4,%%mm3\n"
787 "psubusb %%mm5,%%mm2\n"
788 "psubusb %%mm6,%%mm4\n"
793 /* now convert to 16-bit vectors so we can square them */
797 "punpckhbw %%mm0,%%mm2\n"
798 "punpckhbw %%mm0,%%mm4\n"
799 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
800 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
802 "pmaddwd %%mm2,%%mm2\n"
803 "pmaddwd %%mm4,%%mm4\n"
804 "pmaddwd %%mm1,%%mm1\n"
805 "pmaddwd %%mm3,%%mm3\n"
807 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
808 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
810 "paddd %%mm2,%%mm1\n"
811 "paddd %%mm4,%%mm3\n"
812 "paddd %%mm1,%%mm7\n"
813 "paddd %%mm3,%%mm7\n"
819 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
820 "paddd %%mm7,%%mm1\n"
822 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
823 : "r" ((long)line_size) , "m" (h)
/* Sum of squared errors over a 16-wide block, one row per iteration (two movq
 * halves at offsets 0 and 8); same abs-diff + pmaddwd scheme as sse8_mmx.
 * NOTE(review): asm opener, loop control on h and the final store to tmp are
 * missing from this excerpt; visible code kept byte-identical. */
828 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
832 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
833 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
835 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
836 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
837 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
838 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
840 /* todo: mm1-mm2, mm3-mm4 */
841 /* algo: subtract mm1 from mm2 with saturation and vice versa */
842 /* OR the results to get absolute difference */
845 "psubusb %%mm2,%%mm1\n"
846 "psubusb %%mm4,%%mm3\n"
847 "psubusb %%mm5,%%mm2\n"
848 "psubusb %%mm6,%%mm4\n"
853 /* now convert to 16-bit vectors so we can square them */
857 "punpckhbw %%mm0,%%mm2\n"
858 "punpckhbw %%mm0,%%mm4\n"
859 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
860 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
862 "pmaddwd %%mm2,%%mm2\n"
863 "pmaddwd %%mm4,%%mm4\n"
864 "pmaddwd %%mm1,%%mm1\n"
865 "pmaddwd %%mm3,%%mm3\n"
870 "paddd %%mm2,%%mm1\n"
871 "paddd %%mm4,%%mm3\n"
872 "paddd %%mm1,%%mm7\n"
873 "paddd %%mm3,%%mm7\n"
879 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
880 "paddd %%mm7,%%mm1\n"
882 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
883 : "r" ((long)line_size) , "m" (h)
/* SSE2 version of sse16: full 16-byte rows per movdqu, two rows per iteration,
 * same abs-diff + pmaddwd scheme in xmm registers; the four dwords of xmm7 are
 * folded with two psrldq/paddd steps. NOTE(review): asm opener, loop control
 * (h is a "+r" operand) and the final store to tmp are missing from this
 * excerpt; visible code kept byte-identical. */
888 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
892 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
893 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
895 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
896 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
897 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
898 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
900 /* todo: mm1-mm2, mm3-mm4 */
901 /* algo: subtract mm1 from mm2 with saturation and vice versa */
902 /* OR the results to get absolute difference */
903 "movdqa %%xmm1,%%xmm5\n"
904 "movdqa %%xmm3,%%xmm6\n"
905 "psubusb %%xmm2,%%xmm1\n"
906 "psubusb %%xmm4,%%xmm3\n"
907 "psubusb %%xmm5,%%xmm2\n"
908 "psubusb %%xmm6,%%xmm4\n"
910 "por %%xmm1,%%xmm2\n"
911 "por %%xmm3,%%xmm4\n"
913 /* now convert to 16-bit vectors so we can square them */
914 "movdqa %%xmm2,%%xmm1\n"
915 "movdqa %%xmm4,%%xmm3\n"
917 "punpckhbw %%xmm0,%%xmm2\n"
918 "punpckhbw %%xmm0,%%xmm4\n"
919 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
920 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
922 "pmaddwd %%xmm2,%%xmm2\n"
923 "pmaddwd %%xmm4,%%xmm4\n"
924 "pmaddwd %%xmm1,%%xmm1\n"
925 "pmaddwd %%xmm3,%%xmm3\n"
927 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
928 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
930 "paddd %%xmm2,%%xmm1\n"
931 "paddd %%xmm4,%%xmm3\n"
932 "paddd %%xmm1,%%xmm7\n"
933 "paddd %%xmm3,%%xmm7\n"
938 "movdqa %%xmm7,%%xmm1\n"
939 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
940 "paddd %%xmm1,%%xmm7\n"
941 "movdqa %%xmm7,%%xmm1\n"
942 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
943 "paddd %%xmm1,%%xmm7\n"
945 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
946 : "r" ((long)line_size));
/* High-frequency "noise" metric over an 8-wide column (used by the nsse*
 * functions below): accumulates, in mm6, absolute second differences of
 * horizontal first-differences across consecutive rows — the pcmpgtw/pxor/
 * psubw triples implement abs() on signed words. Iterates h-2 times (see the
 * "g"(h-2) operand), then folds mm6 word->dword->scalar.
 * NOTE(review): asm opener, several load lines, loop labels/branches and the
 * final store to tmp are missing from this excerpt (numbering gaps); visible
 * code kept byte-identical. */
950 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
958 "movq %%mm0, %%mm1\n"
962 "movq %%mm0, %%mm2\n"
963 "movq %%mm1, %%mm3\n"
964 "punpcklbw %%mm7,%%mm0\n"
965 "punpcklbw %%mm7,%%mm1\n"
966 "punpckhbw %%mm7,%%mm2\n"
967 "punpckhbw %%mm7,%%mm3\n"
968 "psubw %%mm1, %%mm0\n"
969 "psubw %%mm3, %%mm2\n"
974 "movq %%mm4, %%mm1\n"
978 "movq %%mm4, %%mm5\n"
979 "movq %%mm1, %%mm3\n"
980 "punpcklbw %%mm7,%%mm4\n"
981 "punpcklbw %%mm7,%%mm1\n"
982 "punpckhbw %%mm7,%%mm5\n"
983 "punpckhbw %%mm7,%%mm3\n"
984 "psubw %%mm1, %%mm4\n"
985 "psubw %%mm3, %%mm5\n"
986 "psubw %%mm4, %%mm0\n"
987 "psubw %%mm5, %%mm2\n"
988 "pxor %%mm3, %%mm3\n"
989 "pxor %%mm1, %%mm1\n"
990 "pcmpgtw %%mm0, %%mm3\n\t"
991 "pcmpgtw %%mm2, %%mm1\n\t"
992 "pxor %%mm3, %%mm0\n"
993 "pxor %%mm1, %%mm2\n"
994 "psubw %%mm3, %%mm0\n"
995 "psubw %%mm1, %%mm2\n"
996 "paddw %%mm0, %%mm2\n"
997 "paddw %%mm2, %%mm6\n"
1003 "movq %%mm0, %%mm1\n"
1007 "movq %%mm0, %%mm2\n"
1008 "movq %%mm1, %%mm3\n"
1009 "punpcklbw %%mm7,%%mm0\n"
1010 "punpcklbw %%mm7,%%mm1\n"
1011 "punpckhbw %%mm7,%%mm2\n"
1012 "punpckhbw %%mm7,%%mm3\n"
1013 "psubw %%mm1, %%mm0\n"
1014 "psubw %%mm3, %%mm2\n"
1015 "psubw %%mm0, %%mm4\n"
1016 "psubw %%mm2, %%mm5\n"
1017 "pxor %%mm3, %%mm3\n"
1018 "pxor %%mm1, %%mm1\n"
1019 "pcmpgtw %%mm4, %%mm3\n\t"
1020 "pcmpgtw %%mm5, %%mm1\n\t"
1021 "pxor %%mm3, %%mm4\n"
1022 "pxor %%mm1, %%mm5\n"
1023 "psubw %%mm3, %%mm4\n"
1024 "psubw %%mm1, %%mm5\n"
1025 "paddw %%mm4, %%mm5\n"
1026 "paddw %%mm5, %%mm6\n"
1031 "movq %%mm4, %%mm1\n"
1035 "movq %%mm4, %%mm5\n"
1036 "movq %%mm1, %%mm3\n"
1037 "punpcklbw %%mm7,%%mm4\n"
1038 "punpcklbw %%mm7,%%mm1\n"
1039 "punpckhbw %%mm7,%%mm5\n"
1040 "punpckhbw %%mm7,%%mm3\n"
1041 "psubw %%mm1, %%mm4\n"
1042 "psubw %%mm3, %%mm5\n"
1043 "psubw %%mm4, %%mm0\n"
1044 "psubw %%mm5, %%mm2\n"
1045 "pxor %%mm3, %%mm3\n"
1046 "pxor %%mm1, %%mm1\n"
1047 "pcmpgtw %%mm0, %%mm3\n\t"
1048 "pcmpgtw %%mm2, %%mm1\n\t"
1049 "pxor %%mm3, %%mm0\n"
1050 "pxor %%mm1, %%mm2\n"
1051 "psubw %%mm3, %%mm0\n"
1052 "psubw %%mm1, %%mm2\n"
1053 "paddw %%mm0, %%mm2\n"
1054 "paddw %%mm2, %%mm6\n"
1060 "movq %%mm6, %%mm0\n"
1061 "punpcklwd %%mm7,%%mm0\n"
1062 "punpckhwd %%mm7,%%mm6\n"
1063 "paddd %%mm0, %%mm6\n"
1065 "movq %%mm6,%%mm0\n"
1066 "psrlq $32, %%mm6\n"
1067 "paddd %%mm6,%%mm0\n"
1069 : "+r" (pix1), "=r"(tmp)
1070 : "r" ((long)line_size) , "g" (h-2)
/* 16-wide variant of hf_noise8_mmx: same abs-second-difference accumulation,
 * but the horizontal first difference is taken between a row and itself
 * shifted by one byte ("movq 1(%0)"); the final return adds the result of
 * hf_noise8_mmx on the right half (pix+8).
 * NOTE(review): asm opener, several load lines, loop labels/branches and the
 * final store to tmp are missing from this excerpt (numbering gaps); visible
 * code kept byte-identical. */
1075 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1077 uint8_t * pix= pix1;
1080 "pxor %%mm7,%%mm7\n"
1081 "pxor %%mm6,%%mm6\n"
1084 "movq 1(%0),%%mm1\n"
1085 "movq %%mm0, %%mm2\n"
1086 "movq %%mm1, %%mm3\n"
1087 "punpcklbw %%mm7,%%mm0\n"
1088 "punpcklbw %%mm7,%%mm1\n"
1089 "punpckhbw %%mm7,%%mm2\n"
1090 "punpckhbw %%mm7,%%mm3\n"
1091 "psubw %%mm1, %%mm0\n"
1092 "psubw %%mm3, %%mm2\n"
1097 "movq 1(%0),%%mm1\n"
1098 "movq %%mm4, %%mm5\n"
1099 "movq %%mm1, %%mm3\n"
1100 "punpcklbw %%mm7,%%mm4\n"
1101 "punpcklbw %%mm7,%%mm1\n"
1102 "punpckhbw %%mm7,%%mm5\n"
1103 "punpckhbw %%mm7,%%mm3\n"
1104 "psubw %%mm1, %%mm4\n"
1105 "psubw %%mm3, %%mm5\n"
1106 "psubw %%mm4, %%mm0\n"
1107 "psubw %%mm5, %%mm2\n"
1108 "pxor %%mm3, %%mm3\n"
1109 "pxor %%mm1, %%mm1\n"
1110 "pcmpgtw %%mm0, %%mm3\n\t"
1111 "pcmpgtw %%mm2, %%mm1\n\t"
1112 "pxor %%mm3, %%mm0\n"
1113 "pxor %%mm1, %%mm2\n"
1114 "psubw %%mm3, %%mm0\n"
1115 "psubw %%mm1, %%mm2\n"
1116 "paddw %%mm0, %%mm2\n"
1117 "paddw %%mm2, %%mm6\n"
1123 "movq 1(%0),%%mm1\n"
1124 "movq %%mm0, %%mm2\n"
1125 "movq %%mm1, %%mm3\n"
1126 "punpcklbw %%mm7,%%mm0\n"
1127 "punpcklbw %%mm7,%%mm1\n"
1128 "punpckhbw %%mm7,%%mm2\n"
1129 "punpckhbw %%mm7,%%mm3\n"
1130 "psubw %%mm1, %%mm0\n"
1131 "psubw %%mm3, %%mm2\n"
1132 "psubw %%mm0, %%mm4\n"
1133 "psubw %%mm2, %%mm5\n"
1134 "pxor %%mm3, %%mm3\n"
1135 "pxor %%mm1, %%mm1\n"
1136 "pcmpgtw %%mm4, %%mm3\n\t"
1137 "pcmpgtw %%mm5, %%mm1\n\t"
1138 "pxor %%mm3, %%mm4\n"
1139 "pxor %%mm1, %%mm5\n"
1140 "psubw %%mm3, %%mm4\n"
1141 "psubw %%mm1, %%mm5\n"
1142 "paddw %%mm4, %%mm5\n"
1143 "paddw %%mm5, %%mm6\n"
1148 "movq 1(%0),%%mm1\n"
1149 "movq %%mm4, %%mm5\n"
1150 "movq %%mm1, %%mm3\n"
1151 "punpcklbw %%mm7,%%mm4\n"
1152 "punpcklbw %%mm7,%%mm1\n"
1153 "punpckhbw %%mm7,%%mm5\n"
1154 "punpckhbw %%mm7,%%mm3\n"
1155 "psubw %%mm1, %%mm4\n"
1156 "psubw %%mm3, %%mm5\n"
1157 "psubw %%mm4, %%mm0\n"
1158 "psubw %%mm5, %%mm2\n"
1159 "pxor %%mm3, %%mm3\n"
1160 "pxor %%mm1, %%mm1\n"
1161 "pcmpgtw %%mm0, %%mm3\n\t"
1162 "pcmpgtw %%mm2, %%mm1\n\t"
1163 "pxor %%mm3, %%mm0\n"
1164 "pxor %%mm1, %%mm2\n"
1165 "psubw %%mm3, %%mm0\n"
1166 "psubw %%mm1, %%mm2\n"
1167 "paddw %%mm0, %%mm2\n"
1168 "paddw %%mm2, %%mm6\n"
1174 "movq %%mm6, %%mm0\n"
1175 "punpcklwd %%mm7,%%mm0\n"
1176 "punpckhwd %%mm7,%%mm6\n"
1177 "paddd %%mm0, %%mm6\n"
1179 "movq %%mm6,%%mm0\n"
1180 "psrlq $32, %%mm6\n"
1181 "paddd %%mm6,%%mm0\n"
1183 : "+r" (pix1), "=r"(tmp)
1184 : "r" ((long)line_size) , "g" (h-2)
1186 return tmp + hf_noise8_mmx(pix+8, line_size, h);
/* Noise-preserving SSE, 16-wide: plain SSE plus the absolute difference of the
 * two blocks' high-frequency noise, weighted by avctx->nsse_weight (or 8 when
 * called without a context). */
1189 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1190 MpegEncContext *c = p;
1193 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1194 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1195 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1197 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1198 else return score1 + FFABS(score2)*8;
/* 8-wide counterpart of nsse16_mmx; always uses sse8_mmx for the base score. */
1201 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1202 MpegEncContext *c = p;
1203 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1204 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1206 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1207 else return score1 + FFABS(score2)*8;
/* Intra vertical SAD, 16-wide: the local SUM macro computes |row - prev_row|
 * per byte (saturating-subtract-both-ways + OR), widens to words and
 * accumulates into mm6; rows are rotated through (in0,in1)/(out0,out1) so each
 * row is compared against the previous one. The word sums in mm6 are folded
 * (32- then 16-bit shifts) and masked to 16 bits on return.
 * NOTE(review): asm opener, pointer advances between SUM invocations and loop
 * control on h are missing from this excerpt; visible code kept byte-identical. */
1210 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1213 assert( (((int)pix) & 7) == 0);
1214 assert((line_size &7) ==0);
1216 #define SUM(in0, in1, out0, out1) \
1217 "movq (%0), %%mm2\n"\
1218 "movq 8(%0), %%mm3\n"\
1220 "movq %%mm2, " #out0 "\n"\
1221 "movq %%mm3, " #out1 "\n"\
1222 "psubusb " #in0 ", %%mm2\n"\
1223 "psubusb " #in1 ", %%mm3\n"\
1224 "psubusb " #out0 ", " #in0 "\n"\
1225 "psubusb " #out1 ", " #in1 "\n"\
1226 "por %%mm2, " #in0 "\n"\
1227 "por %%mm3, " #in1 "\n"\
1228 "movq " #in0 ", %%mm2\n"\
1229 "movq " #in1 ", %%mm3\n"\
1230 "punpcklbw %%mm7, " #in0 "\n"\
1231 "punpcklbw %%mm7, " #in1 "\n"\
1232 "punpckhbw %%mm7, %%mm2\n"\
1233 "punpckhbw %%mm7, %%mm3\n"\
1234 "paddw " #in1 ", " #in0 "\n"\
1235 "paddw %%mm3, %%mm2\n"\
1236 "paddw %%mm2, " #in0 "\n"\
1237 "paddw " #in0 ", %%mm6\n"
1242 "pxor %%mm6,%%mm6\n"
1243 "pxor %%mm7,%%mm7\n"
1245 "movq 8(%0),%%mm1\n"
1248 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1251 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1253 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1258 "movq %%mm6,%%mm0\n"
1259 "psrlq $32, %%mm6\n"
1260 "paddw %%mm6,%%mm0\n"
1261 "movq %%mm0,%%mm6\n"
1262 "psrlq $16, %%mm0\n"
1263 "paddw %%mm6,%%mm0\n"
1265 : "+r" (pix), "=r"(tmp)
1266 : "r" ((long)line_size) , "m" (h)
1268 return tmp & 0xFFFF;
/* MMX2 version of vsad_intra16: the SUM macro uses psadbw (hardware byte SAD
 * against mm7 == 0 pattern: here between current and previous row registers),
 * so no widening/folding chain is needed.
 * NOTE(review): asm opener, pointer advances, loop control and the final movd
 * to tmp are missing from this excerpt; visible code kept byte-identical. */
1272 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1275 assert( (((int)pix) & 7) == 0);
1276 assert((line_size &7) ==0);
1278 #define SUM(in0, in1, out0, out1) \
1279 "movq (%0), " #out0 "\n"\
1280 "movq 8(%0), " #out1 "\n"\
1282 "psadbw " #out0 ", " #in0 "\n"\
1283 "psadbw " #out1 ", " #in1 "\n"\
1284 "paddw " #in1 ", " #in0 "\n"\
1285 "paddw " #in0 ", %%mm6\n"
1289 "pxor %%mm6,%%mm6\n"
1290 "pxor %%mm7,%%mm7\n"
1292 "movq 8(%0),%%mm1\n"
1295 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1298 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1300 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1306 : "+r" (pix), "=r"(tmp)
1307 : "r" ((long)line_size) , "m" (h)
1313 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1316 assert( (((int)pix1) & 7) == 0);
1317 assert( (((int)pix2) & 7) == 0);
1318 assert((line_size &7) ==0);
1320 #define SUM(in0, in1, out0, out1) \
1321 "movq (%0),%%mm2\n"\
1322 "movq (%1)," #out0 "\n"\
1323 "movq 8(%0),%%mm3\n"\
1324 "movq 8(%1)," #out1 "\n"\
1327 "psubb " #out0 ", %%mm2\n"\
1328 "psubb " #out1 ", %%mm3\n"\
1329 "pxor %%mm7, %%mm2\n"\
1330 "pxor %%mm7, %%mm3\n"\
1331 "movq %%mm2, " #out0 "\n"\
1332 "movq %%mm3, " #out1 "\n"\
1333 "psubusb " #in0 ", %%mm2\n"\
1334 "psubusb " #in1 ", %%mm3\n"\
1335 "psubusb " #out0 ", " #in0 "\n"\
1336 "psubusb " #out1 ", " #in1 "\n"\
1337 "por %%mm2, " #in0 "\n"\
1338 "por %%mm3, " #in1 "\n"\
1339 "movq " #in0 ", %%mm2\n"\
1340 "movq " #in1 ", %%mm3\n"\
1341 "punpcklbw %%mm7, " #in0 "\n"\
1342 "punpcklbw %%mm7, " #in1 "\n"\
1343 "punpckhbw %%mm7, %%mm2\n"\
1344 "punpckhbw %%mm7, %%mm3\n"\
1345 "paddw " #in1 ", " #in0 "\n"\
1346 "paddw %%mm3, %%mm2\n"\
1347 "paddw %%mm2, " #in0 "\n"\
1348 "paddw " #in0 ", %%mm6\n"
1353 "pxor %%mm6,%%mm6\n"
1354 "pcmpeqw %%mm7,%%mm7\n"
1355 "psllw $15, %%mm7\n"
1356 "packsswb %%mm7, %%mm7\n"
1359 "movq 8(%0),%%mm1\n"
1360 "movq 8(%1),%%mm3\n"
1364 "psubb %%mm2, %%mm0\n"
1365 "psubb %%mm3, %%mm1\n"
1366 "pxor %%mm7, %%mm0\n"
1367 "pxor %%mm7, %%mm1\n"
1368 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1371 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1373 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1378 "movq %%mm6,%%mm0\n"
1379 "psrlq $32, %%mm6\n"
1380 "paddw %%mm6,%%mm0\n"
1381 "movq %%mm0,%%mm6\n"
1382 "psrlq $16, %%mm0\n"
1383 "paddw %%mm6,%%mm0\n"
1385 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1386 : "r" ((long)line_size) , "m" (h)
1388 return tmp & 0x7FFF;
1392 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1395 assert( (((int)pix1) & 7) == 0);
1396 assert( (((int)pix2) & 7) == 0);
1397 assert((line_size &7) ==0);
1399 #define SUM(in0, in1, out0, out1) \
1400 "movq (%0)," #out0 "\n"\
1401 "movq (%1),%%mm2\n"\
1402 "movq 8(%0)," #out1 "\n"\
1403 "movq 8(%1),%%mm3\n"\
1406 "psubb %%mm2, " #out0 "\n"\
1407 "psubb %%mm3, " #out1 "\n"\
1408 "pxor %%mm7, " #out0 "\n"\
1409 "pxor %%mm7, " #out1 "\n"\
1410 "psadbw " #out0 ", " #in0 "\n"\
1411 "psadbw " #out1 ", " #in1 "\n"\
1412 "paddw " #in1 ", " #in0 "\n"\
1413 "paddw " #in0 ", %%mm6\n"
1417 "pxor %%mm6,%%mm6\n"
1418 "pcmpeqw %%mm7,%%mm7\n"
1419 "psllw $15, %%mm7\n"
1420 "packsswb %%mm7, %%mm7\n"
1423 "movq 8(%0),%%mm1\n"
1424 "movq 8(%1),%%mm3\n"
1428 "psubb %%mm2, %%mm0\n"
1429 "psubb %%mm3, %%mm1\n"
1430 "pxor %%mm7, %%mm0\n"
1431 "pxor %%mm7, %%mm1\n"
1432 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1435 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1437 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1443 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1444 : "r" ((long)line_size) , "m" (h)
/* diff_bytes_mmx(): dst[i] = src1[i] - src2[i] (byte-wise, wrapping),
 * 16 bytes per iteration via two movq/psubb pairs; used by lossless/
 * huffyuv-style prediction code.
 * NOTE(review): the asm opener, loop label, index update and constraint
 * header are missing from this extraction; "(long)w-15" suggests the
 * vector loop stops 15 bytes early and a scalar tail finishes up. */
1450 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1454 "movq (%2, %0), %%mm0 \n\t"
1455 "movq (%1, %0), %%mm1 \n\t"
1456 "psubb %%mm0, %%mm1 \n\t"
1457 "movq %%mm1, (%3, %0) \n\t"
1458 "movq 8(%2, %0), %%mm0 \n\t"
1459 "movq 8(%1, %0), %%mm1 \n\t"
1460 "psubb %%mm0, %%mm1 \n\t"
1461 "movq %%mm1, 8(%3, %0) \n\t"
1466 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
/* scalar tail: handle the remaining bytes not covered by the MMX loop */
1469 dst[i+0] = src1[i+0]-src2[i+0];
/* sub_hfyu_median_prediction_mmx2(): huffyuv median-prediction residual.
 * For each byte computes dst = X - median(L, T, L+T-LT), where L is the
 * left neighbour, T the top neighbour and LT the top-left neighbour.
 * The median of three is built from pmaxub/pminub (MMX2 instructions).
 * NOTE(review): asm opener, loop control and the first-pixel/left-edge
 * handling lines are missing from this extraction. */
1472 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1478 "movq -1(%1, %0), %%mm0 \n\t" // LT
1479 "movq (%1, %0), %%mm1 \n\t" // T
1480 "movq -1(%2, %0), %%mm2 \n\t" // L
1481 "movq (%2, %0), %%mm3 \n\t" // X
1482 "movq %%mm2, %%mm4 \n\t" // L
1483 "psubb %%mm0, %%mm2 \n\t"
1484 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1485 "movq %%mm4, %%mm5 \n\t" // L
1486 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1487 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
/* clamp the gradient predictor (L+T-LT) into [min(T,L), max(T,L)]
 * — equivalent to taking the median of the three candidates */
1488 "pminub %%mm2, %%mm4 \n\t"
1489 "pmaxub %%mm1, %%mm4 \n\t"
1490 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1491 "movq %%mm3, (%3, %0) \n\t"
1496 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
/* first pixel: predict from the carried-in left/left_top values (scalar) */
1502 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
/* carry the rightmost top pixel into the next call's left_top */
1504 *left_top= src1[w-1];
1508 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
1509 "mov"#m" "#p1", "#a" \n\t"\
1510 "mov"#m" "#p2", "#t" \n\t"\
1511 "punpcklbw "#a", "#t" \n\t"\
1512 "punpcklbw "#a", "#a" \n\t"\
1513 "psubw "#t", "#a" \n\t"\
1515 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1516 uint8_t *p1b=p1, *p2b=p2;\
1518 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1519 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1520 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1523 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1524 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1525 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1526 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1527 "mov"#m1" "#mm"0, %0 \n\t"\
1528 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1529 "mov"#m1" %0, "#mm"0 \n\t"\
1530 : "=m"(temp), "+r"(p1b), "+r"(p2b)\
1531 : "r"((long)stride), "r"((long)stride*3)\
1535 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
1536 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1539 // permutes 01234567 -> 05736421
1540 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1541 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
1542 SBUTTERFLY(c,d,b,wd,dqa)\
1543 SBUTTERFLY(e,f,d,wd,dqa)\
1544 SBUTTERFLY(g,h,f,wd,dqa)\
1545 SBUTTERFLY(a,c,h,dq,dqa)\
1546 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
1547 SBUTTERFLY(e,g,b,dq,dqa)\
1548 SBUTTERFLY(d,f,g,dq,dqa)\
1549 SBUTTERFLY(a,e,f,qdq,dqa)\
1550 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
1551 SBUTTERFLY(h,b,d,qdq,dqa)\
1552 SBUTTERFLY(c,g,b,qdq,dqa)\
1553 "movdqa %%xmm8, "#g" \n\t"
1555 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
1556 "movdqa "#h", "#t" \n\t"\
1557 SBUTTERFLY(a,b,h,wd,dqa)\
1558 "movdqa "#h", 16"#t" \n\t"\
1559 "movdqa "#t", "#h" \n\t"\
1560 SBUTTERFLY(c,d,b,wd,dqa)\
1561 SBUTTERFLY(e,f,d,wd,dqa)\
1562 SBUTTERFLY(g,h,f,wd,dqa)\
1563 SBUTTERFLY(a,c,h,dq,dqa)\
1564 "movdqa "#h", "#t" \n\t"\
1565 "movdqa 16"#t", "#h" \n\t"\
1566 SBUTTERFLY(h,b,c,dq,dqa)\
1567 SBUTTERFLY(e,g,b,dq,dqa)\
1568 SBUTTERFLY(d,f,g,dq,dqa)\
1569 SBUTTERFLY(a,e,f,qdq,dqa)\
1570 SBUTTERFLY(h,d,e,qdq,dqa)\
1571 "movdqa "#h", 16"#t" \n\t"\
1572 "movdqa "#t", "#h" \n\t"\
1573 SBUTTERFLY(h,b,d,qdq,dqa)\
1574 SBUTTERFLY(c,g,b,qdq,dqa)\
1575 "movdqa 16"#t", "#g" \n\t"
/* LBUTTERFLY2: two interleaved butterfly stages for the Hadamard
 * transform.  For each pair: a' = a + b, b' = 2b - a' = b - a
 * (the doubling of b followed by psubw realizes the subtraction without
 * an extra temporary register). */
1578 #define LBUTTERFLY2(a1,b1,a2,b2)\
1579 "paddw " #b1 ", " #a1 " \n\t"\
1580 "paddw " #b2 ", " #a2 " \n\t"\
1581 "paddw " #b1 ", " #b1 " \n\t"\
1582 "paddw " #b2 ", " #b2 " \n\t"\
1583 "psubw " #a1 ", " #b1 " \n\t"\
1584 "psubw " #a2 ", " #b2 " \n\t"
/* HADAMARD8: full 8-point Hadamard transform over registers m0..m7,
 * built from three rounds of paired butterflies (stride 1, 2, 4). */
1586 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1587 LBUTTERFLY2(m0, m1, m2, m3)\
1588 LBUTTERFLY2(m4, m5, m6, m7)\
1589 LBUTTERFLY2(m0, m2, m1, m3)\
1590 LBUTTERFLY2(m4, m6, m5, m7)\
1591 LBUTTERFLY2(m0, m4, m1, m5)\
1592 LBUTTERFLY2(m2, m6, m3, m7)\
1594 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
/* MMABS_MMX: absolute value of packed words in `a`, using scratch `z`:
 * z = (a < 0) ? ~0 : 0 (pcmpgtw), then a = (a ^ z) - z  — the classic
 * branch-free two's-complement abs. */
1596 #define MMABS_MMX(a,z)\
1597 "pxor " #z ", " #z " \n\t"\
1598 "pcmpgtw " #a ", " #z " \n\t"\
1599 "pxor " #z ", " #a " \n\t"\
1600 "psubw " #z ", " #a " \n\t"
/* MMABS_MMX2: abs via pmaxsw(a, -a) — shorter on CPUs with MMX2. */
1602 #define MMABS_MMX2(a,z)\
1603 "pxor " #z ", " #z " \n\t"\
1604 "psubw " #a ", " #z " \n\t"\
1605 "pmaxsw " #z ", " #a " \n\t"
/* MMABS_SSSE3: single-instruction abs (pabsw); `z` unused but kept for
 * a uniform macro signature across the three variants. */
1607 #define MMABS_SSSE3(a,z)\
1608 "pabsw " #a ", " #a " \n\t"
/* MMABS_SUM: abs(a) accumulated into `sum` with unsigned saturation.
 * NOTE(review): the MMABS(a,z) invocation line is missing from this
 * extraction — only the accumulate step is visible here. */
1610 #define MMABS_SUM(a,z, sum)\
1612 "paddusw " #a ", " #sum " \n\t"
1614 #define MMABS_SUM_8x8_NOSPILL\
1615 MMABS(%%xmm0, %%xmm8)\
1616 MMABS(%%xmm1, %%xmm9)\
1617 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1618 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1619 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1620 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1621 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1622 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1623 "paddusw %%xmm1, %%xmm0 \n\t"
1626 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1628 #define MMABS_SUM_8x8_SSE2\
1629 "movdqa %%xmm7, (%1) \n\t"\
1630 MMABS(%%xmm0, %%xmm7)\
1631 MMABS(%%xmm1, %%xmm7)\
1632 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1633 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1634 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1635 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1636 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1637 "movdqa (%1), %%xmm2 \n\t"\
1638 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1639 "paddusw %%xmm1, %%xmm0 \n\t"
1642 #define LOAD4(o, a, b, c, d)\
1643 "movq "#o"(%1), "#a" \n\t"\
1644 "movq "#o"+8(%1), "#b" \n\t"\
1645 "movq "#o"+16(%1), "#c" \n\t"\
1646 "movq "#o"+24(%1), "#d" \n\t"\
1648 #define STORE4(o, a, b, c, d)\
1649 "movq "#a", "#o"(%1) \n\t"\
1650 "movq "#b", "#o"+8(%1) \n\t"\
1651 "movq "#c", "#o"+16(%1) \n\t"\
1652 "movq "#d", "#o"+24(%1) \n\t"\
1654 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1655 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1656 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* HSUM_*: horizontal sum of packed 16-bit words in register `a` into the
 * 32-bit destination `dst`, with `t` as scratch.  Three variants:
 * MMX (shift+add), MMX2 (pshufw), SSE2 (movhlps/pshuflw over xmm). */
1657 #define HSUM_MMX(a, t, dst)\
1658 "movq "#a", "#t" \n\t"\
1659 "psrlq $32, "#a" \n\t"\
1660 "paddusw "#t", "#a" \n\t"\
1661 "movq "#a", "#t" \n\t"\
1662 "psrlq $16, "#a" \n\t"\
1663 "paddusw "#t", "#a" \n\t"\
1664 "movd "#a", "#dst" \n\t"\
1666 #define HSUM_MMX2(a, t, dst)\
1667 "pshufw $0x0E, "#a", "#t" \n\t"\
1668 "paddusw "#t", "#a" \n\t"\
1669 "pshufw $0x01, "#a", "#t" \n\t"\
1670 "paddusw "#t", "#a" \n\t"\
1671 "movd "#a", "#dst" \n\t"\
1673 #define HSUM_SSE2(a, t, dst)\
1674 "movhlps "#a", "#t" \n\t"\
1675 "paddusw "#t", "#a" \n\t"\
1676 "pshuflw $0x0E, "#a", "#t" \n\t"\
1677 "paddusw "#t", "#a" \n\t"\
1678 "pshuflw $0x01, "#a", "#t" \n\t"\
1679 "paddusw "#t", "#a" \n\t"\
1680 "movd "#a", "#dst" \n\t"\
1682 #define HADAMARD8_DIFF_MMX(cpu) \
1683 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1684 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1689 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1694 "movq %%mm7, 96(%1) \n\t"\
1696 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1697 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1699 "movq 96(%1), %%mm7 \n\t"\
1700 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1701 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1707 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1712 "movq %%mm7, 96(%1) \n\t"\
1714 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1715 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1717 "movq 96(%1), %%mm7 \n\t"\
1718 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1719 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1720 "movq %%mm6, %%mm7 \n\t"\
1721 "movq %%mm0, %%mm6 \n\t"\
1723 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1726 "movq %%mm7, 64(%1) \n\t"\
1727 MMABS(%%mm0, %%mm7)\
1728 MMABS(%%mm1, %%mm7)\
1729 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1730 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1731 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1732 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1733 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1734 "movq 64(%1), %%mm2 \n\t"\
1735 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1736 "paddusw %%mm1, %%mm0 \n\t"\
1737 "movq %%mm0, 64(%1) \n\t"\
1739 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1740 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1743 "movq %%mm7, (%1) \n\t"\
1744 MMABS(%%mm0, %%mm7)\
1745 MMABS(%%mm1, %%mm7)\
1746 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1747 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1748 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1749 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1750 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1751 "movq (%1), %%mm2 \n\t"\
1752 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1753 "paddusw 64(%1), %%mm0 \n\t"\
1754 "paddusw %%mm1, %%mm0 \n\t"\
1756 HSUM(%%mm0, %%mm1, %0)\
1763 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1765 #define HADAMARD8_DIFF_SSE2(cpu) \
1766 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1767 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1772 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1775 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1776 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1777 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1779 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1785 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1787 #define MMABS(a,z) MMABS_MMX(a,z)
1788 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1789 HADAMARD8_DIFF_MMX(mmx)
1793 #define MMABS(a,z) MMABS_MMX2(a,z)
1794 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1795 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1796 HADAMARD8_DIFF_MMX(mmx2)
1797 HADAMARD8_DIFF_SSE2(sse2)
1799 #undef MMABS_SUM_8x8
1803 #define MMABS(a,z) MMABS_SSSE3(a,z)
1804 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1805 HADAMARD8_DIFF_SSE2(ssse3)
1807 #undef MMABS_SUM_8x8
1810 #define DCT_SAD4(m,mm,o)\
1811 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1812 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1813 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1814 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1815 MMABS_SUM(mm##2, mm##6, mm##0)\
1816 MMABS_SUM(mm##3, mm##7, mm##1)\
1817 MMABS_SUM(mm##4, mm##6, mm##0)\
1818 MMABS_SUM(mm##5, mm##7, mm##1)\
1820 #define DCT_SAD_MMX\
1821 "pxor %%mm0, %%mm0 \n\t"\
1822 "pxor %%mm1, %%mm1 \n\t"\
1823 DCT_SAD4(q, %%mm, 0)\
1824 DCT_SAD4(q, %%mm, 8)\
1825 DCT_SAD4(q, %%mm, 64)\
1826 DCT_SAD4(q, %%mm, 72)\
1827 "paddusw %%mm1, %%mm0 \n\t"\
1828 HSUM(%%mm0, %%mm1, %0)
1830 #define DCT_SAD_SSE2\
1831 "pxor %%xmm0, %%xmm0 \n\t"\
1832 "pxor %%xmm1, %%xmm1 \n\t"\
1833 DCT_SAD4(dqa, %%xmm, 0)\
1834 DCT_SAD4(dqa, %%xmm, 64)\
1835 "paddusw %%xmm1, %%xmm0 \n\t"\
1836 HSUM(%%xmm0, %%xmm1, %0)
1838 #define DCT_SAD_FUNC(cpu) \
1839 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1849 #define DCT_SAD DCT_SAD_MMX
1850 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1851 #define MMABS(a,z) MMABS_MMX(a,z)
1856 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1857 #define MMABS(a,z) MMABS_MMX2(a,z)
1862 #define DCT_SAD DCT_SAD_SSE2
1863 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1868 #define MMABS(a,z) MMABS_SSSE3(a,z)
/* ssd_int8_vs_int16_mmx(): sum of squared differences between an int8_t
 * array and an int16_t array.  Bytes are sign-extended to words via the
 * punpck + psraw $8 trick, differences squared and summed pairwise with
 * pmaddwd, accumulated as 32-bit in %%mm4, then reduced horizontally.
 * NOTE(review): loop setup/label, the %%mm3 copy of %%mm2, the movd of
 * the result and the closing constraint list are missing from this
 * extraction. */
1875 static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
1879 "pxor %%mm4, %%mm4 \n"
1882 "movq (%2,%0), %%mm2 \n"
1883 "movq (%3,%0,2), %%mm0 \n"
1884 "movq 8(%3,%0,2), %%mm1 \n"
/* duplicate bytes into both halves of each word, then arithmetic right
 * shift by 8 sign-extends them to int16 */
1885 "punpckhbw %%mm2, %%mm3 \n"
1886 "punpcklbw %%mm2, %%mm2 \n"
1887 "psraw $8, %%mm3 \n"
1888 "psraw $8, %%mm2 \n"
1889 "psubw %%mm3, %%mm1 \n"
1890 "psubw %%mm2, %%mm0 \n"
/* square each 16-bit difference and add adjacent pairs into 32-bit lanes */
1891 "pmaddwd %%mm1, %%mm1 \n"
1892 "pmaddwd %%mm0, %%mm0 \n"
1893 "paddd %%mm1, %%mm4 \n"
1894 "paddd %%mm0, %%mm4 \n"
/* fold the two 32-bit partial sums in %%mm4 into one */
1896 "movq %%mm4, %%mm3 \n"
1897 "psrlq $32, %%mm3 \n"
1898 "paddd %%mm3, %%mm4 \n"
1901 :"r"(pix1), "r"(pix2)
1906 #endif //CONFIG_ENCODERS
/* "no rounding" put for whole-pel positions is identical to the plain
 * put, so alias it rather than generating a second copy. */
1908 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1909 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
/* QPEL_V_LOW: one output row of the MPEG-4 quarter-pel 6-tap vertical
 * lowpass filter: (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, where the taps
 * are built from the four register pairs (m3..m6) and the three memory
 * rows in0/in1/in2 plus in7.  The result is clamped to bytes with
 * packuswb and stored via the caller-supplied OP (put or avg). */
1911 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1912 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1913 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1914 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1915 "movq "#in7", " #m3 " \n\t" /* d */\
1916 "movq "#in0", %%mm5 \n\t" /* D */\
1917 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1918 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1919 "movq "#in1", %%mm5 \n\t" /* C */\
1920 "movq "#in2", %%mm6 \n\t" /* B */\
1921 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1922 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1923 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1924 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1925 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1926 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1927 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1928 "psraw $5, %%mm5 \n\t"\
1929 "packuswb %%mm5, %%mm5 \n\t"\
1930 OP(%%mm5, out, %%mm7, d)
1932 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1933 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1937 "pxor %%mm7, %%mm7 \n\t"\
1939 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1940 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1941 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1942 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1943 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1944 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1945 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1946 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1947 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1948 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1949 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1950 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1951 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1952 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1953 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1954 "paddw %%mm3, %%mm5 \n\t" /* b */\
1955 "paddw %%mm2, %%mm6 \n\t" /* c */\
1956 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1957 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1958 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1959 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1960 "paddw %%mm4, %%mm0 \n\t" /* a */\
1961 "paddw %%mm1, %%mm5 \n\t" /* d */\
1962 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1963 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1964 "paddw %6, %%mm6 \n\t"\
1965 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1966 "psraw $5, %%mm0 \n\t"\
1967 "movq %%mm0, %5 \n\t"\
1968 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1970 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1971 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1972 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1973 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1974 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1975 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1976 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1977 "paddw %%mm0, %%mm2 \n\t" /* b */\
1978 "paddw %%mm5, %%mm3 \n\t" /* c */\
1979 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1980 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1981 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1982 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1983 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1984 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1985 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1986 "paddw %%mm2, %%mm1 \n\t" /* a */\
1987 "paddw %%mm6, %%mm4 \n\t" /* d */\
1988 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1989 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1990 "paddw %6, %%mm1 \n\t"\
1991 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1992 "psraw $5, %%mm3 \n\t"\
1993 "movq %5, %%mm1 \n\t"\
1994 "packuswb %%mm3, %%mm1 \n\t"\
1995 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1996 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1998 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1999 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
2000 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
2001 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2002 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2003 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2004 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2005 "paddw %%mm1, %%mm5 \n\t" /* b */\
2006 "paddw %%mm4, %%mm0 \n\t" /* c */\
2007 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2008 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2009 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2010 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2011 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2012 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2013 "paddw %%mm3, %%mm2 \n\t" /* d */\
2014 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2015 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2016 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2017 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2018 "paddw %%mm2, %%mm6 \n\t" /* a */\
2019 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2020 "paddw %6, %%mm0 \n\t"\
2021 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2022 "psraw $5, %%mm0 \n\t"\
2023 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2025 "paddw %%mm5, %%mm3 \n\t" /* a */\
2026 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2027 "paddw %%mm4, %%mm6 \n\t" /* b */\
2028 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2029 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2030 "paddw %%mm1, %%mm4 \n\t" /* c */\
2031 "paddw %%mm2, %%mm5 \n\t" /* d */\
2032 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2033 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2034 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2035 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2036 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2037 "paddw %6, %%mm4 \n\t"\
2038 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2039 "psraw $5, %%mm4 \n\t"\
2040 "packuswb %%mm4, %%mm0 \n\t"\
2041 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
2047 : "+a"(src), "+c"(dst), "+m"(h)\
2048 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2053 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2056 /* quick HACK, XXX FIXME MUST be optimized */\
2059 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2060 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2061 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2062 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2063 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2064 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2065 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2066 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2067 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2068 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2069 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2070 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2071 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2072 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2073 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2074 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2076 "movq (%0), %%mm0 \n\t"\
2077 "movq 8(%0), %%mm1 \n\t"\
2078 "paddw %2, %%mm0 \n\t"\
2079 "paddw %2, %%mm1 \n\t"\
2080 "psraw $5, %%mm0 \n\t"\
2081 "psraw $5, %%mm1 \n\t"\
2082 "packuswb %%mm1, %%mm0 \n\t"\
2083 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2084 "movq 16(%0), %%mm0 \n\t"\
2085 "movq 24(%0), %%mm1 \n\t"\
2086 "paddw %2, %%mm0 \n\t"\
2087 "paddw %2, %%mm1 \n\t"\
2088 "psraw $5, %%mm0 \n\t"\
2089 "psraw $5, %%mm1 \n\t"\
2090 "packuswb %%mm1, %%mm0 \n\t"\
2091 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
2092 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2100 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2104 "pxor %%mm7, %%mm7 \n\t"\
2106 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2107 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2108 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2109 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2110 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2111 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2112 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2113 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2114 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2115 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2116 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2117 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2118 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2119 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2120 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2121 "paddw %%mm3, %%mm5 \n\t" /* b */\
2122 "paddw %%mm2, %%mm6 \n\t" /* c */\
2123 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2124 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2125 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2126 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2127 "paddw %%mm4, %%mm0 \n\t" /* a */\
2128 "paddw %%mm1, %%mm5 \n\t" /* d */\
2129 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2130 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2131 "paddw %6, %%mm6 \n\t"\
2132 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2133 "psraw $5, %%mm0 \n\t"\
2134 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2136 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2137 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2138 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2139 "paddw %%mm5, %%mm1 \n\t" /* a */\
2140 "paddw %%mm6, %%mm2 \n\t" /* b */\
2141 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2142 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2143 "paddw %%mm6, %%mm3 \n\t" /* c */\
2144 "paddw %%mm5, %%mm4 \n\t" /* d */\
2145 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2146 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2147 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2148 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2149 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2150 "paddw %6, %%mm1 \n\t"\
2151 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2152 "psraw $5, %%mm3 \n\t"\
2153 "packuswb %%mm3, %%mm0 \n\t"\
2154 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2160 : "+a"(src), "+c"(dst), "+m"(h)\
2161 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
2166 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2169 /* quick HACK, XXX FIXME MUST be optimized */\
2172 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2173 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2174 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2175 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2176 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2177 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2178 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2179 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2181 "movq (%0), %%mm0 \n\t"\
2182 "movq 8(%0), %%mm1 \n\t"\
2183 "paddw %2, %%mm0 \n\t"\
2184 "paddw %2, %%mm1 \n\t"\
2185 "psraw $5, %%mm0 \n\t"\
2186 "psraw $5, %%mm1 \n\t"\
2187 "packuswb %%mm1, %%mm0 \n\t"\
2188 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2189 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2197 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2199 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2200 uint64_t temp[17*4];\
2201 uint64_t *temp_ptr= temp;\
2206 "pxor %%mm7, %%mm7 \n\t"\
2208 "movq (%0), %%mm0 \n\t"\
2209 "movq (%0), %%mm1 \n\t"\
2210 "movq 8(%0), %%mm2 \n\t"\
2211 "movq 8(%0), %%mm3 \n\t"\
2212 "punpcklbw %%mm7, %%mm0 \n\t"\
2213 "punpckhbw %%mm7, %%mm1 \n\t"\
2214 "punpcklbw %%mm7, %%mm2 \n\t"\
2215 "punpckhbw %%mm7, %%mm3 \n\t"\
2216 "movq %%mm0, (%1) \n\t"\
2217 "movq %%mm1, 17*8(%1) \n\t"\
2218 "movq %%mm2, 2*17*8(%1) \n\t"\
2219 "movq %%mm3, 3*17*8(%1) \n\t"\
2224 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2225 : "r" ((long)srcStride)\
2232 /*FIXME reorder for speed */\
2234 /*"pxor %%mm7, %%mm7 \n\t"*/\
2236 "movq (%0), %%mm0 \n\t"\
2237 "movq 8(%0), %%mm1 \n\t"\
2238 "movq 16(%0), %%mm2 \n\t"\
2239 "movq 24(%0), %%mm3 \n\t"\
2240 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2241 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2243 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2245 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2247 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2248 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2250 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2251 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2253 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2254 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2256 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2257 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2259 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2261 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2263 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2264 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2266 "add $136, %0 \n\t"\
2271 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2272 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2277 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2278 uint64_t temp[9*2];\
2279 uint64_t *temp_ptr= temp;\
2284 "pxor %%mm7, %%mm7 \n\t"\
2286 "movq (%0), %%mm0 \n\t"\
2287 "movq (%0), %%mm1 \n\t"\
2288 "punpcklbw %%mm7, %%mm0 \n\t"\
2289 "punpckhbw %%mm7, %%mm1 \n\t"\
2290 "movq %%mm0, (%1) \n\t"\
2291 "movq %%mm1, 9*8(%1) \n\t"\
2296 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2297 : "r" ((long)srcStride)\
2304 /*FIXME reorder for speed */\
2306 /*"pxor %%mm7, %%mm7 \n\t"*/\
2308 "movq (%0), %%mm0 \n\t"\
2309 "movq 8(%0), %%mm1 \n\t"\
2310 "movq 16(%0), %%mm2 \n\t"\
2311 "movq 24(%0), %%mm3 \n\t"\
2312 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2313 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2315 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2317 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2319 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2321 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2323 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2324 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2331 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2332 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2337 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2338 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
2341 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2343 uint8_t * const half= (uint8_t*)temp;\
2344 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2345 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2348 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2349 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
2352 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2354 uint8_t * const half= (uint8_t*)temp;\
2355 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2356 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
2359 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2361 uint8_t * const half= (uint8_t*)temp;\
2362 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2363 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
2366 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2367 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
2370 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2372 uint8_t * const half= (uint8_t*)temp;\
2373 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2374 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
2376 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2377 uint64_t half[8 + 9];\
2378 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2379 uint8_t * const halfHV= ((uint8_t*)half);\
2380 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2381 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2382 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2383 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2385 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2386 uint64_t half[8 + 9];\
2387 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2388 uint8_t * const halfHV= ((uint8_t*)half);\
2389 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2390 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2391 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2392 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2394 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2395 uint64_t half[8 + 9];\
2396 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2397 uint8_t * const halfHV= ((uint8_t*)half);\
2398 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2399 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2400 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2401 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2403 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2404 uint64_t half[8 + 9];\
2405 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2406 uint8_t * const halfHV= ((uint8_t*)half);\
2407 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2408 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2409 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2410 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2412 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2413 uint64_t half[8 + 9];\
2414 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2415 uint8_t * const halfHV= ((uint8_t*)half);\
2416 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2417 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2418 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
2420 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2421 uint64_t half[8 + 9];\
2422 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2423 uint8_t * const halfHV= ((uint8_t*)half);\
2424 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2425 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2426 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
2428 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2429 uint64_t half[8 + 9];\
2430 uint8_t * const halfH= ((uint8_t*)half);\
2431 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2432 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
2433 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2435 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2436 uint64_t half[8 + 9];\
2437 uint8_t * const halfH= ((uint8_t*)half);\
2438 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2439 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
2440 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2442 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2444 uint8_t * const halfH= ((uint8_t*)half);\
2445 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2446 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
2448 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2449 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
2452 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2454 uint8_t * const half= (uint8_t*)temp;\
2455 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2456 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2459 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2460 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2463 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2465 uint8_t * const half= (uint8_t*)temp;\
2466 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2467 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
2470 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2472 uint8_t * const half= (uint8_t*)temp;\
2473 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2474 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
2477 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2478 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
2481 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2483 uint8_t * const half= (uint8_t*)temp;\
2484 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2485 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
2487 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2488 uint64_t half[16*2 + 17*2];\
2489 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2490 uint8_t * const halfHV= ((uint8_t*)half);\
2491 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2492 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2493 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2494 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2496 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2497 uint64_t half[16*2 + 17*2];\
2498 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2499 uint8_t * const halfHV= ((uint8_t*)half);\
2500 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2501 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2502 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2503 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2505 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2506 uint64_t half[16*2 + 17*2];\
2507 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2508 uint8_t * const halfHV= ((uint8_t*)half);\
2509 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2510 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2511 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2512 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2514 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2515 uint64_t half[16*2 + 17*2];\
2516 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2517 uint8_t * const halfHV= ((uint8_t*)half);\
2518 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2519 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2520 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2521 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2523 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2524 uint64_t half[16*2 + 17*2];\
2525 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2526 uint8_t * const halfHV= ((uint8_t*)half);\
2527 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2528 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2529 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
2531 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2532 uint64_t half[16*2 + 17*2];\
2533 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2534 uint8_t * const halfHV= ((uint8_t*)half);\
2535 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2536 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2537 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
2539 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2540 uint64_t half[17*2];\
2541 uint8_t * const halfH= ((uint8_t*)half);\
2542 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2543 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
2544 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2546 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2547 uint64_t half[17*2];\
2548 uint8_t * const halfH= ((uint8_t*)half);\
2549 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2550 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
2551 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2553 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2554 uint64_t half[17*2];\
2555 uint8_t * const halfH= ((uint8_t*)half);\
2556 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2557 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store-op variants plugged into the QPEL macros as their final write step.
 * PUT_OP stores the computed value straight to the destination. */
2560 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG op for 3DNow!: load dst, average with result via pavgusb, store back. */
2561 #define AVG_3DNOW_OP(a,b,temp, size) \
2562 "mov" #size " " #b ", " #temp " \n\t"\
2563 "pavgusb " #temp ", " #a " \n\t"\
2564 "mov" #size " " #a ", " #b " \n\t"
/* AVG op for MMX2: same as above but uses the SSE-integer pavgb instruction. */
2565 #define AVG_MMX2_OP(a,b,temp, size) \
2566 "mov" #size " " #b ", " #temp " \n\t"\
2567 "pavgb " #temp ", " #a " \n\t"\
2568 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the qpel lowpass helpers (QPEL_BASE) and the full set of
 * qpel motion-compensation functions (QPEL_OP) for put/avg, rounding and
 * no-rounding variants, on both 3DNow! and MMX2.  ff_pw_16 vs ff_pw_15 is
 * the rounder constant distinguishing the rnd / no_rnd variants. */
2570 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2571 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2572 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2573 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2574 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2575 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2576 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2577 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2578 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2580 /***********************************/
2581 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* mcXY position that maps exactly onto an existing halfpel routine:
 * just forward to the pixelsN hpel function named by HPEL. */
2583 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2584 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2585 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
/* mcXY position built from the 3-source blend helper: source offset S0,
 * plus the two additional taps at offsets S1 and S2. */
2587 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2589 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
/* Full 2-tap qpel function table for one OPNAME/SIZE/MMX combination;
 * several positions are aliased to cheaper neighbours (mc21->mc20,
 * mc12->mc02, mc00->the plain qpel mc00). */
2592 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2593 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2594 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2595 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2596 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2597 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2598 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2599 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2600 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2601 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2602 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2603 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2605 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2606 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2608 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2609 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2610 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2611 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2612 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2613 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2614 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2615 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the fast (non-spec-compliant, see comment above) 2-tap qpel
 * functions for put/avg at both block sizes, for MMX2 and 3DNow!. */
2617 QPEL_2TAP(put_, 16, mmx2)
2618 QPEL_2TAP(avg_, 16, mmx2)
2619 QPEL_2TAP(put_, 8, mmx2)
2620 QPEL_2TAP(avg_, 8, mmx2)
2621 QPEL_2TAP(put_, 16, 3dnow)
2622 QPEL_2TAP(avg_, 16, 3dnow)
2623 QPEL_2TAP(put_, 8, 3dnow)
2624 QPEL_2TAP(avg_, 8, 3dnow)
/* Do-nothing stub, usable as a placeholder in DSP function-pointer slots
 * (e.g. when benchmarking or selectively disabling entries).
 * Fixed: '()' declares an unprototyped (obsolescent) function in C;
 * '(void)' is the correct empty prototype. */
static void just_return(void) { return; }
/* Wire one qpel DSPContext slot triple (put / put_no_rnd / avg) to the
 * correspondingly-named implementation functions. */
2631 #define SET_QPEL_FUNC(postfix1, postfix2) \
2632 c->put_ ## postfix1 = put_ ## postfix2;\
2633 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2634 c->avg_ ## postfix1 = avg_ ## postfix2;
/* Global motion compensation (MMX path): bilinear interpolation of an
 * h-row block along the motion field given by (ox,oy) and the per-pixel
 * deltas dxx/dxy/dyx/dyy.  Falls back to ff_gmc_c when the fullpel offset
 * is not constant across the block or the subpel mv needs more than 16
 * bits, and uses ff_emulated_edge_mc when the source area crosses the
 * picture border.  NOTE(review): 'r' appears to be the rounder added
 * before the final shift (it is broadcast as r4) -- confirm vs ff_gmc_c. */
2636 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2637 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
/* integer (fullpel) part of the start offset */
2639 const int ix = ox>>(16+shift);
2640 const int iy = oy>>(16+shift);
/* subpel coordinates/deltas reduced to 16-bit working precision */
2641 const int oxs = ox>>4;
2642 const int oys = oy>>4;
2643 const int dxxs = dxx>>4;
2644 const int dxys = dxy>>4;
2645 const int dyxs = dyx>>4;
2646 const int dyys = dyy>>4;
/* broadcast constants for the SIMD inner loop (4 pixels at a time) */
2647 const uint16_t r4[4] = {r,r,r,r};
2648 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2649 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2650 const uint64_t shift2 = 2*shift;
/* VLA scratch for the edge-emulated source; size depends on h and stride */
2651 uint8_t edge_buf[(h+1)*stride];
2654 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2655 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2656 const int dxh = dxy*(h-1);
2657 const int dyw = dyx*(w-1);
2658 if( // non-constant fullpel offset (3% of blocks)
2659 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
2660 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
2661 // uses more than 16 bits of subpel mv (only at huge resolution)
2662 || (dxx|dxy|dyx|dyy)&15 )
2664 //FIXME could still use mmx for some of the rows
2665 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2669 src += ix + iy*stride;
/* block touches outside the picture -> read through an edge-emulated copy */
2670 if( (unsigned)ix >= width-w ||
2671 (unsigned)iy >= height-h )
2673 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* broadcast the 16-bit scale in %0 to all four words of mm6; mm7 = 0 */
2678 "movd %0, %%mm6 \n\t"
2679 "pxor %%mm7, %%mm7 \n\t"
2680 "punpcklwd %%mm6, %%mm6 \n\t"
2681 "punpcklwd %%mm6, %%mm6 \n\t"
/* process 4 output pixels per iteration */
2685 for(x=0; x<w; x+=4){
/* subpel x/y coordinates of the 4 pixels, pre-stepped back by one row */
2686 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2687 oxs - dxys + dxxs*(x+1),
2688 oxs - dxys + dxxs*(x+2),
2689 oxs - dxys + dxxs*(x+3) };
2690 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2691 oys - dyys + dyxs*(x+1),
2692 oys - dyys + dyxs*(x+2),
2693 oys - dyys + dyxs*(x+3) };
/* advance the coords by one row (dxy4/dyy4) and extract the fractional
 * bilinear weights dx,dy into mm4/mm5 */
2697 "movq %0, %%mm4 \n\t"
2698 "movq %1, %%mm5 \n\t"
2699 "paddw %2, %%mm4 \n\t"
2700 "paddw %3, %%mm5 \n\t"
2701 "movq %%mm4, %0 \n\t"
2702 "movq %%mm5, %1 \n\t"
2703 "psrlw $12, %%mm4 \n\t"
2704 "psrlw $12, %%mm5 \n\t"
2705 : "+m"(*dx4), "+m"(*dy4)
2706 : "m"(*dxy4), "m"(*dyy4)
/* bilinear blend of the 2x2 source neighbourhood, weights as commented */
2710 "movq %%mm6, %%mm2 \n\t"
2711 "movq %%mm6, %%mm1 \n\t"
2712 "psubw %%mm4, %%mm2 \n\t"
2713 "psubw %%mm5, %%mm1 \n\t"
2714 "movq %%mm2, %%mm0 \n\t"
2715 "movq %%mm4, %%mm3 \n\t"
2716 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2717 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2718 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2719 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2721 "movd %4, %%mm5 \n\t"
2722 "movd %3, %%mm4 \n\t"
2723 "punpcklbw %%mm7, %%mm5 \n\t"
2724 "punpcklbw %%mm7, %%mm4 \n\t"
2725 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2726 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2728 "movd %2, %%mm5 \n\t"
2729 "movd %1, %%mm4 \n\t"
2730 "punpcklbw %%mm7, %%mm5 \n\t"
2731 "punpcklbw %%mm7, %%mm4 \n\t"
2732 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2733 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
/* add the rounder r4, sum the four partial products, shift and pack */
2734 "paddw %5, %%mm1 \n\t"
2735 "paddw %%mm3, %%mm2 \n\t"
2736 "paddw %%mm1, %%mm0 \n\t"
2737 "paddw %%mm2, %%mm0 \n\t"
2739 "psrlw %6, %%mm0 \n\t"
2740 "packuswb %%mm0, %%mm0 \n\t"
2741 "movd %%mm0, %0 \n\t"
2743 : "=m"(dst[x+y*stride])
2744 : "m"(src[0]), "m"(src[1]),
2745 "m"(src[stride]), "m"(src[stride+1]),
2746 "m"(*r4), "m"(shift2)
2754 #ifdef CONFIG_ENCODERS
/* Encoder-only helper: estimates the weighted squared error that results
 * from adding scale*basis to the residual rem (cf. the scalar logic in
 * add_8x8basis_mmx below).  Processes 16 coefficients per iteration,
 * accumulates 32-bit partial sums in mm7 and returns their horizontal
 * sum >> 2.  NOTE(review): exact meaning of the >>4 / >>2 normalisation
 * follows the C reference try_8x8basis_c -- confirm there. */
2755 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
/* pmulhw keeps the high 16 bits, so pre-shift scale accordingly */
2758 assert(FFABS(scale) < 256);
2759 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2762 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2763 "psrlw $15, %%mm6 \n\t" // 1w
2764 "pxor %%mm7, %%mm7 \n\t"
/* broadcast scale to all four words of mm5 */
2765 "movd %4, %%mm5 \n\t"
2766 "punpcklwd %%mm5, %%mm5 \n\t"
2767 "punpcklwd %%mm5, %%mm5 \n\t"
/* per iteration: v = ((basis*scale)+1)>>1 + rem, then >>6, weight it,
 * square via pmaddwd and accumulate */
2769 "movq (%1, %0), %%mm0 \n\t"
2770 "movq 8(%1, %0), %%mm1 \n\t"
2771 "pmulhw %%mm5, %%mm0 \n\t"
2772 "pmulhw %%mm5, %%mm1 \n\t"
2773 "paddw %%mm6, %%mm0 \n\t"
2774 "paddw %%mm6, %%mm1 \n\t"
2775 "psraw $1, %%mm0 \n\t"
2776 "psraw $1, %%mm1 \n\t"
2777 "paddw (%2, %0), %%mm0 \n\t"
2778 "paddw 8(%2, %0), %%mm1 \n\t"
2779 "psraw $6, %%mm0 \n\t"
2780 "psraw $6, %%mm1 \n\t"
2781 "pmullw (%3, %0), %%mm0 \n\t"
2782 "pmullw 8(%3, %0), %%mm1 \n\t"
2783 "pmaddwd %%mm0, %%mm0 \n\t"
2784 "pmaddwd %%mm1, %%mm1 \n\t"
2785 "paddd %%mm1, %%mm0 \n\t"
2786 "psrld $4, %%mm0 \n\t"
2787 "paddd %%mm0, %%mm7 \n\t"
2789 "cmp $128, %0 \n\t" //FIXME optimize & bench
/* horizontal add of the two dwords in mm7, scale down, move to result */
2791 "movq %%mm7, %%mm6 \n\t"
2792 "psrlq $32, %%mm7 \n\t"
2793 "paddd %%mm6, %%mm7 \n\t"
2794 "psrld $2, %%mm7 \n\t"
2795 "movd %%mm7, %0 \n\t"
2798 : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
/* Encoder-only helper: rem[i] += (basis[i]*scale + round) >>
 * (BASIS_SHIFT - RECON_SHIFT) for all 64 coefficients.  MMX fast path
 * when |scale| < 256 (so the pre-shifted scale fits pmulhw); otherwise
 * the exact scalar loop below is used. */
2803 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
2806 if(FFABS(scale) < 256){
2807 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2809 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2810 "psrlw $15, %%mm6 \n\t" // 1w
/* broadcast scale to all four words of mm5 */
2811 "movd %3, %%mm5 \n\t"
2812 "punpcklwd %%mm5, %%mm5 \n\t"
2813 "punpcklwd %%mm5, %%mm5 \n\t"
/* rem += ((basis*scale)+1)>>1, 8 coefficients per iteration */
2815 "movq (%1, %0), %%mm0 \n\t"
2816 "movq 8(%1, %0), %%mm1 \n\t"
2817 "pmulhw %%mm5, %%mm0 \n\t"
2818 "pmulhw %%mm5, %%mm1 \n\t"
2819 "paddw %%mm6, %%mm0 \n\t"
2820 "paddw %%mm6, %%mm1 \n\t"
2821 "psraw $1, %%mm0 \n\t"
2822 "psraw $1, %%mm1 \n\t"
2823 "paddw (%2, %0), %%mm0 \n\t"
2824 "paddw 8(%2, %0), %%mm1 \n\t"
2825 "movq %%mm0, (%2, %0) \n\t"
2826 "movq %%mm1, 8(%2, %0) \n\t"
2828 "cmp $128, %0 \n\t" //FIXME optimize & bench
2832 : "r"(basis), "r"(rem), "g"(scale)
/* scalar fallback: exact reference computation */
2835 for(i=0; i<8*8; i++){
2836 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2840 #endif /* CONFIG_ENCODERS */
/* Generate a DSPContext prefetch hook issuing the given prefetch
 * instruction (prefetcht0 on MMX2, 3DNow! prefetch) on each row. */
2842 #define PREFETCH(name, op) \
2843 static void name(void *mem, int stride, int h){\
2844 const uint8_t *p= mem;\
2846 asm volatile(#op" %0" :: "m"(*p));\
2850 PREFETCH(prefetch_mmx2, prefetcht0)
2851 PREFETCH(prefetch_3dnow, prefetch)
2854 #include "h264dsp_mmx.c"
2857 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
/* CAVS mc00 (fullpel) wrappers: the (0,0) qpel position is a plain copy
 * or average, so forward to the generic pixels8/16 MMX routines. */
2859 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2860 put_pixels8_mmx(dst, src, stride, 8);
2862 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2863 avg_pixels8_mmx(dst, src, stride, 8);
2865 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2866 put_pixels16_mmx(dst, src, stride, 16);
2868 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2869 avg_pixels16_mmx(dst, src, stride, 16);
2872 /* external functions, from idct_mmx.c */
2873 void ff_mmx_idct(DCTELEM *block);
2874 void ff_mmxext_idct(DCTELEM *block);
2876 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT adapter wrappers: run the raw 8x8 IDCT on 'block', then either
 * store the clamped result (idct_put) or add it to the existing
 * destination pixels (idct_add), matching the DSPContext idct_put/add
 * calling convention. */
2879 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2881 ff_mmx_idct (block);
2882 put_pixels_clamped_mmx(block, dest, line_size);
2884 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2886 ff_mmx_idct (block);
2887 add_pixels_clamped_mmx(block, dest, line_size);
2889 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2891 ff_mmxext_idct (block);
2892 put_pixels_clamped_mmx(block, dest, line_size);
2894 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2896 ff_mmxext_idct (block);
2897 add_pixels_clamped_mmx(block, dest, line_size);
/* same pattern for the Xvid-compatible IDCTs (MMX and MMX2 variants) */
2900 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2902 ff_idct_xvid_mmx (block);
2903 put_pixels_clamped_mmx(block, dest, line_size);
2905 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2907 ff_idct_xvid_mmx (block);
2908 add_pixels_clamped_mmx(block, dest, line_size);
2910 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2912 ff_idct_xvid_mmx2 (block);
2913 put_pixels_clamped_mmx(block, dest, line_size);
2915 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2917 ff_idct_xvid_mmx2 (block);
2918 add_pixels_clamped_mmx(block, dest, line_size);
/* Vorbis magnitude/angle inverse coupling, 3DNow! version, in place on
 * mag[]/ang[], 2 floats per iteration.  The sign trickery is explained
 * by the inline asm comments; femms restores the FPU state afterwards. */
2921 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2924 asm volatile("pxor %%mm7, %%mm7":);
2925 for(i=0; i<blocksize; i+=2) {
2927 "movq %0, %%mm0 \n\t"
2928 "movq %1, %%mm1 \n\t"
2929 "movq %%mm0, %%mm2 \n\t"
2930 "movq %%mm1, %%mm3 \n\t"
2931 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2932 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2933 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2934 "pxor %%mm2, %%mm1 \n\t"
2935 "movq %%mm3, %%mm4 \n\t"
2936 "pand %%mm1, %%mm3 \n\t"
2937 "pandn %%mm1, %%mm4 \n\t"
2938 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2939 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2940 "movq %%mm3, %1 \n\t"
2941 "movq %%mm0, %0 \n\t"
2942 :"+m"(mag[i]), "+m"(ang[i])
2946 asm volatile("femms");
/* Vorbis magnitude/angle inverse coupling, SSE version: same logic as the
 * 3DNow! routine above but 4 floats per iteration; xmm5 holds the sign
 * mask (0x80000000 lanes) loaded from ff_pdw_80000000. */
2948 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2953 "movaps %0, %%xmm5 \n\t"
2954 ::"m"(ff_pdw_80000000[0])
2956 for(i=0; i<blocksize; i+=4) {
2958 "movaps %0, %%xmm0 \n\t"
2959 "movaps %1, %%xmm1 \n\t"
2960 "xorps %%xmm2, %%xmm2 \n\t"
2961 "xorps %%xmm3, %%xmm3 \n\t"
2962 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2963 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2964 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2965 "xorps %%xmm2, %%xmm1 \n\t"
2966 "movaps %%xmm3, %%xmm4 \n\t"
2967 "andps %%xmm1, %%xmm3 \n\t"
2968 "andnps %%xmm1, %%xmm4 \n\t"
2969 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2970 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2971 "movaps %%xmm3, %1 \n\t"
2972 "movaps %%xmm0, %0 \n\t"
2973 :"+m"(mag[i]), "+m"(ang[i])
/* Element-wise float multiply, 3DNow!: the first array is multiplied in
 * place by the second (4 floats / 16 bytes per iteration).
 * NOTE(review): operand constraints are outside this capture -- %1/%2
 * presumably map to dst and src; confirm against the full source. */
2979 static void vector_fmul_3dnow(float *dst, const float *src, int len){
2983 "movq (%1,%0), %%mm0 \n\t"
2984 "movq 8(%1,%0), %%mm1 \n\t"
2985 "pfmul (%2,%0), %%mm0 \n\t"
2986 "pfmul 8(%2,%0), %%mm1 \n\t"
2987 "movq %%mm0, (%1,%0) \n\t"
2988 "movq %%mm1, 8(%1,%0) \n\t"
/* Element-wise float multiply, SSE: same as vector_fmul_3dnow but 8
 * floats / 32 bytes per iteration; requires 16-byte aligned pointers
 * (movaps).  NOTE(review): operand constraints elided in this capture. */
2997 static void vector_fmul_sse(float *dst, const float *src, int len){
3001 "movaps (%1,%0), %%xmm0 \n\t"
3002 "movaps 16(%1,%0), %%xmm1 \n\t"
3003 "mulps (%2,%0), %%xmm0 \n\t"
3004 "mulps 16(%2,%0), %%xmm1 \n\t"
3005 "movaps %%xmm0, (%1,%0) \n\t"
3006 "movaps %%xmm1, 16(%1,%0) \n\t"
/* dst[i] = src0[i] * src1[len-1-i] (3DNow!ext): src1 is walked backwards
 * and pswapd reverses the element order within each pair; the operand
 * list shows %0=i, %1=src1, %2=dst, %3=src0. */
3015 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
3019 "pswapd 8(%1), %%mm0 \n\t"
3020 "pswapd (%1), %%mm1 \n\t"
3021 "pfmul (%3,%0), %%mm0 \n\t"
3022 "pfmul 8(%3,%0), %%mm1 \n\t"
3023 "movq %%mm0, (%2,%0) \n\t"
3024 "movq %%mm1, 8(%2,%0) \n\t"
3028 :"+r"(i), "+r"(src1)
3029 :"r"(dst), "r"(src0)
/* leave the MMX state clean for following x87 code */
3031 asm volatile("femms");
/* dst[i] = src0[i] * src1[len-1-i] (SSE): shufps $0x1b reverses the four
 * floats within each xmm register; src1 is walked backwards.  Operands:
 * %0=i, %1=src1, %2=dst, %3=src0; movaps requires 16-byte alignment. */
3033 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3037 "movaps 16(%1), %%xmm0 \n\t"
3038 "movaps (%1), %%xmm1 \n\t"
3039 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
3040 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3041 "mulps (%3,%0), %%xmm0 \n\t"
3042 "mulps 16(%3,%0), %%xmm1 \n\t"
3043 "movaps %%xmm0, (%2,%0) \n\t"
3044 "movaps %%xmm1, 16(%2,%0) \n\t"
3048 :"+r"(i), "+r"(src1)
3049 :"r"(dst), "r"(src0)
/* dst = src0*src1 + src2 (3DNow!), with two SIMD fast paths: step==2
 * (results scattered to every other float via movd+psrlq) and step==1
 * (contiguous store); anything else defers to the C reference
 * ff_vector_fmul_add_add_c.  Both fast paths require src3 == 0. */
3053 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
3054 const float *src2, int src3, int len, int step){
3056 if(step == 2 && src3 == 0){
3060 "movq (%2,%0), %%mm0 \n\t"
3061 "movq 8(%2,%0), %%mm1 \n\t"
3062 "pfmul (%3,%0), %%mm0 \n\t"
3063 "pfmul 8(%3,%0), %%mm1 \n\t"
3064 "pfadd (%4,%0), %%mm0 \n\t"
3065 "pfadd 8(%4,%0), %%mm1 \n\t"
/* scatter the 4 results to dst[0], dst[2], dst[4], dst[6] (step 2) */
3066 "movd %%mm0, (%1) \n\t"
3067 "movd %%mm1, 16(%1) \n\t"
3068 "psrlq $32, %%mm0 \n\t"
3069 "psrlq $32, %%mm1 \n\t"
3070 "movd %%mm0, 8(%1) \n\t"
3071 "movd %%mm1, 24(%1) \n\t"
3076 :"r"(src0), "r"(src1), "r"(src2)
3080 else if(step == 1 && src3 == 0){
3083 "movq (%2,%0), %%mm0 \n\t"
3084 "movq 8(%2,%0), %%mm1 \n\t"
3085 "pfmul (%3,%0), %%mm0 \n\t"
3086 "pfmul 8(%3,%0), %%mm1 \n\t"
3087 "pfadd (%4,%0), %%mm0 \n\t"
3088 "pfadd 8(%4,%0), %%mm1 \n\t"
3089 "movq %%mm0, (%1,%0) \n\t"
3090 "movq %%mm1, 8(%1,%0) \n\t"
3094 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* generic parameters: use the exact C implementation */
3099 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3100 asm volatile("femms");
/* dst = src0*src1 + src2 (SSE): same structure as the 3DNow! version
 * above -- step==2 scatters results with movss/movhlps/shufps, step==1
 * stores contiguously with movaps (16-byte alignment required), and any
 * other step/src3 combination falls back to ff_vector_fmul_add_add_c. */
3102 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3103 const float *src2, int src3, int len, int step){
3105 if(step == 2 && src3 == 0){
3109 "movaps (%2,%0), %%xmm0 \n\t"
3110 "movaps 16(%2,%0), %%xmm1 \n\t"
3111 "mulps (%3,%0), %%xmm0 \n\t"
3112 "mulps 16(%3,%0), %%xmm1 \n\t"
3113 "addps (%4,%0), %%xmm0 \n\t"
3114 "addps 16(%4,%0), %%xmm1 \n\t"
/* scatter the 8 results to every other dst float (step 2) */
3115 "movss %%xmm0, (%1) \n\t"
3116 "movss %%xmm1, 32(%1) \n\t"
3117 "movhlps %%xmm0, %%xmm2 \n\t"
3118 "movhlps %%xmm1, %%xmm3 \n\t"
3119 "movss %%xmm2, 16(%1) \n\t"
3120 "movss %%xmm3, 48(%1) \n\t"
3121 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
3122 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
3123 "movss %%xmm0, 8(%1) \n\t"
3124 "movss %%xmm1, 40(%1) \n\t"
3125 "movhlps %%xmm0, %%xmm2 \n\t"
3126 "movhlps %%xmm1, %%xmm3 \n\t"
3127 "movss %%xmm2, 24(%1) \n\t"
3128 "movss %%xmm3, 56(%1) \n\t"
3133 :"r"(src0), "r"(src1), "r"(src2)
3137 else if(step == 1 && src3 == 0){
3140 "movaps (%2,%0), %%xmm0 \n\t"
3141 "movaps 16(%2,%0), %%xmm1 \n\t"
3142 "mulps (%3,%0), %%xmm0 \n\t"
3143 "mulps 16(%3,%0), %%xmm1 \n\t"
3144 "addps (%4,%0), %%xmm0 \n\t"
3145 "addps 16(%4,%0), %%xmm1 \n\t"
3146 "movaps %%xmm0, (%1,%0) \n\t"
3147 "movaps %%xmm1, 16(%1,%0) \n\t"
3151 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* generic parameters: use the exact C implementation */
3156 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
3159 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3160 // not bit-exact: pf2id uses different rounding than C and SSE
3162 for(i=0; i<len; i+=4) {
3164 "pf2id %1, %%mm0 \n\t"
3165 "pf2id %2, %%mm1 \n\t"
3166 "packssdw %%mm1, %%mm0 \n\t"
3167 "movq %%mm0, %0 \n\t"
3169 :"m"(src[i]), "m"(src[i+2])
3172 asm volatile("femms");
/* Convert len floats to int16 with signed saturation (SSE): cvtps2pi
 * converts 2 floats per MMX register, packssdw saturates/packs; emms
 * clears the MMX state afterwards. */
3174 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3176 for(i=0; i<len; i+=4) {
3178 "cvtps2pi %1, %%mm0 \n\t"
3179 "cvtps2pi %2, %%mm1 \n\t"
3180 "packssdw %%mm1, %%mm0 \n\t"
3181 "movq %%mm0, %0 \n\t"
3183 :"m"(src[i]), "m"(src[i+2])
3186 asm volatile("emms");
3189 #ifdef CONFIG_SNOW_DECODER
3190 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
3191 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
3192 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3193 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
3194 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3195 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3196 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3197 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3200 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3202 mm_flags = mm_support();
3204 if (avctx->dsp_mask) {
3205 if (avctx->dsp_mask & FF_MM_FORCE)
3206 mm_flags |= (avctx->dsp_mask & 0xffff);
3208 mm_flags &= ~(avctx->dsp_mask & 0xffff);
3212 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3213 if (mm_flags & MM_MMX)
3214 av_log(avctx, AV_LOG_INFO, " mmx");
3215 if (mm_flags & MM_MMXEXT)
3216 av_log(avctx, AV_LOG_INFO, " mmxext");
3217 if (mm_flags & MM_3DNOW)
3218 av_log(avctx, AV_LOG_INFO, " 3dnow");
3219 if (mm_flags & MM_SSE)
3220 av_log(avctx, AV_LOG_INFO, " sse");
3221 if (mm_flags & MM_SSE2)
3222 av_log(avctx, AV_LOG_INFO, " sse2");
3223 av_log(avctx, AV_LOG_INFO, "\n");
3226 if (mm_flags & MM_MMX) {
3227 const int idct_algo= avctx->idct_algo;
3229 #ifdef CONFIG_ENCODERS
3230 const int dct_algo = avctx->dct_algo;
3231 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
3232 if(mm_flags & MM_SSE2){
3233 c->fdct = ff_fdct_sse2;
3234 }else if(mm_flags & MM_MMXEXT){
3235 c->fdct = ff_fdct_mmx2;
3237 c->fdct = ff_fdct_mmx;
3240 #endif //CONFIG_ENCODERS
3241 if(avctx->lowres==0){
3242 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
3243 c->idct_put= ff_simple_idct_put_mmx;
3244 c->idct_add= ff_simple_idct_add_mmx;
3245 c->idct = ff_simple_idct_mmx;
3246 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3248 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
3249 if(mm_flags & MM_MMXEXT){
3250 c->idct_put= ff_libmpeg2mmx2_idct_put;
3251 c->idct_add= ff_libmpeg2mmx2_idct_add;
3252 c->idct = ff_mmxext_idct;
3254 c->idct_put= ff_libmpeg2mmx_idct_put;
3255 c->idct_add= ff_libmpeg2mmx_idct_add;
3256 c->idct = ff_mmx_idct;
3258 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3260 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
3261 idct_algo==FF_IDCT_VP3 &&
3262 avctx->codec->id!=CODEC_ID_THEORA &&
3263 !(avctx->flags & CODEC_FLAG_BITEXACT)){
3264 if(mm_flags & MM_SSE2){
3265 c->idct_put= ff_vp3_idct_put_sse2;
3266 c->idct_add= ff_vp3_idct_add_sse2;
3267 c->idct = ff_vp3_idct_sse2;
3268 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3270 ff_vp3_dsp_init_mmx();
3271 c->idct_put= ff_vp3_idct_put_mmx;
3272 c->idct_add= ff_vp3_idct_add_mmx;
3273 c->idct = ff_vp3_idct_mmx;
3274 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
3276 }else if(idct_algo==FF_IDCT_CAVS){
3277 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
3278 }else if(idct_algo==FF_IDCT_XVIDMMX){
3279 if(mm_flags & MM_MMXEXT){
3280 c->idct_put= ff_idct_xvid_mmx2_put;
3281 c->idct_add= ff_idct_xvid_mmx2_add;
3282 c->idct = ff_idct_xvid_mmx2;
3284 c->idct_put= ff_idct_xvid_mmx_put;
3285 c->idct_add= ff_idct_xvid_mmx_add;
3286 c->idct = ff_idct_xvid_mmx;
3291 #ifdef CONFIG_ENCODERS
3292 c->get_pixels = get_pixels_mmx;
3293 c->diff_pixels = diff_pixels_mmx;
3294 #endif //CONFIG_ENCODERS
3295 c->put_pixels_clamped = put_pixels_clamped_mmx;
3296 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
3297 c->add_pixels_clamped = add_pixels_clamped_mmx;
3298 c->clear_blocks = clear_blocks_mmx;
3299 #ifdef CONFIG_ENCODERS
3300 c->pix_sum = pix_sum16_mmx;
3301 #endif //CONFIG_ENCODERS
3303 c->put_pixels_tab[0][0] = put_pixels16_mmx;
3304 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
3305 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
3306 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
3308 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
3309 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
3310 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
3311 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
3313 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
3314 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
3315 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
3316 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
3318 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
3319 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
3320 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
3321 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
3323 c->put_pixels_tab[1][0] = put_pixels8_mmx;
3324 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
3325 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
3326 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
3328 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
3329 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
3330 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
3331 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
3333 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
3334 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
3335 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
3336 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
3338 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
3339 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
3340 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
3341 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
3345 c->add_bytes= add_bytes_mmx;
3346 #ifdef CONFIG_ENCODERS
3347 c->diff_bytes= diff_bytes_mmx;
3348 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
3350 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
3351 c->hadamard8_diff[1]= hadamard8_diff_mmx;
3353 c->pix_norm1 = pix_norm1_mmx;
3354 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
3355 c->sse[1] = sse8_mmx;
3356 c->vsad[4]= vsad_intra16_mmx;
3358 c->nsse[0] = nsse16_mmx;
3359 c->nsse[1] = nsse8_mmx;
3360 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3361 c->vsad[0] = vsad16_mmx;
3364 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3365 c->try_8x8basis= try_8x8basis_mmx;
3367 c->add_8x8basis= add_8x8basis_mmx;
3369 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
3371 #endif //CONFIG_ENCODERS
3373 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
3374 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
3375 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
3376 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
3378 c->h264_idct_dc_add=
3379 c->h264_idct_add= ff_h264_idct_add_mmx;
3380 c->h264_idct8_dc_add=
3381 c->h264_idct8_add= ff_h264_idct8_add_mmx;
3383 if (mm_flags & MM_MMXEXT) {
3384 c->prefetch = prefetch_mmx2;
3386 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
3387 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
3389 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
3390 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
3391 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
3393 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
3394 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
3396 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
3397 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
3398 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
3400 #ifdef CONFIG_ENCODERS
3401 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
3402 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
3403 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
3404 c->vsad[4]= vsad_intra16_mmx2;
3405 #endif //CONFIG_ENCODERS
3407 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
3408 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
3410 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3411 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
3412 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
3413 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
3414 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
3415 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
3416 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3417 #ifdef CONFIG_ENCODERS
3418 c->vsad[0] = vsad16_mmx2;
3419 #endif //CONFIG_ENCODERS
3423 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
3424 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
3425 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
3426 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
3427 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
3428 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
3429 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
3430 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
3431 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
3432 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
3433 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
3434 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
3435 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
3436 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
3437 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
3438 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
3439 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
3440 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
3441 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
3442 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
3443 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
3444 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
3445 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
3446 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
3447 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
3448 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
3449 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
3450 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
3451 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
3452 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
3453 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
3454 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
3458 #define dspfunc(PFX, IDX, NUM) \
3459 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
3460 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
3461 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
3462 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
3463 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
3464 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
3465 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
3466 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
3467 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
3468 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
3469 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
3470 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
3471 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
3472 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
3473 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
3474 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
3476 dspfunc(put_h264_qpel, 0, 16);
3477 dspfunc(put_h264_qpel, 1, 8);
3478 dspfunc(put_h264_qpel, 2, 4);
3479 dspfunc(avg_h264_qpel, 0, 16);
3480 dspfunc(avg_h264_qpel, 1, 8);
3481 dspfunc(avg_h264_qpel, 2, 4);
3483 dspfunc(put_2tap_qpel, 0, 16);
3484 dspfunc(put_2tap_qpel, 1, 8);
3485 dspfunc(avg_2tap_qpel, 0, 16);
3486 dspfunc(avg_2tap_qpel, 1, 8);
3489 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
3490 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3491 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
3492 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
3493 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
3494 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
3495 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
3496 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
3497 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
3498 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3499 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
3501 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3502 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3503 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3504 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3505 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3506 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3507 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3508 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3510 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3511 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3512 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3513 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3514 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3515 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3516 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3517 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3519 #ifdef CONFIG_CAVS_DECODER
3520 ff_cavsdsp_init_mmx2(c, avctx);
3523 #ifdef CONFIG_ENCODERS
3524 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3525 #endif //CONFIG_ENCODERS
3526 } else if (mm_flags & MM_3DNOW) {
3527 c->prefetch = prefetch_3dnow;
3529 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
3530 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
3532 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
3533 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
3534 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
3536 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
3537 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
3539 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
3540 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
3541 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
3543 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3544 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
3545 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
3546 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
3547 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
3548 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
3549 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
3552 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
3553 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
3554 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
3555 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
3556 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
3557 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
3558 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
3559 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
3560 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
3561 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
3562 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
3563 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
3564 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
3565 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
3566 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
3567 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
3568 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
3569 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
3570 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
3571 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
3572 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
3573 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
3574 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
3575 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
3576 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
3577 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
3578 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
3579 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
3580 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
3581 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
3582 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
3583 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
3585 #define dspfunc(PFX, IDX, NUM) \
3586 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
3587 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
3588 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
3589 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
3590 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
3591 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
3592 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
3593 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
3594 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
3595 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
3596 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
3597 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
3598 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
3599 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
3600 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
3601 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
3603 dspfunc(put_h264_qpel, 0, 16);
3604 dspfunc(put_h264_qpel, 1, 8);
3605 dspfunc(put_h264_qpel, 2, 4);
3606 dspfunc(avg_h264_qpel, 0, 16);
3607 dspfunc(avg_h264_qpel, 1, 8);
3608 dspfunc(avg_h264_qpel, 2, 4);
3610 dspfunc(put_2tap_qpel, 0, 16);
3611 dspfunc(put_2tap_qpel, 1, 8);
3612 dspfunc(avg_2tap_qpel, 0, 16);
3613 dspfunc(avg_2tap_qpel, 1, 8);
3615 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
3616 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
3619 #ifdef CONFIG_ENCODERS
3620 if(mm_flags & MM_SSE2){
3621 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3622 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3623 c->hadamard8_diff[1]= hadamard8_diff_sse2;
3627 if(mm_flags & MM_SSSE3){
3628 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
3629 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
3630 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
3635 #ifdef CONFIG_SNOW_DECODER
3636 if(mm_flags & MM_SSE2){
3637 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3638 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3639 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3642 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3643 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3644 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3648 if(mm_flags & MM_3DNOW){
3649 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
3650 c->vector_fmul = vector_fmul_3dnow;
3651 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
3652 c->float_to_int16 = float_to_int16_3dnow;
3654 if(mm_flags & MM_3DNOWEXT)
3655 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
3656 if(mm_flags & MM_SSE){
3657 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3658 c->vector_fmul = vector_fmul_sse;
3659 c->float_to_int16 = float_to_int16_sse;
3660 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3661 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3663 if(mm_flags & MM_3DNOW)
3664 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
3667 #ifdef CONFIG_ENCODERS
3668 dsputil_init_pix_mmx(c, avctx);
3669 #endif //CONFIG_ENCODERS
3671 // for speed testing
3672 get_pixels = just_return;
3673 put_pixels_clamped = just_return;
3674 add_pixels_clamped = just_return;
3676 pix_abs16x16 = just_return;
3677 pix_abs16x16_x2 = just_return;
3678 pix_abs16x16_y2 = just_return;
3679 pix_abs16x16_xy2 = just_return;
3681 put_pixels_tab[0] = just_return;
3682 put_pixels_tab[1] = just_return;
3683 put_pixels_tab[2] = just_return;
3684 put_pixels_tab[3] = just_return;
3686 put_no_rnd_pixels_tab[0] = just_return;
3687 put_no_rnd_pixels_tab[1] = just_return;
3688 put_no_rnd_pixels_tab[2] = just_return;
3689 put_no_rnd_pixels_tab[3] = just_return;
3691 avg_pixels_tab[0] = just_return;
3692 avg_pixels_tab[1] = just_return;
3693 avg_pixels_tab[2] = just_return;
3694 avg_pixels_tab[3] = just_return;
3696 avg_no_rnd_pixels_tab[0] = just_return;
3697 avg_no_rnd_pixels_tab[1] = just_return;
3698 avg_no_rnd_pixels_tab[2] = just_return;
3699 avg_no_rnd_pixels_tab[3] = just_return;
3701 //av_fdct = just_return;
3702 //ff_idct = just_return;