2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
22 #include "../dsputil.h"
23 #include "../simple_idct.h"
/* CPU capability bit mask. It is assigned from mm_support() in
   dsputil_init_mmx() and tested there against the MM_* flag constants. */
25 int mm_flags; /* multimedia extension flags */
/* Prototypes of the pix_abs* comparison routines that dsputil_init_mmx()
   installs into the pix_abs* function pointers. Their definitions are not
   part of this excerpt.
   NOTE(review): presumably implemented in a sibling motion-estimation
   source file - confirm. */
/* 16x16 variants, plain MMX */
27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
/* 16x16 variants, selected when MM_MMXEXT is set */
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
/* 8x8 variants, plain MMX */
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
/* 8x8 variants, selected when MM_MMXEXT is set */
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
47 /* external functions, from idct_mmx.c */
/* Both take a block of DCTELEM coefficients; dsputil_init_mmx() assigns one
   of them to ff_idct. */
48 void ff_mmx_idct(DCTELEM *block);
49 void ff_mmxext_idct(DCTELEM *block);
51 /* pixel operations */
/* 8-byte aligned 64-bit constants for the MMX code:
   mm_bone - every byte is 0x01
   mm_wone - every 16-bit word is 0x0001
   mm_wtwo - every 16-bit word is 0x0002 */
52 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
53 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
54 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
55 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
56 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
/* JUMPALIGN emits a ".balign 8" directive at the point of use. */
58 #define JUMPALIGN() __asm __volatile (".balign 8"::)
/* MOVQ_ZERO clears the named MMX register by XOR-ing it with itself. */
59 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
/* MOVQ_WONE / MOVQ_WTWO / MOVQ_BONE are defined twice below: first as loads
   of the memory constants above, then as register-only sequences.
   NOTE(review): the preprocessor conditional that selects one set, and the
   `__asm __volatile (` opener lines of the second set, are not visible in
   this excerpt - confirm against the full file. */
62 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
63 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
64 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
66 // for shared library it's better to use this way for accessing constants
/* Register-only forms: pcmpeqd of a register with itself sets all bits,
   psrlw $15 leaves 1 in each word, psllw $1 turns that into 2, and packuswb
   packs the word-ones into byte-ones. */
68 #define MOVQ_WONE(regd) \
70 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
71 "psrlw $15, %%" #regd ::)
73 #define MOVQ_WTWO(regd) \
75 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
76 "psrlw $15, %%" #regd " \n\t" \
77 "psllw $1, %%" #regd ::)
79 #define MOVQ_BONE(regd) \
80 "pcmpeqd " #regd ", " #regd " \n\t" \
81 "psrlw $15, " #regd " \n\t"\
82 "packuswb " #regd ", " #regd " \n\t"
86 /***********************************/
/* First inclusion of dsputil_mmx_avg.h: DEF() appends the "_3dnow" suffix to
   a name and PAVGB expands to the "pavgusb" mnemonic.
   NOTE(review): the header presumably defines the *_3dnow routines installed
   by dsputil_init_mmx(); the header body and the #undef lines that must
   separate the two inclusions are not visible in this excerpt - confirm. */
89 #define DEF(x) x ## _3dnow
90 /* for Athlons PAVGUSB is preferred */
91 #define PAVGB "pavgusb"
93 #include "dsputil_mmx_avg.h"
98 /***********************************/
/* Second inclusion of the same header: DEF() now appends "_mmx2" and PAVGB
   expands to "pavgb". */
101 #define DEF(x) x ## _mmx2
103 /* Introduced only in MMX2 set */
104 #define PAVGB "pavgb"
106 #include "dsputil_mmx_avg.h"
111 /***********************************/
/*
 * get_pixels_mmx: widen 8-bit pixels to 16-bit words and store them in block.
 *
 * block     - destination; addressed as (block+64) plus a negative byte
 *             offset in %eax that starts at -128 and advances by 32.
 * pixels    - source bytes. NOTE(review): presumably bound to operand %0 by
 *             an output-operand line missing from this excerpt - confirm.
 * line_size - byte distance between source rows.
 *
 * NOTE(review): the braces, the `__asm __volatile(` opener, the loop label,
 * the source-pointer advance, the loop branch and the clobber list are not
 * visible here, so the iteration count cannot be confirmed from this view.
 */
114 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
117 "movl $-128, %%eax \n\t"
/* mm7 = 0, used as the high half when unpacking bytes to words */
118 "pxor %%mm7, %%mm7 \n\t"
/* load two 8-byte source rows */
121 "movq (%0), %%mm0 \n\t"
122 "movq (%0, %2), %%mm2 \n\t"
123 "movq %%mm0, %%mm1 \n\t"
124 "movq %%mm2, %%mm3 \n\t"
/* zero-extend low and high four bytes of each row to words */
125 "punpcklbw %%mm7, %%mm0 \n\t"
126 "punpckhbw %%mm7, %%mm1 \n\t"
127 "punpcklbw %%mm7, %%mm2 \n\t"
128 "punpckhbw %%mm7, %%mm3 \n\t"
/* store 32 bytes (two rows of eight words) into the destination */
129 "movq %%mm0, (%1, %%eax)\n\t"
130 "movq %%mm1, 8(%1, %%eax)\n\t"
131 "movq %%mm2, 16(%1, %%eax)\n\t"
132 "movq %%mm3, 24(%1, %%eax)\n\t"
134 "addl $32, %%eax \n\t"
137 : "r" (block+64), "r" (line_size), "r" (line_size*2)
/*
 * diff_pixels_mmx: store the word-wise difference s1 - s2 of two 8-bit
 * pixel rows into block.
 *
 * block  - destination; addressed as (block+64) plus a negative byte offset
 *          in %eax that starts at -128 and advances by 16 (operand %2).
 * s1, s2 - source pointers, read/write operands %0 and %1.
 * stride - operand %3. NOTE(review): presumably added to s1 and s2 after
 *          each row by lines missing from this excerpt - confirm.
 *
 * NOTE(review): the braces, asm opener, loop label, pointer advance, loop
 * branch and clobber list are not visible here.
 */
142 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
/* mm7 = 0 for byte-to-word unpacking */
145 "pxor %%mm7, %%mm7 \n\t"
146 "movl $-128, %%eax \n\t"
/* load eight bytes from each source */
149 "movq (%0), %%mm0 \n\t"
150 "movq (%1), %%mm2 \n\t"
151 "movq %%mm0, %%mm1 \n\t"
152 "movq %%mm2, %%mm3 \n\t"
153 "punpcklbw %%mm7, %%mm0 \n\t"
154 "punpckhbw %%mm7, %%mm1 \n\t"
155 "punpcklbw %%mm7, %%mm2 \n\t"
156 "punpckhbw %%mm7, %%mm3 \n\t"
/* mm0:mm1 = s1 words minus s2 words */
157 "psubw %%mm2, %%mm0 \n\t"
158 "psubw %%mm3, %%mm1 \n\t"
159 "movq %%mm0, (%2, %%eax)\n\t"
160 "movq %%mm1, 8(%2, %%eax)\n\t"
163 "addl $16, %%eax \n\t"
165 : "+r" (s1), "+r" (s2)
166 : "r" (block+64), "r" (stride)
/*
 * put_pixels_clamped_mmx: pack 16-bit coefficients from block into 8-bit
 * pixels with unsigned saturation (packuswb) and store them row by row.
 *
 * Two asm statements are visible. Each reads eight quadwords (64 bytes) of
 * coefficients and writes four 8-byte rows at pix, pix+line_size,
 * pix+2*line_size and pix+3*line_size (operand %2 is line_size*3).
 * The first statement reads through a memory operand ("m"(*p)), the second
 * through a pointer register ("r"(p)).
 *
 * NOTE(review): the declarations of `p` and `pix`, the first load into mm0
 * of the first statement, the pointer advance between the two statements
 * and the clobber lists are not visible in this excerpt - presumably p
 * walks block and pix walks pixels; confirm.
 */
171 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
176 /* read the pixels */
182 "movq 8%3, %%mm1\n\t"
183 "movq 16%3, %%mm2\n\t"
184 "movq 24%3, %%mm3\n\t"
185 "movq 32%3, %%mm4\n\t"
186 "movq 40%3, %%mm5\n\t"
187 "movq 48%3, %%mm6\n\t"
188 "movq 56%3, %%mm7\n\t"
/* saturate each pair of word quadwords into one row of eight bytes */
189 "packuswb %%mm1, %%mm0\n\t"
190 "packuswb %%mm3, %%mm2\n\t"
191 "packuswb %%mm5, %%mm4\n\t"
192 "packuswb %%mm7, %%mm6\n\t"
193 "movq %%mm0, (%0)\n\t"
194 "movq %%mm2, (%0, %1)\n\t"
195 "movq %%mm4, (%0, %1, 2)\n\t"
196 "movq %%mm6, (%0, %2)\n\t"
197 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
202 // if here would be an exact copy of the code above
203 // compiler would generate some very strange code
/* second group of four rows, same operations via a pointer register */
206 "movq (%3), %%mm0\n\t"
207 "movq 8(%3), %%mm1\n\t"
208 "movq 16(%3), %%mm2\n\t"
209 "movq 24(%3), %%mm3\n\t"
210 "movq 32(%3), %%mm4\n\t"
211 "movq 40(%3), %%mm5\n\t"
212 "movq 48(%3), %%mm6\n\t"
213 "movq 56(%3), %%mm7\n\t"
214 "packuswb %%mm1, %%mm0\n\t"
215 "packuswb %%mm3, %%mm2\n\t"
216 "packuswb %%mm5, %%mm4\n\t"
217 "packuswb %%mm7, %%mm6\n\t"
218 "movq %%mm0, (%0)\n\t"
219 "movq %%mm2, (%0, %1)\n\t"
220 "movq %%mm4, (%0, %1, 2)\n\t"
221 "movq %%mm6, (%0, %2)\n\t"
222 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
/*
 * add_pixels_clamped_mmx: add 16-bit coefficients from block to existing
 * 8-bit pixels and pack the sums back to bytes with unsigned saturation.
 *
 * Visible per pass: four coefficient quadwords are loaded through operand
 * %2, the pixel bytes held in mm4 and mm6 are widened to words against mm7,
 * added with signed saturation (paddsw) and packed with packuswb.
 * Operands %0 and %1 are the read/write memory rows *pix and
 * *(pix+line_size).
 *
 * NOTE(review): the loads of mm4/mm6, the zeroing of mm7, the stores back
 * to the pixel rows, the surrounding loop and the input-operand line are
 * not visible in this excerpt - confirm against the full file.
 */
226 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
232 /* read the pixels */
239 "movq (%2), %%mm0\n\t"
240 "movq 8(%2), %%mm1\n\t"
241 "movq 16(%2), %%mm2\n\t"
242 "movq 24(%2), %%mm3\n\t"
/* first pixel row: widen to words and add to coefficients */
245 "movq %%mm4, %%mm5\n\t"
246 "punpcklbw %%mm7, %%mm4\n\t"
247 "punpckhbw %%mm7, %%mm5\n\t"
248 "paddsw %%mm4, %%mm0\n\t"
249 "paddsw %%mm5, %%mm1\n\t"
/* second pixel row */
250 "movq %%mm6, %%mm5\n\t"
251 "punpcklbw %%mm7, %%mm6\n\t"
252 "punpckhbw %%mm7, %%mm5\n\t"
253 "paddsw %%mm6, %%mm2\n\t"
254 "paddsw %%mm5, %%mm3\n\t"
/* clamp the sums to 0..255 while packing to bytes */
255 "packuswb %%mm1, %%mm0\n\t"
256 "packuswb %%mm3, %%mm2\n\t"
259 :"+m"(*pix), "+m"(*(pix+line_size))
/*
 * put_pixels_mmx: copy 8-byte-wide rows from pixels to block.
 *
 * Operands: %0 = h (read/write), %1 = pixels, %2 = block.
 * %eax holds twice operand %3. Each visible pass copies four rows: two rows
 * at offsets 0 and %3, then both pointers advance by %eax, twice over.
 *
 * NOTE(review): operand %3 is presumably line_size, and h is presumably
 * decremented by 4 per pass; the input-operand line, the loop label, the
 * counter update, the branch and the clobber list are missing from this
 * excerpt - confirm. The `addl` on pointer operands assumes 32-bit pointers.
 */
267 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
271 "lea (%3, %3), %%eax \n\t"
/* rows 0 and 1 */
274 "movq (%1), %%mm0 \n\t"
275 "movq (%1, %3), %%mm1 \n\t"
276 "movq %%mm0, (%2) \n\t"
277 "movq %%mm1, (%2, %3) \n\t"
278 "addl %%eax, %1 \n\t"
279 "addl %%eax, %2 \n\t"
/* rows 2 and 3 */
280 "movq (%1), %%mm0 \n\t"
281 "movq (%1, %3), %%mm1 \n\t"
282 "movq %%mm0, (%2) \n\t"
283 "movq %%mm1, (%2, %3) \n\t"
284 "addl %%eax, %1 \n\t"
285 "addl %%eax, %2 \n\t"
288 : "+g"(h), "+r" (pixels), "+r" (block)
/*
 * put_pixels_x2_mmx: per row, combine eight source bytes with the eight
 * bytes starting one byte further on: (a + b + mm4) >> 1 in 16-bit lanes,
 * then pack back to bytes.
 *
 * NOTE(review): mm7 is presumably zero and mm4 presumably the word-one
 * rounding constant, both set up in lines missing from this excerpt; the
 * load of mm0, the store of the result, the declarations of `p`/`pix`, the
 * row loop and the operand lists are also not visible - confirm.
 */
294 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
/* mm1 = eight bytes starting one byte past memory operand %1 */
306 "movq 1%1, %%mm1\n\t"
307 "movq %%mm0, %%mm2\n\t"
308 "movq %%mm1, %%mm3\n\t"
309 "punpcklbw %%mm7, %%mm0\n\t"
310 "punpcklbw %%mm7, %%mm1\n\t"
311 "punpckhbw %%mm7, %%mm2\n\t"
312 "punpckhbw %%mm7, %%mm3\n\t"
313 "paddusw %%mm1, %%mm0\n\t"
314 "paddusw %%mm3, %%mm2\n\t"
/* add the mm4 constant before halving */
315 "paddusw %%mm4, %%mm0\n\t"
316 "paddusw %%mm4, %%mm2\n\t"
317 "psrlw $1, %%mm0\n\t"
318 "psrlw $1, %%mm2\n\t"
319 "packuswb %%mm2, %%mm0\n\t"
/* step both row pointers to the next line */
324 pix += line_size; p += line_size;
/*
 * put_pixels_y2_mmx: combine the bytes in mm0 and mm1 as
 * (a + b + mm4) >> 1 in 16-bit lanes and pack the result to bytes.
 * One asm operand is the row at pix+line_size.
 *
 * NOTE(review): mm0/mm1 are presumably loaded from the current row and the
 * row below, mm7 is presumably zero and mm4 the word-one rounding constant;
 * those lines, the store, the row loop and the remaining operands are
 * missing from this excerpt - confirm.
 */
328 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
341 "movq %%mm0, %%mm2\n\t"
342 "movq %%mm1, %%mm3\n\t"
343 "punpcklbw %%mm7, %%mm0\n\t"
344 "punpcklbw %%mm7, %%mm1\n\t"
345 "punpckhbw %%mm7, %%mm2\n\t"
346 "punpckhbw %%mm7, %%mm3\n\t"
347 "paddusw %%mm1, %%mm0\n\t"
348 "paddusw %%mm3, %%mm2\n\t"
/* add the mm4 constant before halving */
349 "paddusw %%mm4, %%mm0\n\t"
350 "paddusw %%mm4, %%mm2\n\t"
351 "psrlw $1, %%mm0\n\t"
352 "psrlw $1, %%mm2\n\t"
353 "packuswb %%mm2, %%mm0\n\t"
357 "m"(*(pix+line_size))
/*
 * put_pixels_xy2_mmx: combine four byte vectors as
 * (mm0 + mm1 + mm4 + mm5 + mm6) >> 2 in 16-bit lanes and pack to bytes.
 * mm4 and mm5 are loaded one byte past memory operands %1 and %2; one asm
 * operand is the row at pix+line_size.
 *
 * NOTE(review): mm0/mm1 are presumably the unshifted loads of the same two
 * rows, mm7 presumably zero and mm6 presumably the word-two rounding
 * constant; those lines, the store, the row loop and the remaining operands
 * are missing from this excerpt - confirm.
 */
364 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
377 "movq 1%1, %%mm4\n\t"
378 "movq 1%2, %%mm5\n\t"
379 "movq %%mm0, %%mm2\n\t"
380 "movq %%mm1, %%mm3\n\t"
381 "punpcklbw %%mm7, %%mm0\n\t"
382 "punpcklbw %%mm7, %%mm1\n\t"
383 "punpckhbw %%mm7, %%mm2\n\t"
384 "punpckhbw %%mm7, %%mm3\n\t"
/* mm0:mm2 = mm0 + mm1 as words */
385 "paddusw %%mm1, %%mm0\n\t"
386 "paddusw %%mm3, %%mm2\n\t"
387 "movq %%mm4, %%mm1\n\t"
388 "movq %%mm5, %%mm3\n\t"
389 "punpcklbw %%mm7, %%mm4\n\t"
390 "punpcklbw %%mm7, %%mm5\n\t"
391 "punpckhbw %%mm7, %%mm1\n\t"
392 "punpckhbw %%mm7, %%mm3\n\t"
/* mm4:mm1 = shifted pair sum plus the mm6 constant */
393 "paddusw %%mm5, %%mm4\n\t"
394 "paddusw %%mm3, %%mm1\n\t"
395 "paddusw %%mm6, %%mm4\n\t"
396 "paddusw %%mm6, %%mm1\n\t"
397 "paddusw %%mm4, %%mm0\n\t"
398 "paddusw %%mm1, %%mm2\n\t"
/* divide the four-term sum by four */
399 "psrlw $2, %%mm0\n\t"
400 "psrlw $2, %%mm2\n\t"
401 "packuswb %%mm2, %%mm0\n\t"
405 "m"(*(pix+line_size))
/*
 * put_no_rnd_pixels_x2_mmx: combine eight bytes with the eight bytes one
 * byte further on as (a + b) >> 1 in 16-bit lanes; no constant is added
 * before the shift. The result is packed back to bytes.
 *
 * NOTE(review): mm7 is presumably zero; its setup, the load of mm0, the
 * store, the row loop and the operand lists are missing from this excerpt -
 * confirm.
 */
412 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
/* mm1 = eight bytes starting one byte past memory operand %1 */
422 "movq 1%1, %%mm1\n\t"
423 "movq %%mm0, %%mm2\n\t"
424 "movq %%mm1, %%mm3\n\t"
425 "punpcklbw %%mm7, %%mm0\n\t"
426 "punpcklbw %%mm7, %%mm1\n\t"
427 "punpckhbw %%mm7, %%mm2\n\t"
428 "punpckhbw %%mm7, %%mm3\n\t"
429 "paddusw %%mm1, %%mm0\n\t"
430 "paddusw %%mm3, %%mm2\n\t"
431 "psrlw $1, %%mm0\n\t"
432 "psrlw $1, %%mm2\n\t"
433 "packuswb %%mm2, %%mm0\n\t"
/*
 * put_no_rnd_pixels_y2_mmx: combine the bytes in mm0 and mm1 as
 * (a + b) >> 1 in 16-bit lanes with no constant added before the shift,
 * then pack to bytes. One asm operand is the row at pix+line_size.
 *
 * NOTE(review): mm0/mm1 are presumably the current row and the row below
 * and mm7 presumably zero; those lines, the store, the row loop and the
 * remaining operands are missing from this excerpt - confirm.
 */
443 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
455 "movq %%mm0, %%mm2\n\t"
456 "movq %%mm1, %%mm3\n\t"
457 "punpcklbw %%mm7, %%mm0\n\t"
458 "punpcklbw %%mm7, %%mm1\n\t"
459 "punpckhbw %%mm7, %%mm2\n\t"
460 "punpckhbw %%mm7, %%mm3\n\t"
461 "paddusw %%mm1, %%mm0\n\t"
462 "paddusw %%mm3, %%mm2\n\t"
463 "psrlw $1, %%mm0\n\t"
464 "psrlw $1, %%mm2\n\t"
465 "packuswb %%mm2, %%mm0\n\t"
469 "m"(*(pix+line_size))
/*
 * put_no_rnd_pixels_xy2_mmx: combine four byte vectors as
 * (mm0 + mm1 + mm4 + mm5 + mm6) >> 2 in 16-bit lanes and pack to bytes.
 * mm4 and mm5 are loaded one byte past memory operands %1 and %2; one asm
 * operand is the row at pix+line_size.
 *
 * NOTE(review): the instruction sequence visible here matches
 * put_pixels_xy2_mmx; the difference is presumably the constant placed in
 * mm6 (word-one rather than word-two) by a line missing from this excerpt.
 * The loads of mm0/mm1, the zeroing of mm7, the store, the row loop and the
 * remaining operands are also not visible - confirm.
 */
476 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
489 "movq 1%1, %%mm4\n\t"
490 "movq 1%2, %%mm5\n\t"
491 "movq %%mm0, %%mm2\n\t"
492 "movq %%mm1, %%mm3\n\t"
493 "punpcklbw %%mm7, %%mm0\n\t"
494 "punpcklbw %%mm7, %%mm1\n\t"
495 "punpckhbw %%mm7, %%mm2\n\t"
496 "punpckhbw %%mm7, %%mm3\n\t"
/* mm0:mm2 = mm0 + mm1 as words */
497 "paddusw %%mm1, %%mm0\n\t"
498 "paddusw %%mm3, %%mm2\n\t"
499 "movq %%mm4, %%mm1\n\t"
500 "movq %%mm5, %%mm3\n\t"
501 "punpcklbw %%mm7, %%mm4\n\t"
502 "punpcklbw %%mm7, %%mm5\n\t"
503 "punpckhbw %%mm7, %%mm1\n\t"
504 "punpckhbw %%mm7, %%mm3\n\t"
/* mm4:mm1 = shifted pair sum plus the mm6 constant */
505 "paddusw %%mm5, %%mm4\n\t"
506 "paddusw %%mm3, %%mm1\n\t"
507 "paddusw %%mm6, %%mm4\n\t"
508 "paddusw %%mm6, %%mm1\n\t"
509 "paddusw %%mm4, %%mm0\n\t"
510 "paddusw %%mm1, %%mm2\n\t"
511 "psrlw $2, %%mm0\n\t"
512 "psrlw $2, %%mm2\n\t"
513 "packuswb %%mm2, %%mm0\n\t"
517 "m"(*(pix+line_size))
/*
 * avg_pixels_mmx: combine the bytes in mm0 and mm1 as
 * (a + b + mm6) >> 1 in 16-bit lanes and pack the result to bytes.
 *
 * NOTE(review): mm0/mm1 are presumably the existing destination row and the
 * source row, mm7 presumably zero and mm6 presumably the word-one rounding
 * constant; the loads, the store, the row loop and the operand lists are
 * missing from this excerpt - confirm.
 */
524 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
537 "movq %%mm0, %%mm2\n\t"
538 "movq %%mm1, %%mm3\n\t"
539 "punpcklbw %%mm7, %%mm0\n\t"
540 "punpcklbw %%mm7, %%mm1\n\t"
541 "punpckhbw %%mm7, %%mm2\n\t"
542 "punpckhbw %%mm7, %%mm3\n\t"
543 "paddusw %%mm1, %%mm0\n\t"
544 "paddusw %%mm3, %%mm2\n\t"
/* add the mm6 constant before halving */
545 "paddusw %%mm6, %%mm0\n\t"
546 "paddusw %%mm6, %%mm2\n\t"
547 "psrlw $1, %%mm0\n\t"
548 "psrlw $1, %%mm2\n\t"
549 "packuswb %%mm2, %%mm0\n\t"
/*
 * avg_pixels_x2_mmx: two-stage combination in 16-bit lanes.
 *   stage 1: t = (mm1 + mm4 + mm6) >> 1, where mm4 is loaded one byte past
 *            memory operand %1
 *   stage 2: result = (mm0 + mm6 + t) >> 1, packed back to bytes
 *
 * NOTE(review): mm0 is presumably the existing destination row, mm1 the
 * source row, mm7 zero and mm6 the word-one rounding constant; those lines,
 * the store, the row loop and the operand lists are missing from this
 * excerpt - confirm.
 */
560 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
573 "movq 1%1, %%mm4\n\t"
574 "movq %%mm0, %%mm2\n\t"
575 "movq %%mm1, %%mm3\n\t"
576 "movq %%mm4, %%mm5\n\t"
577 "punpcklbw %%mm7, %%mm1\n\t"
578 "punpckhbw %%mm7, %%mm3\n\t"
579 "punpcklbw %%mm7, %%mm4\n\t"
580 "punpckhbw %%mm7, %%mm5\n\t"
581 "punpcklbw %%mm7, %%mm0\n\t"
582 "punpckhbw %%mm7, %%mm2\n\t"
/* stage 1: combine mm1 with the one-byte-shifted load */
583 "paddusw %%mm4, %%mm1\n\t"
584 "paddusw %%mm5, %%mm3\n\t"
585 "paddusw %%mm6, %%mm1\n\t"
586 "paddusw %%mm6, %%mm3\n\t"
587 "psrlw $1, %%mm1\n\t"
588 "psrlw $1, %%mm3\n\t"
/* stage 2: combine the stage-1 result with mm0 */
589 "paddusw %%mm6, %%mm0\n\t"
590 "paddusw %%mm6, %%mm2\n\t"
591 "paddusw %%mm1, %%mm0\n\t"
592 "paddusw %%mm3, %%mm2\n\t"
593 "psrlw $1, %%mm0\n\t"
594 "psrlw $1, %%mm2\n\t"
595 "packuswb %%mm2, %%mm0\n\t"
/*
 * avg_pixels_y2_mmx: two-stage combination in 16-bit lanes.
 *   stage 1: t = (mm1 + mm4 + mm6) >> 1
 *   stage 2: result = (mm0 + mm6 + t) >> 1, packed back to bytes
 * The visible input operands are the rows *pix and *(pix+line_size).
 *
 * NOTE(review): mm1/mm4 are presumably loaded from those two rows, mm0 is
 * presumably the existing destination row, mm7 zero and mm6 the word-one
 * rounding constant; the loads, the store, the output operand and the row
 * loop are missing from this excerpt - confirm.
 */
605 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
619 "movq %%mm0, %%mm2\n\t"
620 "movq %%mm1, %%mm3\n\t"
621 "movq %%mm4, %%mm5\n\t"
622 "punpcklbw %%mm7, %%mm1\n\t"
623 "punpckhbw %%mm7, %%mm3\n\t"
624 "punpcklbw %%mm7, %%mm4\n\t"
625 "punpckhbw %%mm7, %%mm5\n\t"
626 "punpcklbw %%mm7, %%mm0\n\t"
627 "punpckhbw %%mm7, %%mm2\n\t"
/* stage 1: combine mm1 with mm4 */
628 "paddusw %%mm4, %%mm1\n\t"
629 "paddusw %%mm5, %%mm3\n\t"
630 "paddusw %%mm6, %%mm1\n\t"
631 "paddusw %%mm6, %%mm3\n\t"
632 "psrlw $1, %%mm1\n\t"
633 "psrlw $1, %%mm3\n\t"
/* stage 2: combine the stage-1 result with mm0 */
634 "paddusw %%mm6, %%mm0\n\t"
635 "paddusw %%mm6, %%mm2\n\t"
636 "paddusw %%mm1, %%mm0\n\t"
637 "paddusw %%mm3, %%mm2\n\t"
638 "psrlw $1, %%mm0\n\t"
639 "psrlw $1, %%mm2\n\t"
640 "packuswb %%mm2, %%mm0\n\t"
643 :"m"(*pix), "m"(*(pix+line_size))
/*
 * avg_pixels_xy2_mmx: two-stage combination in 16-bit lanes.
 *   stage 1: t = (mm0 + mm1 + mm4 + mm5 + mm6) >> 2, where mm4 and mm5 are
 *            loaded one byte past memory operands %1 and %2
 *   stage 2: result = (t + mm1' + mm5') >> 1, packed back to bytes, where
 *            mm1' and mm5' are values reloaded between the two shifts
 * The visible input operands include the row at pix+line_size and the
 * mm_wone constant.
 *
 * NOTE(review): mm1' is presumably the existing destination row and mm5'
 * presumably mm_wone, both loaded by lines missing from this excerpt, as
 * are the initial loads of mm0/mm1, the setup of mm6/mm7, the store, the
 * row loop and the remaining operands - confirm.
 */
650 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
657 // this doesn't seem to be used often - so
658 // the inside usage of mm_wone is not optimized
664 "movq 1%1, %%mm4\n\t"
665 "movq 1%2, %%mm5\n\t"
666 "movq %%mm0, %%mm2\n\t"
667 "movq %%mm1, %%mm3\n\t"
668 "punpcklbw %%mm7, %%mm0\n\t"
669 "punpcklbw %%mm7, %%mm1\n\t"
670 "punpckhbw %%mm7, %%mm2\n\t"
671 "punpckhbw %%mm7, %%mm3\n\t"
672 "paddusw %%mm1, %%mm0\n\t"
673 "paddusw %%mm3, %%mm2\n\t"
674 "movq %%mm4, %%mm1\n\t"
675 "movq %%mm5, %%mm3\n\t"
676 "punpcklbw %%mm7, %%mm4\n\t"
677 "punpcklbw %%mm7, %%mm5\n\t"
678 "punpckhbw %%mm7, %%mm1\n\t"
679 "punpckhbw %%mm7, %%mm3\n\t"
680 "paddusw %%mm5, %%mm4\n\t"
681 "paddusw %%mm3, %%mm1\n\t"
682 "paddusw %%mm6, %%mm4\n\t"
683 "paddusw %%mm6, %%mm1\n\t"
684 "paddusw %%mm4, %%mm0\n\t"
685 "paddusw %%mm1, %%mm2\n\t"
/* stage 1 result: four-term sum divided by four */
687 "psrlw $2, %%mm0\n\t"
689 "psrlw $2, %%mm2\n\t"
/* stage 2: widen the reloaded mm1 and fold it in together with mm5 */
690 "movq %%mm1, %%mm3\n\t"
691 "punpcklbw %%mm7, %%mm1\n\t"
692 "punpckhbw %%mm7, %%mm3\n\t"
693 "paddusw %%mm1, %%mm0\n\t"
694 "paddusw %%mm3, %%mm2\n\t"
695 "paddusw %%mm5, %%mm0\n\t"
696 "paddusw %%mm5, %%mm2\n\t"
697 "psrlw $1, %%mm0\n\t"
698 "psrlw $1, %%mm2\n\t"
699 "packuswb %%mm2, %%mm0\n\t"
703 "m"(*(pix+line_size)), "m"(mm_wone)
/*
 * avg_no_rnd_pixels_mmx: combine the bytes in mm0 and mm1 as
 * (a + b) >> 1 in 16-bit lanes with no constant added before the shift,
 * then pack to bytes.
 *
 * NOTE(review): mm0/mm1 are presumably the existing destination row and the
 * source row and mm7 presumably zero; the loads, the store, the row loop
 * and the operand lists are missing from this excerpt - confirm.
 */
710 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
721 "movq %%mm0, %%mm2\n\t"
722 "movq %%mm1, %%mm3\n\t"
723 "punpcklbw %%mm7, %%mm0\n\t"
724 "punpcklbw %%mm7, %%mm1\n\t"
725 "punpckhbw %%mm7, %%mm2\n\t"
726 "punpckhbw %%mm7, %%mm3\n\t"
727 "paddusw %%mm1, %%mm0\n\t"
728 "paddusw %%mm3, %%mm2\n\t"
729 "psrlw $1, %%mm0\n\t"
730 "psrlw $1, %%mm2\n\t"
731 "packuswb %%mm2, %%mm0\n\t"
/*
 * avg_no_rnd_pixels_x2_mmx: two-stage combination in 16-bit lanes with no
 * constant added before either shift.
 *   stage 1: t = (mm0 + mm1) >> 1, where mm1 is loaded one byte past
 *            memory operand %1
 *   stage 2: result = (t + mm4) >> 1, packed back to bytes
 *
 * NOTE(review): mm0 is presumably the source row, mm4 presumably the
 * existing destination row and mm7 presumably zero; those loads, the store,
 * the row loop and the operand lists are missing from this excerpt -
 * confirm.
 */
741 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
751 "movq 1%1, %%mm1\n\t"
753 "movq %%mm0, %%mm2\n\t"
754 "movq %%mm1, %%mm3\n\t"
755 "movq %%mm4, %%mm5\n\t"
756 "punpcklbw %%mm7, %%mm0\n\t"
757 "punpcklbw %%mm7, %%mm1\n\t"
758 "punpckhbw %%mm7, %%mm2\n\t"
759 "punpckhbw %%mm7, %%mm3\n\t"
760 "punpcklbw %%mm7, %%mm4\n\t"
761 "punpckhbw %%mm7, %%mm5\n\t"
/* stage 1 */
762 "paddusw %%mm1, %%mm0\n\t"
763 "paddusw %%mm3, %%mm2\n\t"
764 "psrlw $1, %%mm0\n\t"
765 "psrlw $1, %%mm2\n\t"
/* stage 2 */
766 "paddusw %%mm4, %%mm0\n\t"
767 "paddusw %%mm5, %%mm2\n\t"
768 "psrlw $1, %%mm0\n\t"
769 "psrlw $1, %%mm2\n\t"
770 "packuswb %%mm2, %%mm0\n\t"
/*
 * avg_no_rnd_pixels_y2_mmx: two-stage combination in 16-bit lanes with no
 * constant added before either shift.
 *   stage 1: t = (mm0 + mm1) >> 1
 *   stage 2: result = (t + mm4) >> 1, packed back to bytes
 * The visible input operands are the rows *pix and *(pix+line_size).
 *
 * NOTE(review): mm0/mm1 are presumably loaded from those two rows, mm4 is
 * presumably the existing destination row and mm7 presumably zero; the
 * loads, the store, the output operand and the row loop are missing from
 * this excerpt - confirm.
 */
780 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
792 "movq %%mm0, %%mm2\n\t"
793 "movq %%mm1, %%mm3\n\t"
794 "movq %%mm4, %%mm5\n\t"
795 "punpcklbw %%mm7, %%mm0\n\t"
796 "punpcklbw %%mm7, %%mm1\n\t"
797 "punpckhbw %%mm7, %%mm2\n\t"
798 "punpckhbw %%mm7, %%mm3\n\t"
799 "punpcklbw %%mm7, %%mm4\n\t"
800 "punpckhbw %%mm7, %%mm5\n\t"
/* stage 1 */
801 "paddusw %%mm1, %%mm0\n\t"
802 "paddusw %%mm3, %%mm2\n\t"
803 "psrlw $1, %%mm0\n\t"
804 "psrlw $1, %%mm2\n\t"
/* stage 2 */
805 "paddusw %%mm4, %%mm0\n\t"
806 "paddusw %%mm5, %%mm2\n\t"
807 "psrlw $1, %%mm0\n\t"
808 "psrlw $1, %%mm2\n\t"
809 "packuswb %%mm2, %%mm0\n\t"
812 :"m"(*pix), "m"(*(pix+line_size))
/*
 * avg_no_rnd_pixels_xy2_mmx: two-stage combination in 16-bit lanes.
 *   stage 1: t = (mm0 + mm1 + mm4 + mm5 + mm6) >> 2, where mm4 and mm5 are
 *            loaded one byte past memory operands %1 and %2
 *   stage 2: result = (t + mm1') >> 1 with no constant added, packed back
 *            to bytes, where mm1' is a value reloaded before the shifts
 * One asm operand is the row at pix+line_size.
 *
 * NOTE(review): mm1' is presumably the existing destination row, mm7
 * presumably zero and mm6 presumably the word-one constant; those lines,
 * the initial loads of mm0/mm1, the store, the row loop and the remaining
 * operands are missing from this excerpt - confirm.
 */
819 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
832 "movq 1%1, %%mm4\n\t"
833 "movq 1%2, %%mm5\n\t"
834 "movq %%mm0, %%mm2\n\t"
835 "movq %%mm1, %%mm3\n\t"
836 "punpcklbw %%mm7, %%mm0\n\t"
837 "punpcklbw %%mm7, %%mm1\n\t"
838 "punpckhbw %%mm7, %%mm2\n\t"
839 "punpckhbw %%mm7, %%mm3\n\t"
840 "paddusw %%mm1, %%mm0\n\t"
841 "paddusw %%mm3, %%mm2\n\t"
842 "movq %%mm4, %%mm1\n\t"
843 "movq %%mm5, %%mm3\n\t"
844 "punpcklbw %%mm7, %%mm4\n\t"
845 "punpcklbw %%mm7, %%mm5\n\t"
846 "punpckhbw %%mm7, %%mm1\n\t"
847 "punpckhbw %%mm7, %%mm3\n\t"
848 "paddusw %%mm5, %%mm4\n\t"
849 "paddusw %%mm3, %%mm1\n\t"
850 "paddusw %%mm6, %%mm4\n\t"
851 "paddusw %%mm6, %%mm1\n\t"
852 "paddusw %%mm4, %%mm0\n\t"
853 "paddusw %%mm1, %%mm2\n\t"
/* stage 1 result, interleaved with widening the reloaded mm1 */
855 "psrlw $2, %%mm0\n\t"
856 "movq %%mm1, %%mm3\n\t"
857 "psrlw $2, %%mm2\n\t"
858 "punpcklbw %%mm7, %%mm1\n\t"
859 "punpckhbw %%mm7, %%mm3\n\t"
/* stage 2 */
860 "paddusw %%mm1, %%mm0\n\t"
861 "paddusw %%mm3, %%mm2\n\t"
862 "psrlw $1, %%mm0\n\t"
863 "psrlw $1, %%mm2\n\t"
864 "packuswb %%mm2, %%mm0\n\t"
868 "m"(*(pix+line_size))
/*
 * clear_blocks_mmx: write zero quadwords over the memory at blocks.
 *
 * The base operand is the address of blocks plus 128*6 bytes, and %eax is a
 * negative byte offset that starts at -128*6 and advances by 32 after each
 * group of four 8-byte stores.
 *
 * NOTE(review): the loop label, the branch and the clobber list are missing
 * from this excerpt; presumably the loop runs until %eax reaches zero,
 * clearing 768 bytes in total - confirm. The (int) cast of the pointer
 * assumes 32-bit pointers.
 */
875 static void clear_blocks_mmx(DCTELEM *blocks)
/* mm7 = 0, the value being stored */
878 "pxor %%mm7, %%mm7 \n\t"
879 "movl $-128*6, %%eax \n\t"
881 "movq %%mm7, (%0, %%eax) \n\t"
882 "movq %%mm7, 8(%0, %%eax) \n\t"
883 "movq %%mm7, 16(%0, %%eax) \n\t"
884 "movq %%mm7, 24(%0, %%eax) \n\t"
885 "addl $32, %%eax \n\t"
887 : : "r" (((int)blocks)+128*6)
/* No-op stub: returns immediately. dsputil_init_mmx() contains assignments
   of this function to the DSP function pointers.
   NOTE(review): those assignments are presumably inside a disabled
   debugging section whose conditional is not visible in this excerpt. */
893 static void just_return() { return; }
/*
 * dsputil_init_mmx: query the CPU capability flags and install the
 * matching MMX / MMXEXT / 3DNow! routines into the global DSP function
 * pointers and tables.
 *
 * Order of the visible work:
 *   1. mm_flags = mm_support(), followed by a printed flag summary.
 *   2. If MM_MMX is set, install the plain-MMX routines.
 *   3. If MM_MMXEXT is set, override some entries with the *_mmx2 routines;
 *      otherwise, if MM_3DNOW is set, override with the *_3dnow routines.
 *   4. Select an IDCT implementation for ff_idct.
 *   5. A group of assignments that replace the pointers with just_return.
 *
 * NOTE(review): braces and preprocessor conditionals are missing from this
 * excerpt. The bodies of the per-flag printf branches, the conditions that
 * choose between the three ff_idct assignments, and the guard around the
 * just_return group (presumably a disabled debugging section) cannot be
 * seen here - confirm against the full file.
 */
897 void dsputil_init_mmx(void)
899 mm_flags = mm_support();
/* report detected capabilities; the per-flag output lines are not visible */
901 printf("libavcodec: CPU flags:");
902 if (mm_flags & MM_MMX)
904 if (mm_flags & MM_MMXEXT)
906 if (mm_flags & MM_3DNOW)
908 if (mm_flags & MM_SSE)
910 if (mm_flags & MM_SSE2)
/* baseline: plain MMX implementations */
915 if (mm_flags & MM_MMX) {
916 get_pixels = get_pixels_mmx;
917 diff_pixels = diff_pixels_mmx;
918 put_pixels_clamped = put_pixels_clamped_mmx;
919 add_pixels_clamped = add_pixels_clamped_mmx;
920 clear_blocks= clear_blocks_mmx;
922 pix_abs16x16 = pix_abs16x16_mmx;
923 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
924 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
925 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
926 pix_abs8x8 = pix_abs8x8_mmx;
927 pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
928 pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
929 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
/* table index: 0 = plain, 1 = x2, 2 = y2, 3 = xy2 variant */
932 put_pixels_tab[0] = put_pixels_mmx;
933 put_pixels_tab[1] = put_pixels_x2_mmx;
934 put_pixels_tab[2] = put_pixels_y2_mmx;
935 put_pixels_tab[3] = put_pixels_xy2_mmx;
/* entry 0 reuses put_pixels_mmx: a plain copy involves no rounding */
937 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
938 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
939 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
940 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
942 avg_pixels_tab[0] = avg_pixels_mmx;
943 avg_pixels_tab[1] = avg_pixels_x2_mmx;
944 avg_pixels_tab[2] = avg_pixels_y2_mmx;
945 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
947 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
948 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
949 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
950 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
/* MMXEXT overrides: *_mmx2 routines */
952 if (mm_flags & MM_MMXEXT) {
953 pix_abs16x16 = pix_abs16x16_mmx2;
954 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
955 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
956 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
958 pix_abs8x8 = pix_abs8x8_mmx2;
959 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
960 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
961 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
963 put_pixels_tab[1] = put_pixels_x2_mmx2;
964 put_pixels_tab[2] = put_pixels_y2_mmx2;
965 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
966 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
968 avg_pixels_tab[0] = avg_pixels_mmx2;
969 avg_pixels_tab[1] = avg_pixels_x2_mmx2;
970 avg_pixels_tab[2] = avg_pixels_y2_mmx2;
971 avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
/* 3DNow! overrides, only when MMXEXT is absent */
972 } else if (mm_flags & MM_3DNOW) {
973 put_pixels_tab[1] = put_pixels_x2_3dnow;
974 put_pixels_tab[2] = put_pixels_y2_3dnow;
975 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
976 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
978 avg_pixels_tab[0] = avg_pixels_3dnow;
979 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
980 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
981 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
/* IDCT selection; the conditionals separating these assignments are not
   visible in this excerpt */
985 if (mm_flags & MM_MMXEXT) {
986 ff_idct = ff_mmxext_idct;
988 ff_idct = ff_mmx_idct;
991 // ff_idct = simple_idct;
992 ff_idct = simple_idct_mmx;
/* stub assignments: every pointer below is replaced with the no-op
   just_return (guarding conditional not visible in this excerpt) */
998 get_pixels = just_return;
999 put_pixels_clamped = just_return;
1000 add_pixels_clamped = just_return;
1002 pix_abs16x16 = just_return;
1003 pix_abs16x16_x2 = just_return;
1004 pix_abs16x16_y2 = just_return;
1005 pix_abs16x16_xy2 = just_return;
1007 put_pixels_tab[0] = just_return;
1008 put_pixels_tab[1] = just_return;
1009 put_pixels_tab[2] = just_return;
1010 put_pixels_tab[3] = just_return;
1012 put_no_rnd_pixels_tab[0] = just_return;
1013 put_no_rnd_pixels_tab[1] = just_return;
1014 put_no_rnd_pixels_tab[2] = just_return;
1015 put_no_rnd_pixels_tab[3] = just_return;
1017 avg_pixels_tab[0] = just_return;
1018 avg_pixels_tab[1] = just_return;
1019 avg_pixels_tab[2] = just_return;
1020 avg_pixels_tab[3] = just_return;
1022 avg_no_rnd_pixels_tab[0] = just_return;
1023 avg_no_rnd_pixels_tab[1] = just_return;
1024 avg_no_rnd_pixels_tab[2] = just_return;
1025 avg_no_rnd_pixels_tab[3] = just_return;
1027 //av_fdct = just_return;
1028 //ff_idct = just_return;
1032 /* remove any non bit exact operation (testing purpose). NOTE that
1033 this function should be kept as small as possible because it is
1034 always difficult to test automatically non bit exact cases. */
/* When MM_MMX is set and either MM_MMXEXT or MM_3DNOW is also set, three
   table entries are pointed back at the plain-MMX routines:
   put_no_rnd_pixels_tab[1], put_no_rnd_pixels_tab[2] and avg_pixels_tab[3].
   Both branches perform the same three assignments. Reads the global
   mm_flags, so it relies on dsputil_init_mmx() having set it.
   NOTE(review): the closing braces are not visible in this excerpt. */
1035 void dsputil_set_bit_exact_mmx(void)
1037 if (mm_flags & MM_MMX) {
1038 if (mm_flags & MM_MMXEXT) {
1039 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1040 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1041 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
1042 } else if (mm_flags & MM_3DNOW) {
1043 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
1044 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
1045 avg_pixels_tab[3] = avg_pixels_xy2_mmx;