/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"

static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
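
/* Scalar equivalent of get_pixels_mmx, for reference (a sketch, not part of
 * the original file): widen an 8x8 block of bytes into 16-bit DCT elements.
 *
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < 8; j++)
 *             block[i*8 + j] = pixels[i*line_size + j];
 */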
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7,      %%xmm7       \n\t"
        "movq (%0),        %%xmm0       \n\t"
        "movq (%0, %2),    %%xmm1       \n\t"
        "movq (%0, %2,2),  %%xmm2       \n\t"
        "movq (%0, %3),    %%xmm3       \n\t"
        "lea (%0,%2,4), %0              \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0,      (%1)       \n\t"
        "movdqa %%xmm1,    16(%1)       \n\t"
        "movdqa %%xmm2,    32(%1)       \n\t"
        "movdqa %%xmm3,    48(%1)       \n\t"
        "movq (%0),        %%xmm0       \n\t"
        "movq (%0, %2),    %%xmm1       \n\t"
        "movq (%0, %2,2),  %%xmm2       \n\t"
        "movq (%0, %3),    %%xmm3       \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0,    64(%1)       \n\t"
        "movdqa %%xmm1,    80(%1)       \n\t"
        "movdqa %%xmm2,    96(%1)       \n\t"
        "movdqa %%xmm3,   112(%1)       \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}
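
/* The SSE2 version above handles the eight rows in two groups of four,
 * addressing rows 0..3 as base, base+stride, base+2*stride and base+3*stride
 * (the line_size*3 input), then advancing the base by 4*stride with lea. */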
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
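
/* Scalar equivalent of diff_pixels_mmx (reference sketch):
 *
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < 8; j++)
 *             block[i*8 + j] = s1[i*stride + j] - s2[i*stride + j];
 */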
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}
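
/* pix_sum16_mmx sums all 256 pixels of a 16x16 block; scalar reference
 * (a sketch, not part of the original file):
 *
 *     int sum = 0;
 *     for (int i = 0; i < 16; i++)
 *         for (int j = 0; j < 16; j++)
 *             sum += pix[i*line_size + j];
 */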
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
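
/* pix_norm1_mmx returns the sum of squares of a 16x16 block, i.e. the sum
 * of pix[x]*pix[x] over all pixels; pmaddwd squares and pairwise-adds four
 * 16-bit words per instruction, accumulating dwords in mm7. */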
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
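
/* The saturation trick used above: with unsigned bytes, psubusb clamps
 * negative results to zero, so (a -us b) | (b -us a) == |a - b|.  The
 * scalar equivalent of sse8/sse16 is simply (reference sketch):
 *
 *     int sum = 0;
 *     for (int y = 0; y < h; y++)
 *         for (int x = 0; x < 8; x++) {   // 16 for the sse16 versions
 *             int d = pix1[y*line_size + x] - pix2[y*line_size + x];
 *             sum += d*d;
 *         }
 */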
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"      /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"      /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"      /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n"   /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n"   /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "dec %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
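
/* hf_noise{8,16} estimate high-frequency noise as the sum of absolute
 * second-order differences, roughly matching the C nsse code:
 * sum over x,y of |p[x][y] - p[x+1][y] - p[x][y+1] + p[x+1][y+1]|.
 * The sign-mask sequence (pxor/pcmpgtw/pxor/psubw) is the MMX absolute
 * value idiom used throughout this file. */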
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
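
/* vsad_intra16 measures vertical activity inside one picture: the sum of
 * |pix[x] - pix[x + line_size]| over a 16-pixel-wide strip.  The MMX2
 * variant below replaces the unpack/add ladder with psadbw, which computes
 * a sum of absolute byte differences in a single instruction. */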
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
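
/* vsad16 applies the same vertical-gradient measure to the difference
 * between two pictures: sum of |d[x] - d[x + line_size]| with
 * d = pix1 - pix2.  Since d can be negative, the bytes are biased by 0x80
 * first (the pcmpeqw/psllw/packsswb sequence materializes 0x8080...80
 * without a memory load), turning signed differences into unsigned bytes. */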
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
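
/* The predictor is the median of L, T and L+T-LT, as in the scalar code
 * above.  A median of three can be built from min/max alone:
 *
 *     mid = max(min(L, T), min(max(L, T), L + T - LT));
 *
 * which is exactly what the pmaxub/pminub sequence computes per byte. */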
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw     "#t", "#a"           \n\t"

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                     \n\t"\
        "add %4, %2                     \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0            \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0            \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"
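
/* One butterfly stage: LBUTTERFLY2 maps (a,b) to (a+b, b-a) for two pairs
 * at once (paddw b,a; paddw b,b; psubw a,b leaves b = 2*b - (a+b) = b-a). */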
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"
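
/* Three generations of the same |a| idiom: MMABS_MMX builds a sign mask
 * with pcmpgtw and computes (a ^ sign) - sign; MMABS_MMX2 uses
 * pmaxsw(a, -a); SSSE3 has a native pabsw. */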
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\
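
/* hadamard8_diff computes a SATD-style metric: the 8x8 difference block is
 * transformed with an 8-point Hadamard transform along one axis, transposed,
 * transformed again along the other, and the absolute values of all
 * transformed coefficients are summed. */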
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)
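
/* DCT_SAD computes the sum of |block[i]| over the 64 coefficients of a DCT
 * block (the dsputil sum_abs_dctelem operation), accumulating in two
 * registers to hide latency before the final horizontal sum. */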
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS
#undef DCT_SAD

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#undef DCT_SAD
#endif
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=0;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "add $8, %0 \n"
        "cmp %4, %0 \n"
        "jb 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2), "r"((x86_reg)size)
    );
    return sum;
}
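
/* Scalar equivalent of ssd_int8_vs_int16_mmx (reference sketch):
 *
 *     int sum = 0;
 *     for (int i = 0; i < size; i++) {
 *         int d = pix1[i] - pix2[i];
 *         sum += d*d;
 *     }
 *
 * The punpck/psraw pairs above sign-extend the int8_t samples to 16 bits:
 * each byte lands in the high half of a word, and psraw $8 shifts it back
 * down with sign extension, discarding whatever was in the low half. */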
#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"

/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
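
/* PMULHRW emulates a rounded fixed-point multiply on plain MMX: pmulhw
 * yields the high 16 bits of the product, then adding the SET_RND constant
 * and shifting right by one approximates the rounding that pmulhrw (3DNow!)
 * and pmulhrsw (SSSE3) perform natively; the per-CPU redefinitions below
 * swap in the native instruction where available. */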
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & FF_MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (CONFIG_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#if HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}