2 * SIMD-optimized motion estimation
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/me_cmp.h"
30 #include "libavcodec/mpegvideo.h"
/**
 * Sum of squared differences between two 8-pixel-wide blocks (MMX).
 *
 * Two rows are processed per iteration: the absolute byte difference of
 * each pixel pair is built from two saturating subtractions (one of each
 * pair saturates to zero), the bytes are widened to 16 bits, then squared
 * and pairwise-summed into 32-bit lanes with pmaddwd, accumulating in mm7.
 *
 * @param v      codec context (unused; present for the me_cmp_func signature)
 * @param pix1   first source block
 * @param pix2   second source block
 * @param stride line size in bytes of both blocks
 * @param h      number of rows to compare
 * @return sum of squared differences
 */
static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h)
    "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */
    "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */
    /* load two rows from each block */
    "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */
    "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */
    "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */
    "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */
    /* todo: mm1-mm2, mm3-mm4 */
    /* algo: subtract mm1 from mm2 with saturation and vice versa */
    /* OR the results to get absolute difference */
    "movq %%mm1, %%mm5 \n"
    "movq %%mm3, %%mm6 \n"
    "psubusb %%mm2, %%mm1 \n"
    "psubusb %%mm4, %%mm3 \n"
    "psubusb %%mm5, %%mm2 \n"
    "psubusb %%mm6, %%mm4 \n"
    /* now convert to 16-bit vectors so we can square them */
    "movq %%mm2, %%mm1 \n"
    "movq %%mm4, %%mm3 \n"
    "punpckhbw %%mm0, %%mm2 \n"
    "punpckhbw %%mm0, %%mm4 \n"
    "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
    "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
    /* square each word and sum adjacent pairs into dwords */
    "pmaddwd %%mm2, %%mm2 \n"
    "pmaddwd %%mm4, %%mm4 \n"
    "pmaddwd %%mm1, %%mm1 \n"
    "pmaddwd %%mm3, %%mm3 \n"
    "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * stride */
    "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * stride */
    "paddd %%mm2, %%mm1 \n"
    "paddd %%mm4, %%mm3 \n"
    "paddd %%mm1, %%mm7 \n"
    "paddd %%mm3, %%mm7 \n"
    /* fold the two dword partial sums in mm7 to one scalar */
    "movq %%mm7, %%mm1 \n"
    "psrlq $32, %%mm7 \n" /* shift hi dword to lo */
    "paddd %%mm7, %%mm1 \n"
    : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    : "r" (stride), "m" (h)
/**
 * Sum of squared differences between two 16-pixel-wide blocks (MMX).
 *
 * Same algorithm as sse8_mmx, but one full 16-byte row is handled per
 * iteration (two 8-byte halves in parallel) instead of two 8-byte rows.
 *
 * @param v      codec context (unused; me_cmp_func signature)
 * @param pix1   first source block
 * @param pix2   second source block
 * @param stride line size in bytes of both blocks
 * @param h      number of rows to compare
 * @return sum of squared differences
 */
static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
    "pxor %%mm0, %%mm0\n" /* mm0 = 0 */
    "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */
    /* load one 16-byte row from each block */
    "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */
    "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */
    "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */
    "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */
    /* todo: mm1-mm2, mm3-mm4 */
    /* algo: subtract mm1 from mm2 with saturation and vice versa */
    /* OR the results to get absolute difference */
    "movq %%mm1, %%mm5\n"
    "movq %%mm3, %%mm6\n"
    "psubusb %%mm2, %%mm1\n"
    "psubusb %%mm4, %%mm3\n"
    "psubusb %%mm5, %%mm2\n"
    "psubusb %%mm6, %%mm4\n"
    /* now convert to 16-bit vectors so we can square them */
    "movq %%mm2, %%mm1\n"
    "movq %%mm4, %%mm3\n"
    "punpckhbw %%mm0, %%mm2\n"
    "punpckhbw %%mm0, %%mm4\n"
    "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
    "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
    /* square each word and sum adjacent pairs into dwords */
    "pmaddwd %%mm2, %%mm2\n"
    "pmaddwd %%mm4, %%mm4\n"
    "pmaddwd %%mm1, %%mm1\n"
    "pmaddwd %%mm3, %%mm3\n"
    "paddd %%mm2, %%mm1\n"
    "paddd %%mm4, %%mm3\n"
    "paddd %%mm1, %%mm7\n"
    "paddd %%mm3, %%mm7\n"
    /* fold the two dword partial sums in mm7 to one scalar */
    "movq %%mm7, %%mm1\n"
    "psrlq $32, %%mm7\n" /* shift hi dword to lo */
    "paddd %%mm7, %%mm1\n"
    : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    : "r" (stride), "m" (h)
/**
 * Estimate the high-frequency content of an 8-pixel-wide block (MMX).
 *
 * Accumulates (in mm6) the absolute second-order differences between
 * successive rows: first-order row differences are formed with psubw on
 * zero-extended bytes, subtracted from the previous row's differences, and
 * the absolute value is taken with the pcmpgtw/pxor/psubw idiom.  The loop
 * runs h - 2 times (see the "g" (h - 2) input operand), since the second
 * difference needs three rows.  Used as the noise term by the NSSE
 * comparison functions.
 *
 * @param pix1   source block
 * @param stride line size in bytes
 * @param h      block height in rows
 * @return accumulated absolute second-order row differences
 */
static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
    "pxor %%mm7, %%mm7\n" /* mm7 = 0 (unpack helper) */
    "pxor %%mm6, %%mm6\n" /* mm6 = running sum */
    "movq %%mm0, %%mm1\n"
    /* widen the two loaded rows to words and take their difference */
    "movq %%mm0, %%mm2\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm0\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm2\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm0\n"
    "psubw %%mm3, %%mm2\n"
    "movq %%mm4, %%mm1\n"
    "movq %%mm4, %%mm5\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm4\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm5\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm4\n"
    "psubw %%mm3, %%mm5\n"
    /* second difference: previous row delta minus current row delta */
    "psubw %%mm4, %%mm0\n"
    "psubw %%mm5, %%mm2\n"
    /* absolute value: xor with sign mask, then subtract it */
    "pxor %%mm3, %%mm3\n"
    "pxor %%mm1, %%mm1\n"
    "pcmpgtw %%mm0, %%mm3\n\t"
    "pcmpgtw %%mm2, %%mm1\n\t"
    "pxor %%mm3, %%mm0\n"
    "pxor %%mm1, %%mm2\n"
    "psubw %%mm3, %%mm0\n"
    "psubw %%mm1, %%mm2\n"
    "paddw %%mm0, %%mm2\n"
    "paddw %%mm2, %%mm6\n" /* accumulate */
    "movq %%mm0, %%mm1\n"
    /* same pattern, second loop body half (registers rotated) */
    "movq %%mm0, %%mm2\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm0\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm2\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm0\n"
    "psubw %%mm3, %%mm2\n"
    "psubw %%mm0, %%mm4\n"
    "psubw %%mm2, %%mm5\n"
    "pxor %%mm3, %%mm3\n"
    "pxor %%mm1, %%mm1\n"
    "pcmpgtw %%mm4, %%mm3\n\t"
    "pcmpgtw %%mm5, %%mm1\n\t"
    "pxor %%mm3, %%mm4\n"
    "pxor %%mm1, %%mm5\n"
    "psubw %%mm3, %%mm4\n"
    "psubw %%mm1, %%mm5\n"
    "paddw %%mm4, %%mm5\n"
    "paddw %%mm5, %%mm6\n" /* accumulate */
    "movq %%mm4, %%mm1\n"
    /* third loop body half */
    "movq %%mm4, %%mm5\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm4\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm5\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm4\n"
    "psubw %%mm3, %%mm5\n"
    "psubw %%mm4, %%mm0\n"
    "psubw %%mm5, %%mm2\n"
    "pxor %%mm3, %%mm3\n"
    "pxor %%mm1, %%mm1\n"
    "pcmpgtw %%mm0, %%mm3\n\t"
    "pcmpgtw %%mm2, %%mm1\n\t"
    "pxor %%mm3, %%mm0\n"
    "pxor %%mm1, %%mm2\n"
    "psubw %%mm3, %%mm0\n"
    "psubw %%mm1, %%mm2\n"
    "paddw %%mm0, %%mm2\n"
    "paddw %%mm2, %%mm6\n" /* accumulate */
    /* horizontal sum of the 4 words in mm6 down to one dword */
    "movq %%mm6, %%mm0\n"
    "punpcklwd %%mm7, %%mm0\n"
    "punpckhwd %%mm7, %%mm6\n"
    "paddd %%mm0, %%mm6\n"
    "movq %%mm6, %%mm0\n"
    "paddd %%mm6, %%mm0\n"
    : "+r" (pix1), "=r" (tmp)
    : "r" (stride), "g" (h - 2)
/**
 * Estimate the high-frequency content of a 16-pixel-wide block (MMX).
 *
 * Like hf_noise8_mmx, but the first-order difference is horizontal: each
 * row is compared against itself shifted by one pixel (the unaligned
 * "movq 1(%0)" loads), and the absolute second-order difference across
 * rows is accumulated in mm6.  The asm covers the left 8 columns; the
 * right half is handled by the trailing hf_noise8_mmx call.
 *
 * @param pix1   source block
 * @param stride line size in bytes
 * @param h      block height in rows (loop runs h - 2 times)
 * @return accumulated absolute second-order differences for the block
 */
static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
    "pxor %%mm7, %%mm7\n" /* mm7 = 0 (unpack helper) */
    "pxor %%mm6, %%mm6\n" /* mm6 = running sum */
    "movq 1(%0), %%mm1\n" /* row shifted right by one pixel */
    "movq %%mm0, %%mm2\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm0\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm2\n"
    "punpckhbw %%mm7, %%mm3\n"
    /* horizontal first-order difference of the current row */
    "psubw %%mm1, %%mm0\n"
    "psubw %%mm3, %%mm2\n"
    "movq 1(%0), %%mm1\n"
    "movq %%mm4, %%mm5\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm4\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm5\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm4\n"
    "psubw %%mm3, %%mm5\n"
    /* second-order difference across rows */
    "psubw %%mm4, %%mm0\n"
    "psubw %%mm5, %%mm2\n"
    /* absolute value via sign-mask xor/subtract */
    "pxor %%mm3, %%mm3\n"
    "pxor %%mm1, %%mm1\n"
    "pcmpgtw %%mm0, %%mm3\n\t"
    "pcmpgtw %%mm2, %%mm1\n\t"
    "pxor %%mm3, %%mm0\n"
    "pxor %%mm1, %%mm2\n"
    "psubw %%mm3, %%mm0\n"
    "psubw %%mm1, %%mm2\n"
    "paddw %%mm0, %%mm2\n"
    "paddw %%mm2, %%mm6\n" /* accumulate */
    "movq 1(%0), %%mm1\n"
    /* second loop body half (registers rotated) */
    "movq %%mm0, %%mm2\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm0\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm2\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm0\n"
    "psubw %%mm3, %%mm2\n"
    "psubw %%mm0, %%mm4\n"
    "psubw %%mm2, %%mm5\n"
    "pxor %%mm3, %%mm3\n"
    "pxor %%mm1, %%mm1\n"
    "pcmpgtw %%mm4, %%mm3\n\t"
    "pcmpgtw %%mm5, %%mm1\n\t"
    "pxor %%mm3, %%mm4\n"
    "pxor %%mm1, %%mm5\n"
    "psubw %%mm3, %%mm4\n"
    "psubw %%mm1, %%mm5\n"
    "paddw %%mm4, %%mm5\n"
    "paddw %%mm5, %%mm6\n" /* accumulate */
    "movq 1(%0), %%mm1\n"
    /* third loop body half */
    "movq %%mm4, %%mm5\n"
    "movq %%mm1, %%mm3\n"
    "punpcklbw %%mm7, %%mm4\n"
    "punpcklbw %%mm7, %%mm1\n"
    "punpckhbw %%mm7, %%mm5\n"
    "punpckhbw %%mm7, %%mm3\n"
    "psubw %%mm1, %%mm4\n"
    "psubw %%mm3, %%mm5\n"
    "psubw %%mm4, %%mm0\n"
    "psubw %%mm5, %%mm2\n"
    "pxor %%mm3, %%mm3\n"
    "pxor %%mm1, %%mm1\n"
    "pcmpgtw %%mm0, %%mm3\n\t"
    "pcmpgtw %%mm2, %%mm1\n\t"
    "pxor %%mm3, %%mm0\n"
    "pxor %%mm1, %%mm2\n"
    "psubw %%mm3, %%mm0\n"
    "psubw %%mm1, %%mm2\n"
    "paddw %%mm0, %%mm2\n"
    "paddw %%mm2, %%mm6\n" /* accumulate */
    /* horizontal sum of mm6 into a single dword */
    "movq %%mm6, %%mm0\n"
    "punpcklwd %%mm7, %%mm0\n"
    "punpckhwd %%mm7, %%mm6\n"
    "paddd %%mm0, %%mm6\n"
    "movq %%mm6, %%mm0\n"
    "paddd %%mm6, %%mm0\n"
    : "+r" (pix1), "=r" (tmp)
    : "r" (stride), "g" (h - 2)
    /* NOTE(review): 'pix' is presumably a copy of pix1 saved before the asm
     * advanced the %0 operand; its declaration is not visible here — confirm. */
    return tmp + hf_noise8_mmx(pix + 8, stride, h);
/**
 * Noise-weighted SSE for a 16-pixel-wide block.
 *
 * score1 is the plain SSE of the two blocks; score2 is the difference in
 * high-frequency "noise" between them.  The final cost penalizes blocks
 * whose noise level differs from the source, weighted by nsse_weight.
 * NOTE(review): both score1 computations and both return statements appear
 * here — they are presumably alternatives selected by preprocessor
 * conditionals (encoder-context availability) not visible in this view.
 */
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
    score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    score1 = sse16_mmx(c, pix1, pix2, stride, h);
    score2 = hf_noise16_mmx(pix1, stride, h) -
             hf_noise16_mmx(pix2, stride, h);
    return score1 + FFABS(score2) * c->avctx->nsse_weight;
    return score1 + FFABS(score2) * 8; /* fallback weight when no context */
/**
 * Noise-weighted SSE for an 8-pixel-wide block.
 *
 * Same idea as nsse16_mmx but on the 8-wide SSE / noise primitives.
 * NOTE(review): the two return statements are presumably alternatives
 * selected by preprocessor conditionals not visible in this view.
 */
static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
    int score1 = sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = hf_noise8_mmx(pix1, stride, h) -
                 hf_noise8_mmx(pix2, stride, h);
    return score1 + FFABS(score2) * c->avctx->nsse_weight;
    return score1 + FFABS(score2) * 8; /* fallback weight when no context */
/**
 * Vertical SAD of a single 16-pixel-wide block against itself (intra):
 * sum of absolute differences between each row and the previous row (MMX).
 *
 * The SUM macro compares the current 16-byte row against the previous one
 * held in (in0, in1), using the saturating-subtract/por absolute-difference
 * idiom, widens to words and accumulates into mm6.  Two rows are processed
 * per loop iteration, ping-ponging the row registers.
 *
 * @param v      codec context (unused; me_cmp_func signature)
 * @param pix    source block, must be 8-byte aligned
 * @param dummy  unused second-block pointer (me_cmp_func signature)
 * @param stride line size in bytes, must be a multiple of 8
 * @param h      block height in rows
 */
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
    /* NOTE(review): casting a pointer to int truncates on 64-bit, but the
     * low bits survive, so the alignment check still works. */
    assert((((int) pix) & 7) == 0);
    assert((stride & 7) == 0);
/* SUM(in0, in1, out0, out1): abs-diff current row against (in0, in1),
 * leave the current row in (out0, out1), accumulate word sums in mm6. */
#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"
    "pxor %%mm6, %%mm6\n" /* mm6 = running sum */
    "pxor %%mm7, %%mm7\n" /* mm7 = 0 (unpack helper) */
    "movq 8(%0), %%mm1\n"
    SUM(%%mm4, %%mm5, %%mm0, %%mm1)
    SUM(%%mm0, %%mm1, %%mm4, %%mm5)
    /* horizontal sum of the word accumulator */
    "movq %%mm6, %%mm0\n"
    "paddw %%mm6, %%mm0\n"
    "movq %%mm0, %%mm6\n"
    "paddw %%mm6, %%mm0\n"
    : "+r" (pix), "=r" (tmp)
    : "r" (stride), "m" (h)
/**
 * Vertical intra SAD of a 16-pixel-wide block, MMXEXT version.
 *
 * Same contract as vsad_intra16_mmx, but the per-row absolute-difference
 * sum collapses to a single psadbw per 8-byte half.
 *
 * @param pix    source block, must be 8-byte aligned
 * @param dummy  unused (me_cmp_func signature)
 * @param stride line size in bytes, must be a multiple of 8
 * @param h      block height in rows
 */
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               ptrdiff_t stride, int h)
    /* NOTE(review): int cast truncates on 64-bit; low bits survive so the
     * alignment check still works. */
    assert((((int) pix) & 7) == 0);
    assert((stride & 7) == 0);
/* SUM: psadbw current row against previous, keep current in (out0, out1). */
#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq 8(%0), " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"
    "pxor %%mm6, %%mm6\n" /* mm6 = running sum */
    "pxor %%mm7, %%mm7\n"
    "movq 8(%0), %%mm1\n"
    SUM(%%mm4, %%mm5, %%mm0, %%mm1)
    SUM(%%mm0, %%mm1, %%mm4, %%mm5)
    : "+r" (pix), "=r" (tmp)
    : "r" (stride), "m" (h)
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks (MMX):
 * sum over rows of |(p1 - p2)[row] - (p1 - p2)[row - 1]|.
 *
 * The per-row signed difference p1 - p2 is computed with psubb and biased
 * into unsigned range by xoring with mm7 (which is presumably set to 0x80
 * in every byte by the pcmpeqw/packsswb setup plus an elided shift —
 * confirm).  The SUM macro then takes the absolute row-to-row difference
 * as in vsad_intra16_mmx.
 *
 * @param pix1,pix2 source blocks, must be 8-byte aligned
 * @param stride    line size in bytes, must be a multiple of 8
 * @param h         block height in rows
 */
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
    /* NOTE(review): int casts truncate on 64-bit; low bits survive so the
     * alignment checks still work. */
    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((stride & 7) == 0);
/* SUM: biased row difference of the two blocks, abs-diff against the
 * previous biased row difference (in0, in1), accumulate words in mm6. */
#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"
    "pxor %%mm6, %%mm6\n" /* mm6 = running sum */
    "pcmpeqw %%mm7, %%mm7\n" /* start building the bias constant in mm7 */
    "packsswb %%mm7, %%mm7\n"
    /* first row pair: biased difference as loop seed */
    "movq 8(%0), %%mm1\n"
    "movq 8(%1), %%mm3\n"
    "psubb %%mm2, %%mm0\n"
    "psubb %%mm3, %%mm1\n"
    "pxor %%mm7, %%mm0\n"
    "pxor %%mm7, %%mm1\n"
    SUM(%%mm4, %%mm5, %%mm0, %%mm1)
    SUM(%%mm0, %%mm1, %%mm4, %%mm5)
    /* horizontal sum of the word accumulator */
    "movq %%mm6, %%mm0\n"
    "paddw %%mm6, %%mm0\n"
    "movq %%mm0, %%mm6\n"
    "paddw %%mm6, %%mm0\n"
    : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    : "r" (stride), "m" (h)
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks, MMXEXT.
 *
 * Same contract as vsad16_mmx; the absolute row-to-row difference of the
 * biased block differences is done with a single psadbw per 8-byte half.
 *
 * @param pix1,pix2 source blocks, must be 8-byte aligned
 * @param stride    line size in bytes, must be a multiple of 8
 * @param h         block height in rows
 */
static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
    /* NOTE(review): int casts truncate on 64-bit; low bits survive so the
     * alignment checks still work. */
    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((stride & 7) == 0);
/* SUM: biased row difference of the two blocks, psadbw against the
 * previous biased difference (in0, in1), accumulate in mm6. */
#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq (%1), %%mm2\n" \
    "movq 8(%0), " #out1 "\n" \
    "movq 8(%1), %%mm3\n" \
    "psubb %%mm2, " #out0 "\n" \
    "psubb %%mm3, " #out1 "\n" \
    "pxor %%mm7, " #out0 "\n" \
    "pxor %%mm7, " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n "
    "pxor %%mm6, %%mm6\n" /* mm6 = running sum */
    "pcmpeqw %%mm7, %%mm7\n" /* start building the bias constant in mm7 */
    "packsswb %%mm7, %%mm7\n"
    /* first row pair: biased difference as loop seed */
    "movq 8(%0), %%mm1\n"
    "movq 8(%1), %%mm3\n"
    "psubb %%mm2, %%mm0\n"
    "psubb %%mm3, %%mm1\n"
    "pxor %%mm7, %%mm0\n"
    "pxor %%mm7, %%mm1\n"
    SUM(%%mm4, %%mm5, %%mm0, %%mm1)
    SUM(%%mm0, %%mm1, %%mm4, %%mm5)
    : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    : "r" (stride), "m" (h)
/* MMABS(a, z): replace the four packed words in register a with their
 * absolute values, using z as a zero/sign-mask scratch register.
 * MMX version: sign mask via pcmpgtw, then the xor/subtract idiom. */
#define MMABS_MMX(a,z) \
    "pxor " #z ", " #z " \n\t" \
    "pcmpgtw " #a ", " #z " \n\t" \
    "pxor " #z ", " #a " \n\t" \
    "psubw " #z ", " #a " \n\t"

/* MMXEXT version: a = max(a, 0 - a). */
#define MMABS_MMXEXT(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "psubw " #a ", " #z " \n\t" \
    "pmaxsw " #z ", " #a " \n\t"

/* SSSE3 version: single packed-absolute instruction; z unused. */
#define MMABS_SSSE3(a,z) \
    "pabsw " #a ", " #a " \n\t"

/* Take |a| (via MMABS) and accumulate it into sum with unsigned saturation. */
#define MMABS_SUM(a,z, sum) \
    "paddusw " #a ", " #sum " \n\t"
705 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
706 * up to about 100k on extreme inputs. But that's very unlikely to occur in
707 * natural video, and it's even more unlikely to not have any alternative
708 * mvs/modes with lower cost. */
/* HSUM(a, t, dst): horizontally add the packed words of a (saturating) and
 * move the 16-bit result to dst; t is a scratch register.
 * MMX version: two shift-and-add folds. */
#define HSUM_MMX(a, t, dst) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movq " #a ", " #t " \n\t" \
    "psrlq $16, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t" \

/* MMXEXT version: pshufw replaces the shift/copy pairs. */
#define HSUM_MMXEXT(a, t, dst) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshufw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t" \

/* SSE2 version: fold 128-bit register down with movhlps/pshuflw. */
#define HSUM_SSE2(a, t, dst) \
    "movhlps " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t" \
/* DCT_SAD4(m, mm, o): load four coefficient rows of a DCT block starting at
 * byte offset o and accumulate their absolute values into mm0/mm1 using the
 * currently-#defined MMABS via MMABS_SUM.  m is the mov suffix (q / dqa),
 * mm the register prefix (%%mm / %%xmm). */
#define DCT_SAD4(m, mm, o) \
    "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
    "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
    "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
    "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \

/* Sum |coeff| over a full 64-element int16 block with 64-bit MMX loads. */
#define DCT_SAD_MMX \
    "pxor %%mm0, %%mm0 \n\t" \
    "pxor %%mm1, %%mm1 \n\t" \
    DCT_SAD4(q, %%mm, 0) \
    DCT_SAD4(q, %%mm, 8) \
    DCT_SAD4(q, %%mm, 64) \
    DCT_SAD4(q, %%mm, 72) \
    "paddusw %%mm1, %%mm0 \n\t" \
    HSUM(%%mm0, %%mm1, %0)

/* Same, with 128-bit SSE2 loads (two DCT_SAD4 passes cover the block). */
#define DCT_SAD_SSE2 \
    "pxor %%xmm0, %%xmm0 \n\t" \
    "pxor %%xmm1, %%xmm1 \n\t" \
    DCT_SAD4(dqa, %%xmm, 0) \
    DCT_SAD4(dqa, %%xmm, 64) \
    "paddusw %%xmm1, %%xmm0 \n\t" \
    HSUM(%%xmm0, %%xmm1, %0)
/* Generate sum_abs_dctelem_<cpu>: sum of |coefficient| over a DCT block.
 * The result is masked to 16 bits because the HSUM_* reductions saturate
 * there (see the FIXME above about extreme inputs).
 * NOTE(review): the #undef/#define pairs below select the MMABS/HSUM/DCT_SAD
 * variants per CPU before each instantiation; the interleaved #undef and
 * DCT_SAD_FUNC(...) lines are not visible in this view. */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_ ## cpu(int16_t *block) \
    return sum & 0xFFFF; \

#define DCT_SAD DCT_SAD_MMX
#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
#define MMABS(a, z) MMABS_MMX(a, z)
#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
#define MMABS(a, z) MMABS_MMXEXT(a, z)
#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
#if HAVE_SSSE3_INLINE
#define MMABS(a, z) MMABS_SSSE3(a, z)
/* Rounding bias added before the >>1 / >>2 shifts in the half-pel averaging
 * SAD routines: 0, 1 or 2 in every 16-bit lane. */
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,

/* 0x01 in every byte; subtracted before pavgb to correct its rounding. */
DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
/**
 * Accumulate the SAD of an 8-pixel-wide block into mm6 (MMX).
 *
 * Caller contract: mm7 must be 0 and mm6 the running sum (both set up by
 * the PIX_SAD wrappers); the result is read back via sum_mmx().
 * Two rows per iteration; iteration is driven by a negative byte offset
 * in REG_a counting up to zero (blk pointers are pre-biased by -len).
 *
 * @param blk1,blk2 source blocks
 * @param stride    line size in bytes
 * @param h         block height in rows
 */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
    x86_reg len = -(stride * h);
    "movq (%1, %%"REG_a"), %%mm0 \n\t"
    "movq (%2, %%"REG_a"), %%mm2 \n\t"
    "movq (%2, %%"REG_a"), %%mm4 \n\t"
    "add %3, %%"REG_a" \n\t"
    /* abs diff of row 0 via two saturating subtractions */
    "psubusb %%mm0, %%mm2 \n\t"
    "psubusb %%mm4, %%mm0 \n\t"
    "movq (%1, %%"REG_a"), %%mm1 \n\t"
    "movq (%2, %%"REG_a"), %%mm3 \n\t"
    "movq (%2, %%"REG_a"), %%mm5 \n\t"
    /* abs diff of row 1 */
    "psubusb %%mm1, %%mm3 \n\t"
    "psubusb %%mm5, %%mm1 \n\t"
    "por %%mm2, %%mm0 \n\t"
    "por %%mm1, %%mm3 \n\t"
    /* widen to words and accumulate */
    "movq %%mm0, %%mm1 \n\t"
    "movq %%mm3, %%mm2 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpckhbw %%mm7, %%mm1 \n\t"
    "punpcklbw %%mm7, %%mm3 \n\t"
    "punpckhbw %%mm7, %%mm2 \n\t"
    "paddw %%mm1, %%mm0 \n\t"
    "paddw %%mm3, %%mm2 \n\t"
    "paddw %%mm2, %%mm0 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "add %3, %%"REG_a" \n\t"
    : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
/**
 * Accumulate the SAD of an 8-pixel-wide block into mm6 (MMXEXT).
 *
 * Caller contract as in sad8_1_mmx; psadbw does the whole per-row
 * absolute-difference sum, two rows per iteration.
 */
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
    "movq (%1), %%mm0 \n\t"
    "movq (%1, %3), %%mm1 \n\t"
    "psadbw (%2), %%mm0 \n\t"
    "psadbw (%2, %3), %%mm1 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "paddw %%mm1, %%mm6 \n\t"
    "lea (%1,%3,2), %1 \n\t" /* advance both blocks by two rows */
    "lea (%2,%3,2), %2 \n\t"
    : "+r" (h), "+r" (blk1), "+r" (blk2)
/**
 * SAD of a 16-pixel-wide block (SSE2).
 *
 * Self-contained: accumulates in xmm2 (unlike the MMX helpers, no implicit
 * register contract), using unaligned 16-byte loads and psadbw; two rows
 * per iteration, result folded with movhlps.  Note the me_cmp_func
 * argument order here: blk2 is the first pixel argument.
 */
static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      ptrdiff_t stride, int h)
    "pxor %%xmm2, %%xmm2 \n\t" /* running sum */
    "movdqu (%1), %%xmm0 \n\t"
    "movdqu (%1, %4), %%xmm1 \n\t"
    "psadbw (%2), %%xmm0 \n\t"
    "psadbw (%2, %4), %%xmm1 \n\t"
    "paddw %%xmm0, %%xmm2 \n\t"
    "paddw %%xmm1, %%xmm2 \n\t"
    "lea (%1,%4,2), %1 \n\t" /* advance both blocks by two rows */
    "lea (%2,%4,2), %2 \n\t"
    /* fold the two 64-bit partial sums */
    "movhlps %%xmm2, %%xmm0 \n\t"
    "paddw %%xmm0, %%xmm2 \n\t"
    "movd %%xmm2, %3 \n\t"
    : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
/**
 * Accumulate into mm6 the SAD of blk2 against blk1 half-pel interpolated
 * horizontally (average of each pixel and its right neighbour, pavgb
 * rounding), MMXEXT.  Caller contract as in sad8_1_mmxext.
 */
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   ptrdiff_t stride, int h)
    "movq (%1), %%mm0 \n\t"
    "movq (%1, %3), %%mm1 \n\t"
    "pavgb 1(%1), %%mm0 \n\t" /* average with right neighbour */
    "pavgb 1(%1, %3), %%mm1 \n\t"
    "psadbw (%2), %%mm0 \n\t"
    "psadbw (%2, %3), %%mm1 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "paddw %%mm1, %%mm6 \n\t"
    "lea (%1,%3,2), %1 \n\t"
    "lea (%2,%3,2), %2 \n\t"
    : "+r" (h), "+r" (blk1), "+r" (blk2)
/**
 * Accumulate into mm6 the SAD of blk2 against blk1 half-pel interpolated
 * vertically (average of each pixel and the one below), MMXEXT.
 * mm0 carries the previous row across iterations; caller contract as in
 * sad8_1_mmxext.
 */
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   ptrdiff_t stride, int h)
    "movq (%1), %%mm0 \n\t" /* seed: first row */
    "movq (%1), %%mm1 \n\t"
    "movq (%1, %3), %%mm2 \n\t"
    "pavgb %%mm1, %%mm0 \n\t" /* average with row below */
    "pavgb %%mm2, %%mm1 \n\t"
    "psadbw (%2), %%mm0 \n\t"
    "psadbw (%2, %3), %%mm1 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "paddw %%mm1, %%mm6 \n\t"
    "movq %%mm2, %%mm0 \n\t" /* carry last row to next iteration */
    "lea (%1,%3,2), %1 \n\t"
    "lea (%2,%3,2), %2 \n\t"
    : "+r" (h), "+r" (blk1), "+r" (blk2)
/**
 * Accumulate into mm6 the SAD of blk2 against blk1 half-pel interpolated
 * both horizontally and vertically (4-point average), MMXEXT.
 *
 * Built from nested pavgb averages; the psubusb of 'bone' (0x01 bytes)
 * compensates pavgb's round-up bias so the chained averages match the
 * exact ((a+b+c+d+2)>>2) reference rounding.  Caller contract as in
 * sad8_1_mmxext.
 */
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
    "movq "MANGLE(bone)", %%mm5 \n\t"
    "movq (%1), %%mm0 \n\t"
    "pavgb 1(%1), %%mm0 \n\t" /* seed: horizontal average of first row */
    "movq (%1), %%mm1 \n\t"
    "movq (%1,%3), %%mm2 \n\t"
    "pavgb 1(%1), %%mm1 \n\t"
    "pavgb 1(%1,%3), %%mm2 \n\t"
    "psubusb %%mm5, %%mm1 \n\t" /* rounding correction */
    "pavgb %%mm1, %%mm0 \n\t" /* vertical average of the averages */
    "pavgb %%mm2, %%mm1 \n\t"
    "psadbw (%2), %%mm0 \n\t"
    "psadbw (%2,%3), %%mm1 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "paddw %%mm1, %%mm6 \n\t"
    "movq %%mm2, %%mm0 \n\t" /* carry last averaged row forward */
    "lea (%1,%3,2), %1 \n\t"
    "lea (%2,%3,2), %2 \n\t"
    : "+r" (h), "+r" (blk1), "+r" (blk2)
/**
 * Accumulate into mm6 the SAD of blk2 against the rounded average of
 * blk1a and blk1b (MMX) — the generic 2-point half-pel helper.
 *
 * Caller contract: mm7 = 0, mm6 = running sum, mm5 = rounding constant
 * (round_tab[1], loaded by the PIX_SAD wrappers).  The average is computed
 * in 16-bit precision: (a + b + round) >> 1, then packed back to bytes for
 * the saturating-subtract absolute difference.
 *
 * @param blk1a,blk1b the two source blocks to average
 * @param blk2        reference block
 */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
    x86_reg len = -(stride * h);
    "movq (%1, %%"REG_a"), %%mm0 \n\t"
    "movq (%2, %%"REG_a"), %%mm1 \n\t"
    "movq (%1, %%"REG_a"), %%mm2 \n\t"
    "movq (%2, %%"REG_a"), %%mm3 \n\t"
    /* widen and add the two source rows */
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpcklbw %%mm7, %%mm1 \n\t"
    "punpckhbw %%mm7, %%mm2 \n\t"
    "punpckhbw %%mm7, %%mm3 \n\t"
    "paddw %%mm0, %%mm1 \n\t"
    "paddw %%mm2, %%mm3 \n\t"
    "movq (%3, %%"REG_a"), %%mm4 \n\t"
    "movq (%3, %%"REG_a"), %%mm2 \n\t"
    /* rounded average: (sum + round) >> 1, repacked to bytes */
    "paddw %%mm5, %%mm1 \n\t"
    "paddw %%mm5, %%mm3 \n\t"
    "psrlw $1, %%mm1 \n\t"
    "psrlw $1, %%mm3 \n\t"
    "packuswb %%mm3, %%mm1 \n\t"
    /* absolute difference against blk2 */
    "psubusb %%mm1, %%mm4 \n\t"
    "psubusb %%mm2, %%mm1 \n\t"
    "por %%mm4, %%mm1 \n\t"
    "movq %%mm1, %%mm0 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpckhbw %%mm7, %%mm1 \n\t"
    "paddw %%mm1, %%mm0 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "add %4, %%"REG_a" \n\t"
    : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
/**
 * Accumulate into mm6 the SAD of blk2 against blk1 half-pel interpolated
 * both ways (exact 4-point average, MMX).
 *
 * Caller contract: mm7 = 0, mm6 = running sum.  Each horizontal pair sum
 * (current and next row) is kept in 16-bit precision; the 4-point average
 * is (rowsum + next_rowsum + 2) >> 2 with round_tab[2] loaded inline.
 * mm0/mm1 carry the previous row's pair sums across iterations.
 */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
    x86_reg len = -(stride * h);
    /* seed: horizontal pair sum of the first row, widened to words */
    "movq (%1, %%"REG_a"), %%mm0 \n\t"
    "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpckhbw %%mm7, %%mm1 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "punpckhbw %%mm7, %%mm3 \n\t"
    "paddw %%mm2, %%mm0 \n\t"
    "paddw %%mm3, %%mm1 \n\t"
    /* loop body: pair sum of the next row */
    "movq (%2, %%"REG_a"), %%mm2 \n\t"
    "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "movq %%mm4, %%mm5 \n\t"
    "punpcklbw %%mm7, %%mm2 \n\t"
    "punpckhbw %%mm7, %%mm3 \n\t"
    "punpcklbw %%mm7, %%mm4 \n\t"
    "punpckhbw %%mm7, %%mm5 \n\t"
    "paddw %%mm4, %%mm2 \n\t"
    "paddw %%mm5, %%mm3 \n\t"
    "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" /* round_tab[2] = 2 per word */
    /* 4-point average: (prev + cur + 2) >> 2, repacked to bytes */
    "paddw %%mm2, %%mm0 \n\t"
    "paddw %%mm3, %%mm1 \n\t"
    "paddw %%mm5, %%mm0 \n\t"
    "paddw %%mm5, %%mm1 \n\t"
    "movq (%3, %%"REG_a"), %%mm4 \n\t"
    "movq (%3, %%"REG_a"), %%mm5 \n\t"
    "psrlw $2, %%mm0 \n\t"
    "psrlw $2, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    /* absolute difference against blk2, accumulate in mm6 */
    "psubusb %%mm0, %%mm4 \n\t"
    "psubusb %%mm5, %%mm0 \n\t"
    "por %%mm4, %%mm0 \n\t"
    "movq %%mm0, %%mm4 \n\t"
    "punpcklbw %%mm7, %%mm0 \n\t"
    "punpckhbw %%mm7, %%mm4 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "paddw %%mm4, %%mm6 \n\t"
    /* current row's pair sums become next iteration's previous row */
    "movq %%mm2, %%mm0 \n\t"
    "movq %%mm3, %%mm1 \n\t"
    "add %4, %%"REG_a" \n\t"
    : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
/**
 * Read back the word-accumulated SAD from mm6 (MMX helper contract).
 * Folds the four 16-bit lanes with shifts and adds; the upper lanes leak
 * garbage into the high bits of the movd result, hence the 0xFFFF mask.
 */
static inline int sum_mmx(void)
    "movq %%mm6, %%mm0 \n\t"
    "psrlq $32, %%mm6 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "movq %%mm6, %%mm0 \n\t"
    "psrlq $16, %%mm6 \n\t"
    "paddw %%mm0, %%mm6 \n\t"
    "movd %%mm6, %0 \n\t"
    return ret & 0xFFFF;
/**
 * Read back the SAD from mm6 (MMXEXT helper contract): psadbw already
 * produced a scalar sum in the low lane, so a single movd suffices.
 */
static inline int sum_mmxext(void)
    "movd %%mm6, %0 \n\t"
/* Horizontal half-pel SAD accumulate (MMX): average blk1 with its right
 * neighbour via the generic 2-point helper.  Same mm5/mm6/mm7 caller
 * contract as sad8_2_mmx. */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
/* Vertical half-pel SAD accumulate (MMX): average blk1 with the row below
 * via the generic 2-point helper.  Same caller contract as sad8_2_mmx. */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
/* PIX_SAD(suf): instantiate the public me_cmp_func entry points for one
 * instruction-set suffix (mmx / mmxext).  Each wrapper zeroes mm7 (unpack
 * helper) and mm6 (accumulator), loads mm5 with round_tab[1] where the
 * half-pel helpers need it, runs the sad8_* worker(s) and reads the result
 * back with sum_<suf>().  The 16-wide variants simply run the 8-wide worker
 * on both halves.  Note the me_cmp_func argument order: blk2 precedes blk1. */
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
    return sum_ ## suf(); \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
    return sum_ ## suf(); \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
    return sum_ ## suf(); \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
    return sum_ ## suf(); \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
    sad8_1_ ## suf(blk1, blk2, stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    return sum_ ## suf(); \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    return sum_ ## suf(); \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    return sum_ ## suf(); \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
    sad8_4_ ## suf(blk1, blk2, stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    return sum_ ## suf(); \
1216 #endif /* HAVE_INLINE_ASM */
/* Prototypes for the external (yasm/nasm) implementations. */
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);

/* Declare the 8x8 and 16x16 Hadamard-transformed difference functions for
 * one instruction-set suffix. */
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                              uint8_t *src2, ptrdiff_t stride, int h); \
int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmxext)
hadamard_func(ssse3)
/**
 * Install the x86-optimized me_cmp functions according to the detected CPU
 * capabilities.  Later (stronger) instruction sets overwrite the pointers
 * set by earlier blocks.  The rounding-biased half-pel SAD variants and
 * vsad16 are only installed when bit-exact coding is not requested, since
 * their rounding differs from the C reference.
 */
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_mmx;

        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->sse[0] = sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4] = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;

        /* vsad16 rounding is not bit-exact with the C reference */
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->vsad[0] = vsad16_mmx;

    if (INLINE_MMXEXT(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;

        c->vsad[4] = vsad_intra16_mmxext;

        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        /* half-pel variants use pavgb rounding, not bit-exact */
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->pix_abs[0][1] = sad16_x2_mmxext;
        c->pix_abs[0][2] = sad16_y2_mmxext;
        c->pix_abs[0][3] = sad16_xy2_mmxext;
        c->pix_abs[1][1] = sad8_x2_mmxext;
        c->pix_abs[1][2] = sad8_y2_mmxext;
        c->pix_abs[1][3] = sad8_xy2_mmxext;

        c->vsad[0] = vsad16_mmxext;

    if (INLINE_SSE2(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_sse2;

    /* NOTE(review): sad16_sse2 is skipped on CPUs with the 3DNow flag —
     * presumably AMD parts where the movdqu-based SAD is slower; confirm. */
    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
        c->sad[0] = sad16_sse2;

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;

    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;