/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
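
/* Prototypes for the external implementations, assembled from the x86 SIMD
 * sources (in current FFmpeg these live in libavcodec/x86/me_cmp.asm). */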
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

#define hadamard_func(cpu) \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                  uint8_t *src2, ptrdiff_t stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
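/* Noise-preserving sum of squared errors: the plain SSE score is biased by
 * the weighted difference in high-frequency noise between the two blocks,
 * roughly
 *
 *     score = sse(pix1, pix2)
 *           + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 *
 * so a candidate that smooths texture away scores worse than raw SSE alone
 * would suggest. The weight is avctx->nsse_weight, or 8 when no context is
 * available. */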
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1 + 8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2 + 8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
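/* Vertical SAD of a single 16-pixel-wide block: sums |pix[i] - pix[i + stride]|
 * over the block, i.e. the vertical activity of the source itself. This is
 * the intra comparison (c->vsad[4]). */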
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((uintptr_t) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);
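
    /* MMX has no unsigned byte absolute-difference instruction, so |a - b|
     * is built from two saturating subtractions: (a -us b) | (b -us a).
     * The result is zero-extended to words against the cleared %mm7 and
     * accumulated in %mm6. */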
#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2, %0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
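
/* Same vertical SAD, but of the difference pix1 - pix2. Byte differences are
 * taken with wrapping psubb and re-biased by XORing with 0x80 (built into
 * %mm7 below), mapping the signed range onto unsigned bytes so the same
 * psubusb-based absolute-difference trick applies. */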
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((uintptr_t) pix1) & 7) == 0);
    av_assert2((((uintptr_t) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);
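
    /* Two rows are processed per loop iteration: in0/in1 hold the previous
     * row's re-biased difference while out0/out1 receive the current one,
     * so the previous row never has to be recomputed. */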
#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM
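
/* Rounding constants for the half-pel averaging below, one per 16-bit lane:
 * round_tab[1] yields avg2(a, b) = (a + b + 1) >> 1 and round_tab[2] yields
 * avg4(a, b, c, d) = (a + b + c + d + 2) >> 2. */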
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
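
/* Plain 8-pixel-wide SAD over h rows (two rows per iteration). The counter
 * in REG_a runs from -stride * h up to 0 and the block pointers are
 * pre-biased by -len, so a single add + js pair both steps to the next row
 * and tests for loop exit. The caller must zero %mm6 (the accumulator) and
 * %mm7 (zero for byte unpacking). */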
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
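
/* 8-pixel-wide SAD against the rounded average of two references (blk1a and
 * blk1b), one row per iteration; this implements the x2 (blk1b = blk1a + 1)
 * and y2 (blk1b = blk1a + stride) half-pel cases. The caller preloads
 * round_tab[1] into %mm5 and zeroes %mm6/%mm7. */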
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}
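
/* 8-pixel-wide SAD against the exact four-pixel average
 * (a + b + c + d + 2) >> 2 for the xy2 half-pel case. The horizontal pair
 * sums of the current row stay in %mm0/%mm1 across iterations, so each row
 * is only unpacked and summed once; round_tab[2] is reloaded into %mm5
 * inside the loop. */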
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "movq %%mm3, %%mm1              \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}
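
/* Fold the four 16-bit partial sums in %mm6 into one scalar: add the high
 * 32 bits onto the low, then the high 16, and mask the result to 16 bits. */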
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
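
/* Instantiate the externally used SAD entry points for a given suffix:
 * 8x8 and 16x16, each in plain, x2, y2 and xy2 variants. Each one zeroes
 * (and, for x2/y2, preloads round_tab[1] into) the shared MMX state, runs
 * the 8-wide helper (twice, side by side, for the 16-wide versions) and
 * reads the accumulator back with sum_*(). */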
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        "movq %0, %%mm5        \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        "movq %0, %%mm5        \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        :); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1,     blk2,     stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        "movq %0, %%mm5        \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        "movq %0, %%mm5        \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7     \n\t" \
        "pxor %%mm6, %%mm6     \n\t" \
        :); \
 \
    sad8_4_ ## suf(blk1,     blk2,     stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
}

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */

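/* Runtime dispatch: each successive ISA block overwrites the pointers set by
 * the previous one, so the best available implementation wins. Approximate
 * variants are only installed when AV_CODEC_FLAG_BITEXACT is not set. */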
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]            = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0]        = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4]       = ff_vsad_intra16_sse2;
            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0]       = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}