;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorski@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

h4x4_pred_shuf:  db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15
h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
h8x8_pred_shuf:  times 8 db 1
;=============================================================================
; SAD MMX
;=============================================================================
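
; For reference, the operation every function in this section implements is
; plain SAD. A hedged C sketch (x264's actual C fallback lives elsewhere;
; strides are in bytes):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static int sad( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
;                   int w, int h )
;   {
;       int sum = 0;
;       for( int y = 0; y < h; y++, pix1 += i_pix1, pix2 += i_pix2 )
;           for( int x = 0; x < w; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }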
%macro SAD_INC_2x16P 0
    punpckldq mm1, [r0+r1]
    punpckldq mm2, [r2+r3]

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_%1x%2_mmx2, 4,4
;=============================================================================
; SAD XMM
;=============================================================================
%macro SAD_END_SSE2 0

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8

;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4
INIT_XMM sse2, aligned

%macro SAD_INC_4x8P_SSE 1

;Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4
;-----------------------------------------------------------------------------
; int pixel_vsad( pixel *src, int stride, int height )
;-----------------------------------------------------------------------------
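
; vsad is the SAD of each row against the row above it, a cheap measure of
; vertical activity used for interlace decisions. A hedged C sketch,
; assuming the 16-pixel width the asm below operates on:
;
;   int vsad = 0;
;   for( int y = 1; y < height; y++ )
;       for( int x = 0; x < 16; x++ )
;           vsad += abs( src[y*stride+x] - src[(y-1)*stride+x] );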
cglobal pixel_vsad_mmx2, 3,3

cglobal pixel_vsad_sse2, 3,3
;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
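
; All three 4x4 intra prediction costs are computed in one pass over fenc:
; res[0] = V (top row replicated down), res[1] = H (left column replicated
; across), res[2] = DC. A hedged sketch of the DC term, V and H being
; analogous (top[]/left[] stand for the decoded neighbor pixels):
;
;   int dc = ( top[0]+top[1]+top[2]+top[3]
;            + left[0]+left[1]+left[2]+left[3] + 4 ) >> 3;
;   int sad_dc = 0;
;   for( int i = 0; i < 16; i++ )
;       sad_dc += abs( fenc[(i>>2)*FENC_STRIDE + (i&3)] - dc );
;   res[2] = sad_dc;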
cglobal intra_sad_x3_4x4_mmx2, 3,3
    movd      mm0, [r1-FDEC_STRIDE]
    movd      mm1, [r0+FENC_STRIDE*0]
    movd      mm2, [r0+FENC_STRIDE*2]
    punpckldq mm1, [r0+FENC_STRIDE*1]
    punpckldq mm2, [r0+FENC_STRIDE*3]
    movd      [r2], mm0 ;V prediction cost
    movd      mm3, [r1+FDEC_STRIDE*0-4]
    movd      mm0, [r1+FDEC_STRIDE*1-4]
    movd      mm4, [r1+FDEC_STRIDE*2-4]
    movd      mm5, [r1+FDEC_STRIDE*3-4]
    pshufw    mm5, mm5, 0 ;DC prediction
    movd      [r2+8], mm5 ;DC prediction cost
    movd      [r2+4], mm1 ;H prediction cost
%macro INTRA_SADx3_4x4 0
cglobal intra_sad_x3_4x4, 3,3
    movd      xmm4, [r1+FDEC_STRIDE*0-4]
    pinsrd    xmm4, [r1+FDEC_STRIDE*1-4], 1
    pinsrd    xmm4, [r1+FDEC_STRIDE*2-4], 2
    pinsrd    xmm4, [r1+FDEC_STRIDE*3-4], 3
    movd      xmm2, [r1-FDEC_STRIDE]
    pshufb    xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
    pshufb    xmm4, [h4x4_pred_shuf2]      ; EFGH
    pshufd    xmm0, xmm2, 0                ; ABCDABCDABCDABCD
    punpckldq xmm2, xmm4                   ; ABCDEFGH
    movd      xmm1, [r0+FENC_STRIDE*0]
    pinsrd    xmm1, [r0+FENC_STRIDE*1], 1
    pinsrd    xmm1, [r0+FENC_STRIDE*2], 2
    pinsrd    xmm1, [r0+FENC_STRIDE*3], 3
    pshufb    xmm2, xmm3 ; DC prediction
    punpckhqdq xmm3, xmm0, xmm5
    punpcklqdq xmm0, xmm5
    movq      [r2], xmm0   ; V/H prediction costs
    movd      [r2+8], xmm2 ; DC prediction cost
%endmacro ; INTRA_SADx3_4x4
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3] );
;-----------------------------------------------------------------------------
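
; edge[] layout as consumed below: the 8 left neighbors start at edge+7
; and the 8 top neighbors at edge+16 within the caller-prepared array.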
%macro INTRA_SAD_HVDC_ITER 2
    movq      m5, [r0+FENC_STRIDE*%1]

cglobal intra_sad_x3_8x8_mmx2, 3,3
    movq      m6, [r1+16] ;V prediction
    pshufw    m0, m0, q0000 ;DC prediction
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
%macro INTRA_SADx3_8x8 0
cglobal intra_sad_x3_8x8, 3,4,9
    lea       r11, [h8x8_pred_shuf]
%define shuf h8x8_pred_shuf
    movq      m0, [r1+7]  ; left pixels
    movq      m1, [r1+16] ; top pixels
    pxor      m3, m3 ; V score accumulator
    punpcklqdq m1, m1 ; V prediction
    pshufb    m2, m3 ; DC prediction
    pxor      m4, m4 ; H score accumulator
    pxor      m5, m5 ; DC score accumulator
    movq      m6, [r0+FENC_STRIDE*0]
    movhps    m6, [r0+FENC_STRIDE*1]
    pshufb    m7, m0, [shuf+r3*8] ; H prediction
    add       r0, FENC_STRIDE*2
%endmacro ; INTRA_SADx3_8x8
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
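
; The chroma DC prediction is computed per 4x4 quadrant from the 4-pixel
; neighbor sums gathered below (s0/s1 from the top row, s2/s3 from the
; left column): dc0 from s0+s2, dc1 from s1, dc2 from s3, dc3 from s1+s3,
; as the q1310/q3312 shuffles and the pavgw arrange.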
%macro INTRA_SAD_HV_ITER 1
    movd      m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
    movd      m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
    movq      m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
    movq      m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
    movq      m4, [r0 + FENC_STRIDE*(%1+0)]
    movq      m5, [r0 + FENC_STRIDE*(%1+1)]

%macro INTRA_SAD_8x8C 0
cglobal intra_sad_x3_8x8c, 3,3
    movq      m6, [r1 - FDEC_STRIDE]
    add       r1, FDEC_STRIDE*4
    movq      m2, [r1 + FDEC_STRIDE*-4 - 8]
    movq      m4, [r1 + FDEC_STRIDE*-2 - 8]
    movq      m3, [r1 + FDEC_STRIDE* 0 - 8]
    movq      m5, [r1 + FDEC_STRIDE* 2 - 8]
    punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
    punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
    punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
    punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
    punpckldq m0, m2 ;s0 s1 s2 s3
    pshufw    m3, m0, q3312 ;s2,s1,s3,s3
    pshufw    m0, m0, q1310 ;s0,s1,s3,s1
    pavgw     m0, m7 ; s0+s2, s1, s3, s1+s3
    pshufb    xmm0, [pb_shuf8x8c]
    movq      xmm1, [r0+FENC_STRIDE*0]
    movq      xmm2, [r0+FENC_STRIDE*1]
    movq      xmm3, [r0+FENC_STRIDE*2]
    movq      xmm4, [r0+FENC_STRIDE*3]
    movhps    xmm1, [r0+FENC_STRIDE*4]
    movhps    xmm2, [r0+FENC_STRIDE*5]
    movhps    xmm3, [r0+FENC_STRIDE*6]
    movhps    xmm4, [r0+FENC_STRIDE*7]
    punpcklbw m0, m0 ; 4x dc0 4x dc1
    punpckhbw m1, m1 ; 4x dc2 4x dc3
    movq      m2, [r0+FENC_STRIDE*0]
    movq      m3, [r0+FENC_STRIDE*1]
    movq      m4, [r0+FENC_STRIDE*2]
    movq      m5, [r0+FENC_STRIDE*3]
    movq      m6, [r0+FENC_STRIDE*4]
    movq      m7, [r0+FENC_STRIDE*5]
    movq      m0, [r0+FENC_STRIDE*6]
    psadbw    m1, [r0+FENC_STRIDE*7]
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

;xmm7: DC prediction    xmm6: H prediction  xmm5: V prediction
;xmm4: DC pred score    xmm3: H pred score  xmm2: V pred score
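
; The DC value needs the sum of all 16 top and all 16 left neighbors: the
; two psadbw against [r1-FDEC_STRIDE] below sum the top row, and the movzx
; loop accumulates the left column one pixel at a time.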
cglobal intra_sad_x3_16x16, 3,5,8
    psadbw    mm0, [r1-FDEC_STRIDE+0]
    psadbw    mm1, [r1-FDEC_STRIDE+8]
    movzx     r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
%if (x&3)==3 && x!=15
    add       r1, FDEC_STRIDE*4
    sub       r1, FDEC_STRIDE*12
    mova      m5, [r1-FDEC_STRIDE]
    mova      m1, [r1-FDEC_STRIDE+8]
    mov       r3d, 15*FENC_STRIDE
    SPLATB_LOAD m6, r1+r3*2-1, m1
    add       r3d, -FENC_STRIDE
;=============================================================================
; SAD x3/x4 MMX
;=============================================================================
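
; sad_x3 computes scores[i] = SAD( fenc, FENC_STRIDE, pixi, i_stride ) for
; three candidate references at once, and sad_x4 for four. Loading each
; fenc row once and reusing it against every candidate is what makes these
; cheaper than separate pixel_sad calls.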
%macro SAD_X3_START_1x8P 0

%macro SAD_X3_START_2x4P 3
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]

%macro SAD_X3_2x16P 1
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add       r0, 2*FENC_STRIDE
    SAD_X3_1x8P FENC_STRIDE, r4
    add       r0, 2*FENC_STRIDE
    SAD_X3_START_2x4P mm0, mm1, mm2
    SAD_X3_START_2x4P mm4, mm5, mm6
    add       r0, 2*FENC_STRIDE

%macro SAD_X4_START_1x8P 0

%macro SAD_X4_START_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]

%macro SAD_X4_INC_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]
    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]

%macro SAD_X4_2x16P 1
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add       r0, 2*FENC_STRIDE
    SAD_X4_1x8P FENC_STRIDE, r5
    add       r0, 2*FENC_STRIDE
    add       r0, 2*FENC_STRIDE
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
    movsxd    r %+ i, r %+ i %+ d
;=============================================================================
; SAD x3/x4 XMM
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0
%if cpuflag(misalign)

%macro SAD_X3_1x16P_SSE2 2
%if cpuflag(misalign)
    psadbw    xmm3, [r3+%2]

%macro SAD_X3_2x16P_SSE2 1
    SAD_X3_START_1x16P_SSE2
    SAD_X3_1x16P_SSE2 0, 0
    SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
    add       r0, 2*FENC_STRIDE

%macro SAD_X3_START_2x8P_SSE2 0
    movhps    xmm7, [r0+FENC_STRIDE]
    movhps    xmm0, [r1+r4]
    movhps    xmm1, [r2+r4]
    movhps    xmm2, [r3+r4]

%macro SAD_X3_2x8P_SSE2 0
    movhps    xmm7, [r0+FENC_STRIDE]
    movhps    xmm3, [r1+r4]
    movhps    xmm4, [r2+r4]
    movhps    xmm5, [r3+r4]

%macro SAD_X4_START_2x8P_SSE2 0
    movhps    xmm7, [r0+FENC_STRIDE]
    movhps    xmm0, [r1+r5]
    movhps    xmm1, [r2+r5]
    movhps    xmm2, [r3+r5]
    movhps    xmm3, [r4+r5]

%macro SAD_X4_2x8P_SSE2 0
    movhps    xmm7, [r0+FENC_STRIDE]
    movhps    xmm4, [r1+r5]
    movhps    xmm5, [r2+r5]
    movhps    xmm6, [r3+r5]
    movhps    xmm8, [r4+r5]
    movhps    xmm7, [r0+FENC_STRIDE]
    movhps    xmm4, [r1+r5]
    movhps    xmm5, [r2+r5]
    movhps    xmm6, [r3+r5]
    movhps    xmm4, [r4+r5]

%macro SAD_X4_START_1x16P_SSE2 0
%if cpuflag(misalign)

%macro SAD_X4_1x16P_SSE2 2
%if cpuflag(misalign)
    psadbw    xmm7, [r4+%2]

%macro SAD_X4_2x16P_SSE2 1
    SAD_X4_START_1x16P_SSE2
    SAD_X4_1x16P_SSE2 0, 0
    SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
    add       r0, 2*FENC_STRIDE

%macro SAD_X3_2x8P_SSE2 1
    SAD_X3_START_2x8P_SSE2
    add       r0, 2*FENC_STRIDE

%macro SAD_X4_2x8P_SSE2 1
    SAD_X4_START_2x8P_SSE2
    add       r0, 2*FENC_STRIDE

%macro SAD_X3_END_SSE2 0

%macro SAD_X4_END_SSE2 0
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
    movsxd    r %+ i, r %+ i %+ d
    SAD_X%1_2x%2P_SSE2 1
    SAD_X%1_2x%2P_SSE2 0

SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 4, 16, 16

INIT_XMM sse2, misalign
SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 4, 16, 16

SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 4, 16, 16
;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus
; prior to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers, as on architectures that have only aligned
; loads, except complicated by the fact that PALIGNR takes only an
; immediate, not a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method often makes it slower.
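
; The SSSE3 variant below expresses that technique directly: each of the
; per-alignment loop copies does, in C terms (a hedged intrinsics sketch;
; rA, fenc_row and sum are illustrative names, and n, the misalignment, is
; baked into each copy as PALIGNR's immediate):
;
;   __m128i lo  = _mm_load_si128( (__m128i*)rA );      // aligned halves
;   __m128i hi  = _mm_load_si128( (__m128i*)(rA+16) );
;   __m128i ref = _mm_alignr_epi8( hi, lo, n );        // realign in-register
;   sum = _mm_add_epi64( sum, _mm_sad_epu8( ref, fenc_row ) );
;
; A computed jump then selects the loop copy whose immediate matches the
; actual misalignment; the SSE2 variant emulates the byte-wise realignment
; with shifts and ors instead.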
; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
sad_w16_align%1_sse2:
    movdqa    xmm1, [r2+16]
    movdqa    xmm2, [r2+r3+16]
    movdqa    xmm4, [r2+r3]
    psadbw    xmm2, [r0+r1]
    jg        sad_w16_align%1_sse2
; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
sad_w16_align%1_ssse3:
    movdqa    xmm1, [r2+16]
    movdqa    xmm2, [r2+r3+16]
    palignr   xmm1, [r2], %1
    palignr   xmm2, [r2+r3], %1
    psadbw    xmm2, [r0+r1]
    jg        sad_w16_align%1_ssse3
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    jle       pixel_sad_16x%2_sse2
    shl       r4d, 6 ; code size = 64
    shl       r4d, 4 ; code size = 80
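; sad_w16_addr extrapolates the address of a would-be align0 loop (true
; alignment 0 never gets here; it takes the early branch to the plain sad):
; the copies are evenly spaced, so align1 minus the distance from align1 to
; align2 lands one loop-size before align1.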
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
    lea       r5, [sad_w16_addr]
    lea       r5, [sad_w16_addr + r4]
%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
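    ; eax = the candidate's offset within its cacheline, with bit 3 masked
    ; off: per the note above, an 8-byte-aligned position exactly halfway
    ; between two cachelines is split-free because each 8-byte load stays
    ; within one line. If the masked offset leaves room for a full-width
    ; row, take the jle below to the ordinary (non-split) sad.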
    and       eax, 0x17|%1|(%4>>1)
    cmp       eax, 0x10|%1|(%4>>1)
    jle       pixel_sad_%1x%2_mmx2
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    and       eax, 0x17|%2|(%3>>1)
    cmp       eax, 0x10|%2|(%3>>1)
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp       pixel_sad_x3_%1x%2_%4
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp       pixel_sad_x4_%1x%2_%4
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5
    call      pixel_sad_%1x%2_cache%3_%5

%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1
; instantiate the aligned sads

SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64

SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2
SAD16_CACHELINE_FUNC sse2,  8
SAD16_CACHELINE_FUNC sse2, 16
SAD16_CACHELINE_LOOP_SSE2 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2,  sse2,  sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2,  sse2,  sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2,  mmx2,  sse2

SAD16_CACHELINE_FUNC ssse3,  8
SAD16_CACHELINE_FUNC ssse3, 16
SAD16_CACHELINE_LOOP_SSSE3 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3