1 ;*****************************************************************************
2 ;* sad-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorski@gmail.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;*****************************************************************************
27 %include "x86util.asm"
1 pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 ; pshufb mask: replicates source bytes 0,2,4,6 four times each (used to splat the four 8x8c DC predictor values across their quadrants)
37 ;=============================================================================
39 ;=============================================================================
41 %macro SAD_INC_2x16P 0
72 punpckldq mm1, [r0+r1]
73 punpckldq mm2, [r2+r3]
80 ;-----------------------------------------------------------------------------
81 ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
82 ;-----------------------------------------------------------------------------
84 cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
103 ;=============================================================================
105 ;=============================================================================
107 %macro SAD_END_SSE2 0
115 ;-----------------------------------------------------------------------------
116 ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
117 ;-----------------------------------------------------------------------------
118 cglobal x264_pixel_sad_16x16_%1, 4,4,8
182 ;-----------------------------------------------------------------------------
183 ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
184 ;-----------------------------------------------------------------------------
185 cglobal x264_pixel_sad_16x8_%1, 4,4
224 %define movdqu movdqa
228 %macro SAD_INC_4x8P_SSE 1
251 ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
252 cglobal x264_pixel_sad_8x16_sse2, 4,4
260 ;-----------------------------------------------------------------------------
261 ; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
262 ;-----------------------------------------------------------------------------
264 cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
266 movd mm0, [r1-FDEC_STRIDE]
267 movd mm1, [r0+FENC_STRIDE*0]
268 movd mm2, [r0+FENC_STRIDE*2]
270 punpckldq mm1, [r0+FENC_STRIDE*1]
271 punpckldq mm2, [r0+FENC_STRIDE*3]
277 movd [r2], mm0 ;V prediction cost
278 movd mm3, [r1+FDEC_STRIDE*0-4]
279 movd mm0, [r1+FDEC_STRIDE*1-4]
280 movd mm4, [r1+FDEC_STRIDE*2-4]
281 movd mm5, [r1+FDEC_STRIDE*3-4]
295 pshufw mm5, mm5, 0x0 ;DC prediction
303 movd [r2+8], mm5 ;DC prediction cost
304 movd [r2+4], mm1 ;H prediction cost
307 ;-----------------------------------------------------------------------------
308 ; void intra_sad_x3_8x8 ( uint8_t *fenc, uint8_t edge[33], int res[3]);
309 ;-----------------------------------------------------------------------------
320 %macro INTRA_SAD_HVDC_ITER 2
321 movq m5, [r0+FENC_STRIDE*%1]
346 cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
349 movq m6, [r1+16] ;V prediction
354 paddw m0, [pw_8 GLOBAL]
357 pshufw m0, m0, 0x0 ;DC prediction
359 INTRA_SAD_HVDC_ITER 0, 0xff
360 INTRA_SAD_HVDC_ITER 1, 0xaa
361 INTRA_SAD_HVDC_ITER 2, 0x55
362 INTRA_SAD_HVDC_ITER 3, 0x00
365 INTRA_SAD_HVDC_ITER 4, 0xff
366 INTRA_SAD_HVDC_ITER 5, 0xaa
367 INTRA_SAD_HVDC_ITER 6, 0x55
368 INTRA_SAD_HVDC_ITER 7, 0x00
374 ;-----------------------------------------------------------------------------
375 ; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
376 ;-----------------------------------------------------------------------------
378 %macro INTRA_SAD_HV_ITER 2
380 movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
381 movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
385 movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
386 movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
392 movq m4, [r0 + FENC_STRIDE*(%1+0)]
393 movq m5, [r0 + FENC_STRIDE*(%1+1)]
409 %macro INTRA_SAD_8x8C 1
410 cglobal x264_intra_sad_x3_8x8c_%1, 3,3
411 movq m6, [r1 - FDEC_STRIDE]
412 add r1, FDEC_STRIDE*4
414 movq m7, [pb_3 GLOBAL]
416 INTRA_SAD_HV_ITER 0, %1
417 INTRA_SAD_HV_ITER 2, %1
418 INTRA_SAD_HV_ITER 4, %1
419 INTRA_SAD_HV_ITER 6, %1
423 movq m2, [r1 + FDEC_STRIDE*-4 - 8]
424 movq m4, [r1 + FDEC_STRIDE*-2 - 8]
425 movq m3, [r1 + FDEC_STRIDE* 0 - 8]
426 movq m5, [r1 + FDEC_STRIDE* 2 - 8]
427 punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
428 punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
429 punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
430 punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
445 punpckldq m0, m2 ;s0 s1 s2 s3
446 pshufw m3, m0, 11110110b ;s2,s1,s3,s3
447 pshufw m0, m0, 01110100b ;s0,s1,s3,s1
450 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
453 pshufb xmm0, [pb_shuf8x8c GLOBAL]
454 movq xmm1, [r0+FENC_STRIDE*0]
455 movq xmm2, [r0+FENC_STRIDE*1]
456 movq xmm3, [r0+FENC_STRIDE*2]
457 movq xmm4, [r0+FENC_STRIDE*3]
458 movhps xmm1, [r0+FENC_STRIDE*4]
459 movhps xmm2, [r0+FENC_STRIDE*5]
460 movhps xmm3, [r0+FENC_STRIDE*6]
461 movhps xmm4, [r0+FENC_STRIDE*7]
476 punpcklbw m0, m0 ; 4x dc0 4x dc1
477 punpckhbw m1, m1 ; 4x dc2 4x dc3
478 movq m2, [r0+FENC_STRIDE*0]
479 movq m3, [r0+FENC_STRIDE*1]
480 movq m4, [r0+FENC_STRIDE*2]
481 movq m5, [r0+FENC_STRIDE*3]
482 movq m6, [r0+FENC_STRIDE*4]
483 movq m7, [r0+FENC_STRIDE*5]
488 movq m0, [r0+FENC_STRIDE*6]
492 psadbw m1, [r0+FENC_STRIDE*7]
506 INTRA_SAD_8x8C mmxext
510 ;-----------------------------------------------------------------------------
511 ; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
512 ;-----------------------------------------------------------------------------
514 ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
515 ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
516 %macro INTRA_SAD16 1-2 0
517 cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
520 psadbw mm0, [r1-FDEC_STRIDE+0]
521 psadbw mm1, [r1-FDEC_STRIDE+8]
525 mova m1, [pb_3 GLOBAL]
529 movzx r4d, byte [r1-1+FDEC_STRIDE*x]
537 mova m5, [r1-FDEC_STRIDE]
541 mova m1, [r1-FDEC_STRIDE+8]
547 mov r3d, 15*FENC_STRIDE
549 SPLATB m6, r1+r3*2-1, m1
572 add r3d, -FENC_STRIDE
591 %define SPLATB SPLATB_MMX
595 %define SPLATB SPLATB_SSSE3
600 ;=============================================================================
602 ;=============================================================================
604 %macro SAD_X3_START_1x8P 0
627 %macro SAD_X3_START_2x4P 3
632 punpckldq mm3, [r0+FENC_STRIDE]
633 punpckldq %1, [r1+r4]
634 punpckldq %2, [r2+r4]
635 punpckldq %3, [r3+r4]
641 %macro SAD_X3_2x16P 1
648 SAD_X3_1x8P FENC_STRIDE, r4
649 SAD_X3_1x8P FENC_STRIDE+8, r4+8
650 add r0, 2*FENC_STRIDE
662 SAD_X3_1x8P FENC_STRIDE, r4
663 add r0, 2*FENC_STRIDE
671 SAD_X3_START_2x4P mm0, mm1, mm2
673 SAD_X3_START_2x4P mm4, mm5, mm6
678 add r0, 2*FENC_STRIDE
684 %macro SAD_X4_START_1x8P 0
711 %macro SAD_X4_START_2x4P 0
717 punpckldq mm7, [r0+FENC_STRIDE]
718 punpckldq mm0, [r1+r5]
719 punpckldq mm1, [r2+r5]
720 punpckldq mm2, [r3+r5]
721 punpckldq mm3, [r4+r5]
728 %macro SAD_X4_INC_2x4P 0
732 punpckldq mm7, [r0+FENC_STRIDE]
733 punpckldq mm4, [r1+r5]
734 punpckldq mm5, [r2+r5]
741 punpckldq mm4, [r3+r5]
742 punpckldq mm5, [r4+r5]
749 %macro SAD_X4_2x16P 1
756 SAD_X4_1x8P FENC_STRIDE, r5
757 SAD_X4_1x8P FENC_STRIDE+8, r5+8
758 add r0, 2*FENC_STRIDE
771 SAD_X4_1x8P FENC_STRIDE, r5
772 add r0, 2*FENC_STRIDE
785 add r0, 2*FENC_STRIDE
815 ;-----------------------------------------------------------------------------
816 ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
817 ; uint8_t *pix2, int i_stride, int scores[3] )
818 ;-----------------------------------------------------------------------------
820 cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
823 movsxd r %+ i, r %+ i %+ d
849 ;=============================================================================
851 ;=============================================================================
853 %macro SAD_X3_START_1x16P_SSE2 0
863 %macro SAD_X3_1x16P_SSE2 2
876 %macro SAD_X3_2x16P_SSE2 1
878 SAD_X3_START_1x16P_SSE2
880 SAD_X3_1x16P_SSE2 0, 0
882 SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
883 add r0, 2*FENC_STRIDE
889 %macro SAD_X3_START_2x8P_SSE2 0
894 movhps xmm7, [r0+FENC_STRIDE]
903 %macro SAD_X3_2x8P_SSE2 0
908 movhps xmm7, [r0+FENC_STRIDE]
920 %macro SAD_X4_START_2x8P_SSE2 0
926 movhps xmm7, [r0+FENC_STRIDE]
937 %macro SAD_X4_2x8P_SSE2 0
944 movhps xmm7, [r0+FENC_STRIDE]
958 movhps xmm7, [r0+FENC_STRIDE]
976 %macro SAD_X4_START_1x16P_SSE2 0
988 %macro SAD_X4_1x16P_SSE2 2
1008 movdqu xmm4, [r4+%2]
1016 %macro SAD_X4_2x16P_SSE2 1
1018 SAD_X4_START_1x16P_SSE2
1020 SAD_X4_1x16P_SSE2 0, 0
1022 SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
1023 add r0, 2*FENC_STRIDE
1030 %macro SAD_X3_2x8P_SSE2 1
1032 SAD_X3_START_2x8P_SSE2
1036 add r0, 2*FENC_STRIDE
1042 %macro SAD_X4_2x8P_SSE2 1
1044 SAD_X4_START_2x8P_SSE2
1048 add r0, 2*FENC_STRIDE
1055 %macro SAD_X3_END_SSE2 0
1075 %macro SAD_X4_END_SSE2 0
1090 %macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
1099 %macro SAD_X3_1x16P_SSE2_MISALIGN 2
1100 movdqa xmm3, [r0+%1]
1101 movdqu xmm4, [r1+%2]
1102 movdqu xmm5, [r2+%2]
1105 psadbw xmm3, [r3+%2]
1111 %macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
1122 %macro SAD_X4_1x16P_SSE2_MISALIGN 2
1123 movdqa xmm7, [r0+%1]
1124 movdqu xmm4, [r1+%2]
1125 movdqu xmm5, [r2+%2]
1126 movdqu xmm6, [r3+%2]
1130 psadbw xmm7, [r4+%2]
1137 %macro SAD_X3_2x16P_SSE2_MISALIGN 1
1139 SAD_X3_START_1x16P_SSE2_MISALIGN
1141 SAD_X3_1x16P_SSE2_MISALIGN 0, 0
1143 SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
1144 add r0, 2*FENC_STRIDE
1150 %macro SAD_X4_2x16P_SSE2_MISALIGN 1
1152 SAD_X4_START_1x16P_SSE2_MISALIGN
1154 SAD_X4_1x16P_SSE2_MISALIGN 0, 0
1156 SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
1157 add r0, 2*FENC_STRIDE
1164 ;-----------------------------------------------------------------------------
1165 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
1166 ; uint8_t *pix2, int i_stride, int scores[3] )
1167 ;-----------------------------------------------------------------------------
1169 cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
1172 movsxd r %+ i, r %+ i %+ d
1174 SAD_X%1_2x%2P_SSE2 1
1176 SAD_X%1_2x%2P_SSE2 0
1181 %macro SAD_X_SSE2_MISALIGN 4
1182 cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
1185 movsxd r %+ i, r %+ i %+ d
1187 SAD_X%1_2x%2P_SSE2_MISALIGN 1
1189 SAD_X%1_2x%2P_SSE2_MISALIGN 0
1194 SAD_X_SSE2 3, 16, 16, sse2
1195 SAD_X_SSE2 3, 16, 8, sse2
1196 SAD_X_SSE2 3, 8, 16, sse2
1197 SAD_X_SSE2 3, 8, 8, sse2
1198 SAD_X_SSE2 3, 8, 4, sse2
1199 SAD_X_SSE2 4, 16, 16, sse2
1200 SAD_X_SSE2 4, 16, 8, sse2
1201 SAD_X_SSE2 4, 8, 16, sse2
1202 SAD_X_SSE2 4, 8, 8, sse2
1203 SAD_X_SSE2 4, 8, 4, sse2
1205 SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
1206 SAD_X_SSE2_MISALIGN 3, 16, 8, sse2
1207 SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
1208 SAD_X_SSE2_MISALIGN 4, 16, 8, sse2
1210 %define movdqu lddqu
1211 SAD_X_SSE2 3, 16, 16, sse3
1212 SAD_X_SSE2 3, 16, 8, sse3
1213 SAD_X_SSE2 4, 16, 16, sse3
1214 SAD_X_SSE2 4, 16, 8, sse3
1219 ;=============================================================================
1220 ; SAD cacheline split
1221 ;=============================================================================
1223 ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
1224 ; unless the unaligned data spans the border between 2 cachelines, in which
1225 ; case it's really slow. The exact numbers may differ, but all Intel cpus prior
1226 ; to Nehalem have a large penalty for cacheline splits.
1227 ; (8-byte alignment exactly half way between two cachelines is ok though.)
1228 ; LDDQU was supposed to fix this, but it only works on Pentium 4.
1229 ; So in the split case we load aligned data and explicitly perform the
1230 ; alignment between registers. Like on archs that have only aligned loads,
1231 ; except complicated by the fact that PALIGNR takes only an immediate, not
1232 ; a variable alignment.
1233 ; It is also possible to hoist the realignment to the macroblock level (keep
1234 ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
1235 ; needed for that method makes it often slower.
1237 ; sad 16x16 costs on Core2:
1238 ; good offsets: 49 cycles (50/64 of all mvs)
1239 ; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
1240 ; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
1241 ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
1243 ; computed jump assumes this loop is exactly 80 bytes
1244 %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
1246 sad_w16_align%1_sse2:
1247 movdqa xmm1, [r2+16]
1248 movdqa xmm2, [r2+r3+16]
1250 movdqa xmm4, [r2+r3]
1258 psadbw xmm2, [r0+r1]
1264 jg sad_w16_align%1_sse2
1268 ; computed jump assumes this loop is exactly 64 bytes
1269 %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
1271 sad_w16_align%1_ssse3:
1272 movdqa xmm1, [r2+16]
1273 movdqa xmm2, [r2+r3+16]
1274 palignr xmm1, [r2], %1
1275 palignr xmm2, [r2+r3], %1
1277 psadbw xmm2, [r0+r1]
1283 jg sad_w16_align%1_ssse3
1287 %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
1288 cglobal x264_pixel_sad_16x%2_cache64_%1
1292 jle x264_pixel_sad_16x%2_sse2
1297 shl r4d, 6 ; code size = 64
1300 shl r4d, 4 ; code size = 80
1302 %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
1304 lea r5, [sad_w16_addr GLOBAL]
1307 lea r5, [sad_w16_addr + r4 GLOBAL]
1319 %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
1321 and eax, 0x17|%1|(%4>>1)
1322 cmp eax, 0x10|%1|(%4>>1)
1323 jle x264_pixel_sad_%1x%2_mmxext
1326 movd mm6, [sw_64 GLOBAL]
1335 %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
1336 cglobal x264_pixel_sad_16x%1_cache%2_mmxext
1337 SAD_CACHELINE_START_MMX2 16, %1, %1, %2
1361 %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
1362 cglobal x264_pixel_sad_8x%1_cache%2_mmxext
1363 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
1387 ; sad_x3/x4_cache64: check each mv.
1388 ; if they're all within a cacheline, use normal sad_x3/x4.
1389 ; otherwise, send them individually to sad_cache64.
1390 %macro CHECK_SPLIT 3 ; pix, width, cacheline
1392 and eax, 0x17|%2|(%3>>1)
1393 cmp eax, 0x10|%2|(%3>>1)
1397 %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
1398 cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
1399 CHECK_SPLIT r1m, %1, %3
1400 CHECK_SPLIT r2m, %1, %3
1401 CHECK_SPLIT r3m, %1, %3
1402 jmp x264_pixel_sad_x3_%1x%2_%4
1417 call x264_pixel_sad_%1x%2_cache%3_%5
1425 call x264_pixel_sad_%1x%2_cache%3_%5
1433 call x264_pixel_sad_%1x%2_cache%3_%5
1446 call x264_pixel_sad_%1x%2_cache%3_%5
1450 call x264_pixel_sad_%1x%2_cache%3_%5
1454 call x264_pixel_sad_%1x%2_cache%3_%5
1462 %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
1463 cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
1464 CHECK_SPLIT r1m, %1, %3
1465 CHECK_SPLIT r2m, %1, %3
1466 CHECK_SPLIT r3m, %1, %3
1467 CHECK_SPLIT r4m, %1, %3
1468 jmp x264_pixel_sad_x4_%1x%2_%4
1483 call x264_pixel_sad_%1x%2_cache%3_%5
1491 call x264_pixel_sad_%1x%2_cache%3_%5
1499 call x264_pixel_sad_%1x%2_cache%3_%5
1507 call x264_pixel_sad_%1x%2_cache%3_%5
1520 call x264_pixel_sad_%1x%2_cache%3_%5
1524 call x264_pixel_sad_%1x%2_cache%3_%5
1528 call x264_pixel_sad_%1x%2_cache%3_%5
1532 call x264_pixel_sad_%1x%2_cache%3_%5
1540 %macro SADX34_CACHELINE_FUNC 1+
1541 SADX3_CACHELINE_FUNC %1
1542 SADX4_CACHELINE_FUNC %1
1546 ; instantiate the aligned sads
1549 SAD16_CACHELINE_FUNC_MMX2 8, 32
1550 SAD16_CACHELINE_FUNC_MMX2 16, 32
1551 SAD8_CACHELINE_FUNC_MMX2 4, 32
1552 SAD8_CACHELINE_FUNC_MMX2 8, 32
1553 SAD8_CACHELINE_FUNC_MMX2 16, 32
1554 SAD16_CACHELINE_FUNC_MMX2 8, 64
1555 SAD16_CACHELINE_FUNC_MMX2 16, 64
1556 %endif ; !ARCH_X86_64
1557 SAD8_CACHELINE_FUNC_MMX2 4, 64
1558 SAD8_CACHELINE_FUNC_MMX2 8, 64
1559 SAD8_CACHELINE_FUNC_MMX2 16, 64
1562 SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
1563 SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext, mmxext
1564 SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext, mmxext
1565 SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext, mmxext
1566 SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
1567 SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext, mmxext
1568 %endif ; !ARCH_X86_64
1569 SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext, mmxext
1570 SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext, mmxext
1573 SAD16_CACHELINE_FUNC sse2, 8
1574 SAD16_CACHELINE_FUNC sse2, 16
1577 SAD16_CACHELINE_LOOP_SSE2 i
1580 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
1581 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
1582 %endif ; !ARCH_X86_64
1583 SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmxext, sse2
1585 SAD16_CACHELINE_FUNC ssse3, 8
1586 SAD16_CACHELINE_FUNC ssse3, 16
1589 SAD16_CACHELINE_LOOP_SSSE3 i
1592 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
1593 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3