;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorski@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; pshufb controls used by the intra SAD functions below
h4x4_pred_shuf:  db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15
h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
h8x8_pred_shuf:  times 8 db 1
;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_2x16P 0
    punpckldq mm1, [r0+r1]
    punpckldq mm2, [r2+r3]
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
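; Scalar reference for the pixel_sad functions in this file (illustration
; only, not part of the build); the SIMD versions below compute the same sum
; 8 or 16 bytes at a time with PSADBW:
;
;     #include <stdlib.h> /* abs */
;     int pixel_sad_WxH( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 )
;     {
;         int sum = 0;
;         for( int y = 0; y < H; y++, pix1 += i_stride1, pix2 += i_stride2 )
;             for( int x = 0; x < W; x++ )
;                 sum += abs( pix1[x] - pix2[x] );
;         return sum;
;     }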
cglobal pixel_sad_%1x%2_mmxext, 4,4
;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_END_SSE2 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16_%1, 4,4,8
;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8_%1, 4,4
%define movdqu movdqa

%macro SAD_INC_4x8P_SSE 1

; Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4
;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
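; Reference behavior (illustration only): SAD the 4x4 encoded block against
; the three cheap intra predictions. Both versions below store res[0]=V,
; res[1]=H, res[2]=DC, and read both neighbor edges unconditionally:
;
;     void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] )
;     {
;         int dc = 0;
;         for( int i = 0; i < 4; i++ ) // DC = average of the 4 top + 4 left pixels
;             dc += fdec[i-FDEC_STRIDE] + fdec[i*FDEC_STRIDE-1];
;         dc = (dc+4)>>3;
;         res[0] = res[1] = res[2] = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int p = fenc[y*FENC_STRIDE+x];
;                 res[0] += abs( p - fdec[x-FDEC_STRIDE] );   // V: top row repeated
;                 res[1] += abs( p - fdec[y*FDEC_STRIDE-1] ); // H: left column repeated
;                 res[2] += abs( p - dc );                    // DC: flat block
;             }
;     }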
cglobal intra_sad_x3_4x4_mmxext, 3,3
    movd      mm0, [r1-FDEC_STRIDE]
    movd      mm1, [r0+FENC_STRIDE*0]
    movd      mm2, [r0+FENC_STRIDE*2]
    punpckldq mm1, [r0+FENC_STRIDE*1]
    punpckldq mm2, [r0+FENC_STRIDE*3]
    movd      [r2], mm0        ; V prediction cost
    movd      mm3, [r1+FDEC_STRIDE*0-4]
    movd      mm0, [r1+FDEC_STRIDE*1-4]
    movd      mm4, [r1+FDEC_STRIDE*2-4]
    movd      mm5, [r1+FDEC_STRIDE*3-4]
    pshufw    mm5, mm5, 0x0    ; DC prediction
    movd      [r2+8], mm5      ; DC prediction cost
    movd      [r2+4], mm1      ; H prediction cost
cglobal intra_sad_x3_4x4_sse4, 3,3
    movd       xmm4, [r1+FDEC_STRIDE*0-4]
    pinsrd     xmm4, [r1+FDEC_STRIDE*1-4], 1
    pinsrd     xmm4, [r1+FDEC_STRIDE*2-4], 2
    pinsrd     xmm4, [r1+FDEC_STRIDE*3-4], 3
    movd       xmm2, [r1-FDEC_STRIDE]
    pshufb     xmm4, [h4x4_pred_shuf2] ; EFGH
    pshufb     xmm5, [h4x4_pred_shuf]  ; EEEEFFFFGGGGHHHH
    pshufd     xmm0, xmm2, 0           ; ABCDABCDABCDABCD
    punpckldq  xmm2, xmm4              ; ABCDEFGH
    movd       xmm1, [r0+FENC_STRIDE*0]
    pinsrd     xmm1, [r0+FENC_STRIDE*1], 1
    pinsrd     xmm1, [r0+FENC_STRIDE*2], 2
    pinsrd     xmm1, [r0+FENC_STRIDE*3], 3
    pshufb     xmm2, xmm3              ; DC prediction
    punpcklqdq xmm0, xmm5
    punpckhqdq xmm3, xmm5
    movq       [r2], xmm0              ; V/H prediction costs
    movd       [r2+8], xmm2            ; DC prediction cost
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3] );
;-----------------------------------------------------------------------------
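; Same idea as the 4x4 version, but the neighbors arrive packed in edge[33]:
; the loads below take the 8 left pixels from edge[7..14] and the 8 top
; pixels from edge[16..23]. Reference sketch (illustration only; the
; reversed left-pixel order l7..l0 at edge[7..14] is x264's documented edge
; layout, which is not itself visible in this file):
;
;     void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3] )
;     {
;         uint8_t *top = &edge[16], *left = &edge[14]; // left[-i] == l_i
;         int dc = 0;
;         for( int i = 0; i < 8; i++ )
;             dc += top[i] + left[-i];
;         dc = (dc+8)>>4;
;         res[0] = res[1] = res[2] = 0;
;         for( int y = 0; y < 8; y++ )
;             for( int x = 0; x < 8; x++ )
;             {
;                 int p = fenc[y*FENC_STRIDE+x];
;                 res[0] += abs( p - top[x] );   // V
;                 res[1] += abs( p - left[-y] ); // H
;                 res[2] += abs( p - dc );       // DC
;             }
;     }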
%macro INTRA_SAD_HVDC_ITER 2
    movq      m5, [r0+FENC_STRIDE*%1]

cglobal intra_sad_x3_8x8_mmxext, 3,3
    movq      m6, [r1+16]  ; V prediction
    pshufw    m0, m0, 0x0  ; DC prediction
    INTRA_SAD_HVDC_ITER 0, 0xff
    INTRA_SAD_HVDC_ITER 1, 0xaa
    INTRA_SAD_HVDC_ITER 2, 0x55
    INTRA_SAD_HVDC_ITER 3, 0x00
    INTRA_SAD_HVDC_ITER 4, 0xff
    INTRA_SAD_HVDC_ITER 5, 0xaa
    INTRA_SAD_HVDC_ITER 6, 0x55
    INTRA_SAD_HVDC_ITER 7, 0x00
cglobal intra_sad_x3_8x8_ssse3, 3,4,9
    lea        r11, [h8x8_pred_shuf]
%define shuf h8x8_pred_shuf
    movq       m0, [r1+7]   ; left pixels
    movq       m1, [r1+16]  ; top pixels
    pxor       m3, m3       ; V score accumulator
    punpcklqdq m1, m1       ; V prediction
    pshufb     m2, m3       ; DC prediction
    pxor       m4, m4       ; H score accumulator
    pxor       m5, m5       ; DC score accumulator
    movq       m6, [r0+FENC_STRIDE*0]
    movhps     m6, [r0+FENC_STRIDE*1]
    pshufb     m7, [shuf+r3*8] ; H prediction
    add        r0, FENC_STRIDE*2
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
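; For 8x8 chroma the DC prediction is not one flat value: per H.264, each
; 4x4 quadrant gets its own DC. With s0/s1 = sums of the left/right 4 top
; pixels and s2/s3 = sums of the upper/lower 4 left pixels (the same s0..s3
; named in the comments below), the four values are (illustration only):
;
;     dc0 = (s0 + s2 + 4) >> 3;  // top-left: 4 top + 4 left pixels
;     dc1 = (s1 + 2) >> 2;       // top-right: its 4 top pixels only
;     dc2 = (s3 + 2) >> 2;       // bottom-left: its 4 left pixels only
;     dc3 = (s1 + s3 + 4) >> 3;  // bottom-right: 4 top + 4 left pixels
;
; which is the {s0+s2, s1, s3, s1+s3} vector the pshufw/pavgw sequence
; below assembles.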
%macro INTRA_SAD_HV_ITER 2
    movd      m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
    movd      m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
    movq      m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
    movq      m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
    movq      m4, [r0 + FENC_STRIDE*(%1+0)]
    movq      m5, [r0 + FENC_STRIDE*(%1+1)]
%macro INTRA_SAD_8x8C 1
cglobal intra_sad_x3_8x8c_%1, 3,3
    movq      m6, [r1 - FDEC_STRIDE]
    add       r1, FDEC_STRIDE*4
    INTRA_SAD_HV_ITER 0, %1
    INTRA_SAD_HV_ITER 2, %1
    INTRA_SAD_HV_ITER 4, %1
    INTRA_SAD_HV_ITER 6, %1
    movq      m2, [r1 + FDEC_STRIDE*-4 - 8]
    movq      m4, [r1 + FDEC_STRIDE*-2 - 8]
    movq      m3, [r1 + FDEC_STRIDE* 0 - 8]
    movq      m5, [r1 + FDEC_STRIDE* 2 - 8]
    punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
    punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
    punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
    punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
    punpckldq m0, m2            ; s0 s1 s2 s3
    pshufw    m3, m0, 11110110b ; s2,s1,s3,s3
    pshufw    m0, m0, 01110100b ; s0,s1,s3,s1
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    pshufb    xmm0, [pb_shuf8x8c]
    movq      xmm1, [r0+FENC_STRIDE*0]
    movq      xmm2, [r0+FENC_STRIDE*1]
    movq      xmm3, [r0+FENC_STRIDE*2]
    movq      xmm4, [r0+FENC_STRIDE*3]
    movhps    xmm1, [r0+FENC_STRIDE*4]
    movhps    xmm2, [r0+FENC_STRIDE*5]
    movhps    xmm3, [r0+FENC_STRIDE*6]
    movhps    xmm4, [r0+FENC_STRIDE*7]
    punpcklbw m0, m0            ; 4x dc0, 4x dc1
    punpckhbw m1, m1            ; 4x dc2, 4x dc3
    movq      m2, [r0+FENC_STRIDE*0]
    movq      m3, [r0+FENC_STRIDE*1]
    movq      m4, [r0+FENC_STRIDE*2]
    movq      m5, [r0+FENC_STRIDE*3]
    movq      m6, [r0+FENC_STRIDE*4]
    movq      m7, [r0+FENC_STRIDE*5]
    movq      m0, [r0+FENC_STRIDE*6]
    psadbw    m1, [r0+FENC_STRIDE*7]

INTRA_SAD_8x8C mmxext
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

; xmm7: DC prediction    xmm6: H prediction    xmm5: V prediction
; xmm4: DC pred score    xmm3: H pred score    xmm2: V pred score
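; Reference sketch (illustration only), assuming the same res[] ordering as
; the smaller block sizes above (V, H, DC) and the neighbor addressing the
; loads below use:
;
;     void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] )
;     {
;         int dc = 0;
;         for( int i = 0; i < 16; i++ )
;             dc += fdec[i-FDEC_STRIDE] + fdec[i*FDEC_STRIDE-1];
;         dc = (dc+16)>>5;
;         res[0] = res[1] = res[2] = 0;
;         for( int y = 0; y < 16; y++ )
;             for( int x = 0; x < 16; x++ )
;             {
;                 int p = fenc[y*FENC_STRIDE+x];
;                 res[0] += abs( p - fdec[x-FDEC_STRIDE] );   // V
;                 res[1] += abs( p - fdec[y*FDEC_STRIDE-1] ); // H
;                 res[2] += abs( p - dc );                    // DC
;             }
;     }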
%macro INTRA_SAD16 1-2 0
cglobal intra_sad_x3_16x16_%1, 3,5,%2
    psadbw    mm0, [r1-FDEC_STRIDE+0]
    psadbw    mm1, [r1-FDEC_STRIDE+8]
    movzx     r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
%if (x&3)==3 && x!=15
    add       r1, FDEC_STRIDE*4
    sub       r1, FDEC_STRIDE*12
    mova      m5, [r1-FDEC_STRIDE]
    mova      m1, [r1-FDEC_STRIDE+8]
    mov       r3d, 15*FENC_STRIDE
    SPLATB    m6, r1+r3*2-1, m1
    add       r3d, -FENC_STRIDE
%define SPLATB SPLATB_MMX
%define SPLATB SPLATB_SSSE3
;=============================================================================
; SAD x3/x4 MMX
;=============================================================================

%macro SAD_X3_START_1x8P 0
%macro SAD_X3_START_2x4P 3
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]

%macro SAD_X3_2x16P 1
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add  r0, 2*FENC_STRIDE

    SAD_X3_1x8P FENC_STRIDE, r4
    add  r0, 2*FENC_STRIDE

    SAD_X3_START_2x4P mm0, mm1, mm2
    SAD_X3_START_2x4P mm4, mm5, mm6
    add  r0, 2*FENC_STRIDE
%macro SAD_X4_START_1x8P 0

%macro SAD_X4_START_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]

%macro SAD_X4_INC_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]
    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]

%macro SAD_X4_2x16P 1
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add  r0, 2*FENC_STRIDE

    SAD_X4_1x8P FENC_STRIDE, r5
    add  r0, 2*FENC_STRIDE

    add  r0, 2*FENC_STRIDE
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
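; Reference behavior (illustration only): one encoded block is scored
; against 3 (or, for x4, 4) candidate references sharing a single stride;
; doing them together is cheaper than separate pixel_sad calls because fenc
; is only loaded once:
;
;     void pixel_sad_x3_WxH( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                            uint8_t *pix2, int i_stride, int scores[3] )
;     {
;         scores[0] = scores[1] = scores[2] = 0;
;         for( int y = 0; y < H; y++ )
;             for( int x = 0; x < W; x++ )
;             {
;                 int p = fenc[y*FENC_STRIDE+x];
;                 scores[0] += abs( p - pix0[y*i_stride+x] );
;                 scores[1] += abs( p - pix1[y*i_stride+x] );
;                 scores[2] += abs( p - pix2[y*i_stride+x] );
;             }
;     }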
cglobal pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
    movsxd r %+ i, r %+ i %+ d
;=============================================================================
; SAD x3/x4 XMM
;=============================================================================

%macro SAD_X3_START_1x16P_SSE2 0
%macro SAD_X3_1x16P_SSE2 2

%macro SAD_X3_2x16P_SSE2 1
    SAD_X3_START_1x16P_SSE2
    SAD_X3_1x16P_SSE2 0, 0
    SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
    add  r0, 2*FENC_STRIDE
%macro SAD_X3_START_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm0, [r1+r4]
    movhps xmm1, [r2+r4]
    movhps xmm2, [r3+r4]

%macro SAD_X3_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm3, [r1+r4]
    movhps xmm4, [r2+r4]
    movhps xmm5, [r3+r4]
%macro SAD_X4_START_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm0, [r1+r5]
    movhps xmm1, [r2+r5]
    movhps xmm2, [r3+r5]
    movhps xmm3, [r4+r5]

%macro SAD_X4_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm4, [r1+r5]
    movhps xmm5, [r2+r5]
    movhps xmm6, [r3+r5]
    movhps xmm8, [r4+r5]

    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm4, [r1+r5]
    movhps xmm5, [r2+r5]
    movhps xmm6, [r3+r5]
    movhps xmm4, [r4+r5]
%macro SAD_X4_START_1x16P_SSE2 0

%macro SAD_X4_1x16P_SSE2 2
    movdqa xmm7, [r0+%1]
    movdqu xmm4, [r1+%2]
    movdqu xmm5, [r2+%2]
    movdqu xmm6, [r3+%2]
    movdqu xmm8, [r4+%2]
    movdqu xmm4, [r4+%2]

%macro SAD_X4_2x16P_SSE2 1
    SAD_X4_START_1x16P_SSE2
    SAD_X4_1x16P_SSE2 0, 0
    SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
    add  r0, 2*FENC_STRIDE
%macro SAD_X3_2x8P_SSE2 1
    SAD_X3_START_2x8P_SSE2
    add  r0, 2*FENC_STRIDE

%macro SAD_X4_2x8P_SSE2 1
    SAD_X4_START_2x8P_SSE2
    add  r0, 2*FENC_STRIDE

%macro SAD_X3_END_SSE2 0

%macro SAD_X4_END_SSE2 0
%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0

%macro SAD_X3_1x16P_SSE2_MISALIGN 2
    movdqa xmm3, [r0+%1]
    movdqu xmm4, [r1+%2]
    movdqu xmm5, [r2+%2]
    psadbw xmm3, [r3+%2]

%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0

%macro SAD_X4_1x16P_SSE2_MISALIGN 2
    movdqa xmm7, [r0+%1]
    movdqu xmm4, [r1+%2]
    movdqu xmm5, [r2+%2]
    movdqu xmm6, [r3+%2]
    psadbw xmm7, [r4+%2]
%macro SAD_X3_2x16P_SSE2_MISALIGN 1
    SAD_X3_START_1x16P_SSE2_MISALIGN
    SAD_X3_1x16P_SSE2_MISALIGN 0, 0
    SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
    add  r0, 2*FENC_STRIDE

%macro SAD_X4_2x16P_SSE2_MISALIGN 1
    SAD_X4_START_1x16P_SSE2_MISALIGN
    SAD_X4_1x16P_SSE2_MISALIGN 0, 0
    SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
    add  r0, 2*FENC_STRIDE
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
cglobal pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
    movsxd r %+ i, r %+ i %+ d
    SAD_X%1_2x%2P_SSE2 1
    SAD_X%1_2x%2P_SSE2 0
%macro SAD_X_SSE2_MISALIGN 4
cglobal pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
    movsxd r %+ i, r %+ i %+ d
    SAD_X%1_2x%2P_SSE2_MISALIGN 1
    SAD_X%1_2x%2P_SSE2_MISALIGN 0
SAD_X_SSE2 3, 16, 16, sse2
SAD_X_SSE2 3, 16,  8, sse2
SAD_X_SSE2 3,  8, 16, sse2
SAD_X_SSE2 3,  8,  8, sse2
SAD_X_SSE2 3,  8,  4, sse2
SAD_X_SSE2 4, 16, 16, sse2
SAD_X_SSE2 4, 16,  8, sse2
SAD_X_SSE2 4,  8, 16, sse2
SAD_X_SSE2 4,  8,  8, sse2
SAD_X_SSE2 4,  8,  4, sse2

SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
SAD_X_SSE2_MISALIGN 3, 16,  8, sse2
SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
SAD_X_SSE2_MISALIGN 4, 16,  8, sse2

%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3
SAD_X_SSE2 3, 16,  8, sse3
SAD_X_SSE2 4, 16, 16, sse3
SAD_X_SSE2 4, 16,  8, sse3
;=============================================================================
; SAD cacheline split
;=============================================================================
; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel CPUs
; prior to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok, though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers, as on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment (see the intrinsics sketch below).
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; that method needs often makes it slower.
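;
; The split-load trick in C intrinsics (illustration only; ref/enc are
; hypothetical pointers, and the shift 'a' must be a compile-time constant,
; mirroring PALIGNR's immediate-only operand):
;
;     #include <tmmintrin.h> /* SSSE3 */
;     // one row of a width-16 SAD against ref+a, ref 16-byte aligned, 0 < a < 16
;     __m128i lo  = _mm_load_si128( (__m128i*)ref );      // aligned low half
;     __m128i hi  = _mm_load_si128( (__m128i*)ref + 1 );  // aligned high half
;     __m128i row = _mm_alignr_epi8( hi, lo, a );         // == unaligned load of ref+a
;     __m128i sad = _mm_sad_epu8( row, _mm_load_si128( (__m128i*)enc ) );
;
; Because the shift can't vary at runtime, the macros below emit one copy of
; the SAD loop per alignment and select among them with a computed jump.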
; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
sad_w16_align%1_sse2:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    movdqa  xmm4, [r2+r3]
    psadbw  xmm2, [r0+r1]
    jg      sad_w16_align%1_sse2
; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
sad_w16_align%1_ssse3:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw  xmm2, [r0+r1]
    jg      sad_w16_align%1_ssse3
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    jle pixel_sad_16x%2_sse2
    shl  r4d, 6 ; code size = 64
    shl  r4d, 4 ; code size = 80
; jump base: the address the (never emitted) align0 copy of the loop would
; have, i.e. align1 minus one loop-body size
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
    lea  r5, [sad_w16_addr]
    lea  r5, [sad_w16_addr + r4]
%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    and  eax, 0x17|%1|(%4>>1)
    cmp  eax, 0x10|%1|(%4>>1)
    jle pixel_sad_%1x%2_mmxext
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmxext
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmxext
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
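; The and/cmp pair in CHECK_SPLIT is a branch-free form of this test
; (illustration only):
;
;     // does a width-byte-wide access at address a cross a cacheline?
;     int split = ((uintptr_t)a & (cacheline-1)) > cacheline - width;
;
; The mask 0x17|width|(cacheline>>1) leaves bit 3 out when width is 16, so
; 8-byte-aligned offsets still count as unsplit: per the note at the top of
; this section, a split exactly at 8-byte alignment carries no penalty (and
; the mmx functions only issue 8-byte loads anyway).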
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    and  eax, 0x17|%2|(%3>>1)
    cmp  eax, 0x10|%2|(%3>>1)
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp pixel_sad_x3_%1x%2_%4
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp pixel_sad_x4_%1x%2_%4
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1
; instantiate the aligned sads

SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64
SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16,  8, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC  8, 16, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC  8,  8, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16,  8, 64, mmxext, mmxext, mmxext
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC  8,  8, 64, mmxext, mmxext, mmxext
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmxext, sse2
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3