;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2015 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorski@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
hpred_shuf:   db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11

;=============================================================================
; SAD MMX
;=============================================================================
%macro SAD_INC_2x16P 0
    punpckldq mm1, [r0+r1]
    punpckldq mm2, [r2+r3]

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
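; As a reference for what these functions compute, a C sketch of the
; semantics (illustrative only; the name and W/H are placeholders):
;    int sad_WxH( uint8_t *pix1, intptr_t i1, uint8_t *pix2, intptr_t i2 )
;    {
;        int sum = 0;
;        for( int y = 0; y < H; y++, pix1 += i1, pix2 += i2 )
;            for( int x = 0; x < W; x++ )
;                sum += abs( pix1[x] - pix2[x] );
;        return sum;
;    }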
cglobal pixel_sad_%1x%2_mmx2, 4,4

;=============================================================================
; SAD XMM
;=============================================================================
%macro SAD_END_SSE2 0

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8

;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4

INIT_XMM sse2, aligned

%macro SAD_INC_4x8P_SSE 1
    ACCUM paddw, 0, 1, %1

; Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4

;-----------------------------------------------------------------------------
; int pixel_vsad( pixel *src, intptr_t stride, int height )
;-----------------------------------------------------------------------------
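; Roughly, in C (a sketch; the 16-pixel row width is an assumption based on
; the max-sum comments below): the SAD between each row and the row below
; it, i.e. a vertical-activity measure over height-1 row pairs.
;    int vsad( pixel *src, intptr_t stride, int height )
;    {
;        int sum = 0;
;        for( int y = 1; y < height; y++, src += stride )
;            for( int x = 0; x < 16; x++ )
;                sum += abs( src[x] - src[x+stride] );
;        return sum;
;    }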
cglobal pixel_vsad_mmx2, 3,3
;max sum: 31*16*255(pixel_max)=126480

cglobal pixel_vsad_sse2, 3,3
;max sum: 31*16*255(pixel_max)=126480

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
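; A C model of the x3 idea: SAD the encoded 4x4 block against the V (top
; row), H (left column) and DC predictions in a single pass. Sketch only;
; the store order below ([r2]=V, [r2+4]=H, [r2+8]=DC) matches res[]:
;    int dc = 4, sadv = 0, sadh = 0, saddc = 0;
;    for( int i = 0; i < 4; i++ )
;        dc += fdec[i-FDEC_STRIDE] + fdec[i*FDEC_STRIDE-1];
;    dc >>= 3;
;    for( int y = 0; y < 4; y++ )
;        for( int x = 0; x < 4; x++ )
;        {
;            int p = fenc[y*FENC_STRIDE+x];
;            sadv  += abs( p - fdec[x-FDEC_STRIDE] );   /* vertical */
;            sadh  += abs( p - fdec[y*FDEC_STRIDE-1] ); /* horizontal */
;            saddc += abs( p - dc );                    /* DC */
;        }
;    res[0] = sadv; res[1] = sadh; res[2] = saddc;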
cglobal intra_sad_x3_4x4_mmx2, 3,3
    movd    mm0, [r1-FDEC_STRIDE]
    movd    mm1, [r0+FENC_STRIDE*0]
    movd    mm2, [r0+FENC_STRIDE*2]
    punpckldq mm1, [r0+FENC_STRIDE*1]
    punpckldq mm2, [r0+FENC_STRIDE*3]
    movd    [r2], mm0 ;V prediction cost
    movd    mm3, [r1+FDEC_STRIDE*0-4]
    movd    mm0, [r1+FDEC_STRIDE*1-4]
    movd    mm4, [r1+FDEC_STRIDE*2-4]
    movd    mm5, [r1+FDEC_STRIDE*3-4]
    pshufw  mm5, mm5, 0 ;DC prediction
    movd    [r2+8], mm5 ;DC prediction cost
    movd    [r2+4], mm1 ;H prediction cost

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3] );
;-----------------------------------------------------------------------------
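; Same three modes as the 4x4 version, but the neighbors come from the
; filtered edge[] buffer. Assuming x264's usual edge layout (edge[16+x] =
; top row, which the [r1+16] V-prediction load below suggests, and
; edge[14-y] = left column), the DC term would be:
;    int dc = 8;
;    for( int i = 0; i < 8; i++ )
;        dc += edge[16+i] + edge[14-i];
;    dc >>= 4;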
%macro INTRA_SAD_HVDC_ITER 2
    movq    m5, [r0+FENC_STRIDE*%1]
    ACCUM   paddw, 1, 4, %1
    ACCUM   paddw, 2, 4, %1
    ACCUM   paddw, 3, 5, %1

cglobal intra_sad_x3_8x8_mmx2, 3,3
    movq    m6, [r1+16] ;V prediction
    pshufw  m0, m0, q0000 ;DC prediction
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
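; Chroma 8x8 DC prediction works per 4x4 quadrant. With s0/s1 = sums of the
; left/right halves of the top row and s2/s3 = sums of the upper/lower
; halves of the left column (the "s0 s1 s2 s3" produced by psadbw below),
; the H.264 rules give:
;    dc0 = ( s0 + s2 + 4 ) >> 3; /* top-left:     top and left */
;    dc1 = ( s1 + 2 ) >> 2;      /* top-right:    top only     */
;    dc2 = ( s2 + 2 ) >> 2;      /* bottom-left:  left only    */
;    dc3 = ( s1 + s3 + 4 ) >> 3; /* bottom-right: top and left */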
%macro INTRA_SAD_HV_ITER 1
    movd    m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
    movd    m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
    movq    m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
    movq    m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
    movq    m4, [r0 + FENC_STRIDE*(%1+0)]
    movq    m5, [r0 + FENC_STRIDE*(%1+1)]
    ACCUM   paddw, 0, 1, %1
    ACCUM   paddw, 2, 4, %1

%macro INTRA_SAD_8x8C 0
cglobal intra_sad_x3_8x8c, 3,3
    movq    m6, [r1 - FDEC_STRIDE]
    add     r1, FDEC_STRIDE*4
    movq    m2, [r1 + FDEC_STRIDE*-4 - 8]
    movq    m4, [r1 + FDEC_STRIDE*-2 - 8]
    movq    m3, [r1 + FDEC_STRIDE* 0 - 8]
    movq    m5, [r1 + FDEC_STRIDE* 2 - 8]
    punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
    punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
    punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
    punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
    punpckldq m0, m2 ;s0 s1 s2 s3
    pshufw  m3, m0, q3312 ;s2,s1,s3,s3
    pshufw  m0, m0, q1310 ;s0,s1,s3,s1
    pavgw   m0, m7 ; s0+s2, s1, s3, s1+s3
    pshufb  xmm0, [pb_shuf8x8c]
    movq    xmm1, [r0+FENC_STRIDE*0]
    movq    xmm2, [r0+FENC_STRIDE*1]
    movq    xmm3, [r0+FENC_STRIDE*2]
    movq    xmm4, [r0+FENC_STRIDE*3]
    movhps  xmm1, [r0+FENC_STRIDE*4]
    movhps  xmm2, [r0+FENC_STRIDE*5]
    movhps  xmm3, [r0+FENC_STRIDE*6]
    movhps  xmm4, [r0+FENC_STRIDE*7]
    punpcklbw m0, m0 ; 4x dc0 4x dc1
    punpckhbw m1, m1 ; 4x dc2 4x dc3
    movq    m2, [r0+FENC_STRIDE*0]
    movq    m3, [r0+FENC_STRIDE*1]
    movq    m4, [r0+FENC_STRIDE*2]
    movq    m5, [r0+FENC_STRIDE*3]
    movq    m6, [r0+FENC_STRIDE*4]
    movq    m7, [r0+FENC_STRIDE*5]
    movq    m0, [r0+FENC_STRIDE*6]
    psadbw  m1, [r0+FENC_STRIDE*7]

cglobal intra_sad_x3_8x8c, 3,3,7
    vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred
    add     r1, FDEC_STRIDE*4-1
    punpckldq xm3, xm2, xm5 ; V0 _ V1 _
    movd    xm0, [r1 + FDEC_STRIDE*-1 - 3]
    movd    xm1, [r1 + FDEC_STRIDE* 3 - 3]
    pinsrb  xm0, [r1 + FDEC_STRIDE*-4], 0
    pinsrb  xm1, [r1 + FDEC_STRIDE* 0], 0
    pinsrb  xm0, [r1 + FDEC_STRIDE*-3], 1
    pinsrb  xm1, [r1 + FDEC_STRIDE* 1], 1
    pinsrb  xm0, [r1 + FDEC_STRIDE*-2], 2
    pinsrb  xm1, [r1 + FDEC_STRIDE* 2], 2
    punpcklqdq xm0, xm1 ; H0 _ H1 _
    vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1
    pshufb  xm0, [hpred_shuf] ; H00224466 H11335577
    psadbw  m3, m5 ; s0 s1 s2 s3
    vpermq  m4, m3, q3312 ; s2 s1 s3 s3
    vpermq  m3, m3, q1310 ; s0 s1 s3 s1
    pavgw   m3, m5 ; s0+s2 s1 s3 s1+s3
    pshufb  m3, [pb_shuf8x8c2] ; DC0 _ DC1 _
    vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V
    vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V
    vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V
    vpermq  m0, m0, q3120 ; H00224466 _ H11335577 _
    movddup m2, [r0+FENC_STRIDE*0]
    movddup m4, [r0+FENC_STRIDE*2]
    pshuflw m3, m0, q0000
    pshuflw m5, m0, q1111
    movddup m4, [r0+FENC_STRIDE*4]
    pshuflw m5, m0, q2222
    movddup m4, [r0+FENC_STRIDE*6]
    pshuflw m5, m0, q3333
    vextracti128 xm0, m2, 1
    vextracti128 xm1, m3, 1
    paddw   xm2, xm0 ; DC V
    pextrd  [r2+8], xm2, 2 ; V
    movd    [r2+0], xm2 ; DC

;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
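; As in the smaller sizes, a rough C model: SAD the 16x16 block against the
; V (top row), H (left column) and DC predictions, with
;    int dc = 16;
;    for( int i = 0; i < 16; i++ )
;        dc += fdec[i-FDEC_STRIDE] + fdec[i*FDEC_STRIDE-1];
;    dc >>= 5;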
;xmm7: DC prediction  xmm6: H prediction  xmm5: V prediction
;xmm4: DC pred score  xmm3: H pred score  xmm2: V pred score
cglobal intra_sad_x3_16x16, 3,5,8
    psadbw  mm0, [r1-FDEC_STRIDE+0]
    psadbw  mm1, [r1-FDEC_STRIDE+8]
    movzx   r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
%if (x&3)==3 && x!=15
    add     r1, FDEC_STRIDE*4
    sub     r1, FDEC_STRIDE*12
    mova    m5, [r1-FDEC_STRIDE]
    mova    m1, [r1-FDEC_STRIDE+8]
    mov     r3d, 15*FENC_STRIDE
    SPLATB_LOAD m6, r1+r3*2-1, m1
    add     r3d, -FENC_STRIDE

cglobal intra_sad_x3_16x16, 3,5,6
    psadbw  xm0, [r1-FDEC_STRIDE]
    movzx   r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
%if (x&3)==3 && x!=15
    add     r1, FDEC_STRIDE*4
    sub     r1, FDEC_STRIDE*12
    vpbroadcastb xm5, xm5
    vinserti128  m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
    pxor    m4, m4  ; DC / V accumulator
    pxor    xm3, xm3 ; H accumulator
    mov     r3d, 15*FENC_STRIDE
    vpbroadcastb xm2, [r1+r3*2-1]
    vbroadcasti128 m0, [r0+r3]
    add     r3d, -FENC_STRIDE
    punpckhqdq m5, m4, m4
    paddw   m4, m5 ; DC / V
    vextracti128 xm2, m4, 1

;=============================================================================
; SAD x3/x4 MMX
;=============================================================================
%macro SAD_X3_START_1x8P 0

%macro SAD_X3_START_2x4P 3
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]

%macro SAD_X3_2x16P 1
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add     r0, 2*FENC_STRIDE
    SAD_X3_1x8P FENC_STRIDE, r4
    add     r0, 2*FENC_STRIDE
    SAD_X3_START_2x4P mm0, mm1, mm2
    SAD_X3_START_2x4P mm4, mm5, mm6
    add     r0, 2*FENC_STRIDE

%macro SAD_X4_START_1x8P 0

%macro SAD_X4_START_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]

%macro SAD_X4_INC_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]
    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]

%macro SAD_X4_2x16P 1
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add     r0, 2*FENC_STRIDE
    SAD_X4_1x8P FENC_STRIDE, r5
    add     r0, 2*FENC_STRIDE
    add     r0, 2*FENC_STRIDE

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
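; Semantically just three (or four, for sad_x4) SADs sharing the fenc
; loads; a hedged C sketch, reusing the sad_WxH model from above:
;    for( int i = 0; i < 3; i++ )
;        scores[i] = sad_WxH( fenc, FENC_STRIDE, pix[i], i_stride );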
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2

;=============================================================================
; SAD x3/x4 XMM
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0

%macro SAD_X3_1x16P_SSE2 2
    psadbw  m4, m3, [r1+%2]
    psadbw  m5, m3, [r2+%2]

%macro SAD_X3_4x16P_SSE2 2
    SAD_X3_START_1x16P_SSE2
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
    add     r0, 8*FENC_STRIDE

%macro SAD_X3_START_2x8P_SSE2 0
    movhps  m3, [r0+FENC_STRIDE]

%macro SAD_X3_2x8P_SSE2 4

%macro SAD_X4_START_2x8P_SSE2 0
    movhps  m4, [r0+FENC_STRIDE]

%macro SAD_X4_2x8P_SSE2 4

%macro SAD_X4_START_1x16P_SSE2 0

%macro SAD_X4_1x16P_SSE2 2
    psadbw  m4, m6, [r1+%2]
    psadbw  m5, m6, [r2+%2]
    psadbw  m4, m6, [r3+%2]
    psadbw  m5, m6, [r4+%2]

%macro SAD_X4_4x16P_SSE2 2
    SAD_X4_START_1x16P_SSE2
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
    add     r0, 8*FENC_STRIDE

%macro SAD_X3_4x8P_SSE2 2
    SAD_X3_START_2x8P_SSE2
    SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
    SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
    add     r0, 8*FENC_STRIDE

%macro SAD_X4_4x8P_SSE2 2
    SAD_X4_START_2x8P_SSE2
    SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
    add     r0, 8*FENC_STRIDE

%macro SAD_X3_END_SSE2 0

%macro SAD_X4_END_SSE2 0

%macro SAD_X4_START_2x8P_SSSE3 0
    movddup m5, [r0+FENC_STRIDE]

%macro SAD_X4_2x8P_SSSE3 4

%macro SAD_X4_4x8P_SSSE3 2
    SAD_X4_START_2x8P_SSSE3
    SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
    add     r0, 8*FENC_STRIDE

%macro SAD_X4_END_SSSE3 0

%macro SAD_X3_START_2x16P_AVX2 0
    movu    m3, [r0] ; assumes FENC_STRIDE == 16
    vinserti128 m0, m0, [r1+r4], 1
    vinserti128 m1, m1, [r2+r4], 1
    vinserti128 m2, m2, [r3+r4], 1

%macro SAD_X3_2x16P_AVX2 3
    movu    m3, [r0+%1] ; assumes FENC_STRIDE == 16
    vinserti128 m4, m4, [r1+%3], 1
    vinserti128 m5, m5, [r2+%3], 1
    vinserti128 m6, m6, [r3+%3], 1

%macro SAD_X3_4x16P_AVX2 2
    SAD_X3_START_2x16P_AVX2
    SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
    SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
    add     r0, 8*FENC_STRIDE

%macro SAD_X4_START_2x16P_AVX2 0
    vbroadcasti128 m4, [r0]
    vbroadcasti128 m5, [r0+FENC_STRIDE]
    vinserti128 m0, m0, [r3], 1
    vinserti128 m1, m1, [r4], 1
    vinserti128 m2, m2, [r3+r5], 1
    vinserti128 m3, m3, [r4+r5], 1

%macro SAD_X4_2x16P_AVX2 4
    vbroadcasti128 m6, [r0+%1]
    vbroadcasti128 m7, [r0+%3]
    vinserti128 m2, m2, [r3+%2], 1
    vinserti128 m3, m3, [r4+%2], 1
    vinserti128 m4, m4, [r3+%4], 1
    vinserti128 m5, m5, [r4+%4], 1

%macro SAD_X4_4x16P_AVX2 2
    SAD_X4_START_2x16P_AVX2
    SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
    add     r0, 8*FENC_STRIDE

%macro SAD_X3_END_AVX2 0
    packssdw m0, m1 ; 0 0 1 1 0 0 1 1
    packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
    phaddd   m0, m2 ; 0 1 2 _ 0 1 2 _
    vextracti128 xm1, m0, 1
    paddd    xm0, xm1 ; 0 1 2 _

%macro SAD_X4_END_AVX2 0
    packssdw m0, m1 ; 0 0 1 1 2 2 3 3
    vextracti128 xm1, m0, 1
    phaddd   xm0, xm1 ; 0 1 2 3

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
    SAD_X%1_4x%2P_SSE2 x, %3/4

SAD_X_SSE2  3, 16, 16, 7
SAD_X_SSE2  3, 16,  8, 7
SAD_X_SSE2  3,  8, 16, 7
SAD_X_SSE2  3,  8,  8, 7
SAD_X_SSE2  3,  8,  4, 7
SAD_X_SSE2  4, 16, 16, 7
SAD_X_SSE2  4, 16,  8, 7
SAD_X_SSE2  4,  8, 16, 7
SAD_X_SSE2  4,  8,  8, 7
SAD_X_SSE2  4,  8,  4, 7

SAD_X_SSE2  3, 16, 16, 7
SAD_X_SSE2  3, 16,  8, 7
SAD_X_SSE2  4, 16, 16, 7
SAD_X_SSE2  4, 16,  8, 7

%macro SAD_X_SSSE3 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
    SAD_X%1_4x%2P_SSSE3 x, %3/4

SAD_X_SSE2  3, 16, 16, 7
SAD_X_SSE2  3, 16,  8, 7
SAD_X_SSE2  4, 16, 16, 7
SAD_X_SSE2  4, 16,  8, 7
SAD_X_SSSE3 4,  8, 16

SAD_X_SSE2  3, 16, 16, 6
SAD_X_SSE2  3, 16,  8, 6
SAD_X_SSE2  4, 16, 16, 7
SAD_X_SSE2  4, 16,  8, 7

%macro SAD_X_AVX2 4
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
    SAD_X%1_4x%2P_AVX2 x, %3/4

SAD_X_AVX2  3, 16, 16, 7
SAD_X_AVX2  3, 16,  8, 7
SAD_X_AVX2  4, 16, 16, 8
SAD_X_AVX2  4, 16,  8, 8

;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. Like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method makes it often slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
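
; Dispatch sketch for the split case: the loops for alignments 1..15 are
; laid out back to back, each padded to the fixed code size noted below, so
; selecting one is pure address arithmetic instead of a branch tree:
;    align  = offset & 15
;    target = sad_w16_addr + align * code_size
; where sad_w16_addr (defined below as align1 + (align1 - align2)) is where
; a hypothetical align-0 loop would start, so align = 1 lands on the first
; real loop.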

; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
sad_w16_align%1_sse2:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    movdqa  xmm4, [r2+r3]
    psadbw  xmm2, [r0+r1]
    jg      sad_w16_align%1_sse2

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
sad_w16_align%1_ssse3:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw  xmm2, [r0+r1]
    jg      sad_w16_align%1_ssse3

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    jle     pixel_sad_16x%2_sse2
    shl     r4d, 6 ; code size = 64
    shl     r4d, 4 ; code size = 80
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
    lea     r5, [sad_w16_addr]
    lea     r5, [sad_w16_addr + r4]

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    and     eax, 0x17|%1|(%4>>1)
    cmp     eax, 0x10|%1|(%4>>1)
    jle     pixel_sad_%1x%2_mmx2

%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    and     eax, 0x17|%2|(%3>>1)
    cmp     eax, 0x10|%2|(%3>>1)
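; Worked example of the test above, for the width-16 / cacheline-64 case:
; mask = 0x17|16|(64>>1) = 0x37 and threshold = 0x10|16|(64>>1) = 0x30.
; A 16-byte load at offset a splits a 64-byte line iff (a & 63) > 48, but
; 8-byte-aligned offsets are tolerable (see the note above), which is why
; bit 3 is absent from the mask; (a & 0x37) > 0x30 thus flags exactly the
; expensive cases.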

%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp     pixel_sad_x3_%1x%2_%4
    sub     rsp, 40 ; shadow space and alignment
    call    pixel_sad_%1x%2_cache%3_%5
    mov     r2, [rsp+40+0*8]
    call    pixel_sad_%1x%2_cache%3_%5
    mov     r2, [rsp+40+1*8]
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5

%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp     pixel_sad_x4_%1x%2_%4
    sub     rsp, 32 ; shadow space
    call    pixel_sad_%1x%2_cache%3_%5
    mov     r2, [rsp+32+0*8]
    call    pixel_sad_%1x%2_cache%3_%5
    mov     r2, [rsp+32+1*8]
    call    pixel_sad_%1x%2_cache%3_%5
    mov     r2, [rsp+32+2*8]
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5
    call    pixel_sad_%1x%2_cache%3_%5

%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1

; instantiate the aligned sads

%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64

%if ARCH_X86_64 == 0
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2

%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
    SAD16_CACHELINE_LOOP_SSE2 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmx2, sse2

SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
    SAD16_CACHELINE_LOOP_SSSE3 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3