;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_2x16P 0

    punpckldq mm1, [r0+r1]
    punpckldq mm2, [r2+r3]

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
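
; As a reference for all of the pixel_sad_WxH variants in this file, this is
; the scalar model they implement (a sketch, not x264's actual C fallback):
;
;     int sad( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 )
;     {
;         int sum = 0;
;         for( int y = 0; y < H; y++, pix1 += i_pix1, pix2 += i_pix2 )
;             for( int x = 0; x < W; x++ )
;                 sum += abs( pix1[x] - pix2[x] );
;         return sum;
;     }
;
; One PSADBW covers eight pixels of the inner loop, accumulating the absolute
; differences in a single instruction.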
cglobal pixel_sad_%1x%2_mmx2, 4,4

;=============================================================================
; SAD SSE2
;=============================================================================

%macro SAD_END_SSE2 0

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8

;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4

INIT_XMM sse2, aligned

%macro SAD_INC_4x8P_SSE 1
    ACCUM paddw, 0, 1, %1

;Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4

;-----------------------------------------------------------------------------
; int pixel_vsad( pixel *src, intptr_t stride, int height )
;-----------------------------------------------------------------------------
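
; vsad sums the absolute differences between each row and the row above it,
; as a cheap vertical-activity measure. A scalar sketch (the width is fixed
; at 16, per the max-sum comment below; 'height' is the third argument):
;
;     int vsad( pixel *src, intptr_t stride, int height )
;     {
;         int score = 0;
;         for( int y = 1; y < height; y++ )
;             for( int x = 0; x < 16; x++ )
;                 score += abs( src[y*stride+x] - src[(y-1)*stride+x] );
;         return score;
;     }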
cglobal pixel_vsad_mmx2, 3,3

;max sum: 31*16*255 (pixel_max) = 126480, which doesn't fit in 16 bits

cglobal pixel_vsad_sse2, 3,3

;max sum: 31*16*255 (pixel_max) = 126480, which doesn't fit in 16 bits

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
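
; Scores the 4x4 fenc block against three intra predictions in one pass:
; res[0] = V (top row replicated down), res[1] = H (left column replicated
; across), res[2] = DC ((sum of the 4 top + 4 left neighbors + 4) >> 3,
; filling the whole block).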
cglobal intra_sad_x3_4x4_mmx2, 3,3
    movd      mm0, [r1-FDEC_STRIDE]
    movd      mm1, [r0+FENC_STRIDE*0]
    movd      mm2, [r0+FENC_STRIDE*2]
    punpckldq mm1, [r0+FENC_STRIDE*1]
    punpckldq mm2, [r0+FENC_STRIDE*3]
    movd      [r2], mm0 ;V prediction cost
    movd      mm3, [r1+FDEC_STRIDE*0-4]
    movd      mm0, [r1+FDEC_STRIDE*1-4]
    movd      mm4, [r1+FDEC_STRIDE*2-4]
    movd      mm5, [r1+FDEC_STRIDE*3-4]
    pshufw    mm5, mm5, 0 ;DC prediction
    movd      [r2+8], mm5 ;DC prediction cost
    movd      [r2+4], mm1 ;H prediction cost

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3] );
;-----------------------------------------------------------------------------
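
; r1 points into the caller's packed edge[] array of filtered neighbor pixels
; (built by predict_8x8_filter); judging from the loads below, edge[16..23]
; holds the row above the block, used for the V prediction. Treat the rest of
; the layout as an assumption here.
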
%macro INTRA_SAD_HVDC_ITER 2
    movq  m5, [r0+FENC_STRIDE*%1]
    ACCUM paddw, 1, 4, %1
    ACCUM paddw, 2, 4, %1
    ACCUM paddw, 3, 5, %1

cglobal intra_sad_x3_8x8_mmx2, 3,3
    movq   m6, [r1+16] ;V prediction
    pshufw m0, m0, q0000 ;DC prediction
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
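
; Chroma 8x8 DC prediction is computed per 4x4 quadrant: with s0/s1 the sums
; of the left/right halves of the top row and s2/s3 the sums of the top/bottom
; halves of the left column, the four quadrant DCs are derived from
; avg(s0,s2), s1, s3 and avg(s1,s3) respectively (see the pavgw shuffle below).
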
%macro INTRA_SAD_HV_ITER 1
    movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
    movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
    movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
    movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
    movq m4, [r0 + FENC_STRIDE*(%1+0)]
    movq m5, [r0 + FENC_STRIDE*(%1+1)]
    ACCUM paddw, 0, 1, %1
    ACCUM paddw, 2, 4, %1

%macro INTRA_SAD_8x8C 0
cglobal intra_sad_x3_8x8c, 3,3
    movq      m6, [r1 - FDEC_STRIDE]
    add       r1, FDEC_STRIDE*4
    movq      m2, [r1 + FDEC_STRIDE*-4 - 8]
    movq      m4, [r1 + FDEC_STRIDE*-2 - 8]
    movq      m3, [r1 + FDEC_STRIDE* 0 - 8]
    movq      m5, [r1 + FDEC_STRIDE* 2 - 8]
    punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
    punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
    punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
    punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
    punpckldq m0, m2 ;s0 s1 s2 s3
    pshufw    m3, m0, q3312 ;s2,s1,s3,s3
    pshufw    m0, m0, q1310 ;s0,s1,s3,s1
    pavgw     m0, m7 ; s0+s2, s1, s3, s1+s3
    pshufb    xmm0, [pb_shuf8x8c]
    movq      xmm1, [r0+FENC_STRIDE*0]
    movq      xmm2, [r0+FENC_STRIDE*1]
    movq      xmm3, [r0+FENC_STRIDE*2]
    movq      xmm4, [r0+FENC_STRIDE*3]
    movhps    xmm1, [r0+FENC_STRIDE*4]
    movhps    xmm2, [r0+FENC_STRIDE*5]
    movhps    xmm3, [r0+FENC_STRIDE*6]
    movhps    xmm4, [r0+FENC_STRIDE*7]
    punpcklbw m0, m0 ; 4x dc0 4x dc1
    punpckhbw m1, m1 ; 4x dc2 4x dc3
    movq      m2, [r0+FENC_STRIDE*0]
    movq      m3, [r0+FENC_STRIDE*1]
    movq      m4, [r0+FENC_STRIDE*2]
    movq      m5, [r0+FENC_STRIDE*3]
    movq      m6, [r0+FENC_STRIDE*4]
    movq      m7, [r0+FENC_STRIDE*5]
    movq      m0, [r0+FENC_STRIDE*6]
    psadbw    m1, [r0+FENC_STRIDE*7]

;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

;xmm7: DC prediction    xmm6: H prediction    xmm5: V prediction
;xmm4: DC pred score    xmm3: H pred score    xmm2: V pred score
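
; DC for 16x16 is (sum of 16 top neighbors + sum of 16 left neighbors + 16) >> 5.
; The top row is summed 8 bytes at a time with psadbw against a zeroed register;
; the left column isn't contiguous in memory, so it's summed with scalar movzx
; loads instead.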
cglobal intra_sad_x3_16x16, 3,5,8
    psadbw mm0, [r1-FDEC_STRIDE+0]
    psadbw mm1, [r1-FDEC_STRIDE+8]
    movzx  r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
%if (x&3)==3 && x!=15
    add    r1, FDEC_STRIDE*4
    sub    r1, FDEC_STRIDE*12
    mova   m5, [r1-FDEC_STRIDE]
    mova   m1, [r1-FDEC_STRIDE+8]
    mov    r3d, 15*FENC_STRIDE
    SPLATB_LOAD m6, r1+r3*2-1, m1
    add    r3d, -FENC_STRIDE

;=============================================================================
; SAD x3/x4 MMX
;=============================================================================

%macro SAD_X3_START_1x8P 0

%macro SAD_X3_START_2x4P 3
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]

%macro SAD_X3_2x16P 1
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add r0, 2*FENC_STRIDE

    SAD_X3_1x8P FENC_STRIDE, r4
    add r0, 2*FENC_STRIDE

    SAD_X3_START_2x4P mm0, mm1, mm2
    SAD_X3_START_2x4P mm4, mm5, mm6
    add r0, 2*FENC_STRIDE

%macro SAD_X4_START_1x8P 0

%macro SAD_X4_START_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]

%macro SAD_X4_INC_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]
    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]

%macro SAD_X4_2x16P 1
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add r0, 2*FENC_STRIDE

    SAD_X4_1x8P FENC_STRIDE, r5
    add r0, 2*FENC_STRIDE

    add r0, 2*FENC_STRIDE

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
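
; The x3/x4 kernels score one fenc block against 3 or 4 candidate references
; that share a single stride, so each fenc load is amortized over all the
; candidates. In scalar terms (a sketch, reusing the sad() model above):
;
;     for( int i = 0; i < 3; i++ )
;         scores[i] = sad( fenc, FENC_STRIDE, pix[i], i_stride );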
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2

;=============================================================================
; SAD x3/x4 SSE2
;=============================================================================

%macro SAD_X3_START_1x16P_SSE2 0
%if cpuflag(misalign)

%macro SAD_X3_1x16P_SSE2 2
%if cpuflag(misalign)

%macro SAD_X3_2x16P_SSE2 1
    SAD_X3_START_1x16P_SSE2
    SAD_X3_1x16P_SSE2 0, 0
    SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
    add r0, 2*FENC_STRIDE

%macro SAD_X3_START_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]

%macro SAD_X3_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]

%macro SAD_X4_START_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm2, [r3+r5]
    movhps xmm3, [r4+r5]

%macro SAD_X4_2x8P_SSE2 0
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm4, [r1+r5]
    movhps xmm5, [r2+r5]
    movhps xmm6, [r3+r5]
    movhps xmm8, [r4+r5]

    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm4, [r1+r5]
    movhps xmm5, [r2+r5]
    movhps xmm6, [r3+r5]
    movhps xmm4, [r4+r5]

%macro SAD_X4_START_1x16P_SSE2 0
%if cpuflag(misalign)

%macro SAD_X4_1x16P_SSE2 2
%if cpuflag(misalign)
    psadbw xmm7, [r4+%2]

%macro SAD_X4_2x16P_SSE2 1
    SAD_X4_START_1x16P_SSE2
    SAD_X4_1x16P_SSE2 0, 0
    SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
    add r0, 2*FENC_STRIDE

%macro SAD_X3_2x8P_SSE2 1
    SAD_X3_START_2x8P_SSE2
    add r0, 2*FENC_STRIDE

%macro SAD_X4_2x8P_SSE2 1
    SAD_X4_START_2x8P_SSE2
    add r0, 2*FENC_STRIDE

%macro SAD_X3_END_SSE2 0

%macro SAD_X4_END_SSE2 0

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
    SAD_X%1_2x%2P_SSE2 1
    SAD_X%1_2x%2P_SSE2 0

SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 4, 16, 16

INIT_XMM sse2, misalign
SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 4, 16, 16

SAD_X_SSE2 3, 16, 16
SAD_X_SSE2 4, 16, 16

;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. This is the same approach as on archs that
; have only aligned loads, complicated by the fact that PALIGNR takes only
; an immediate shift count, not a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method often makes it slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
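
; Dispatch sketch: one copy of the SAD loop is emitted per possible
; misalignment, each padded to a fixed size (64 or 80 bytes here), so the
; entry point for a given alignment is just base + align*size, reached with
; an indirect jump. That stands in for PALIGNR's immediate-only shift count.
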
; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
sad_w16_align%1_sse2:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    movdqa xmm4, [r2+r3]
    psadbw xmm2, [r0+r1]
    jg sad_w16_align%1_sse2

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
sad_w16_align%1_ssse3:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw  xmm2, [r0+r1]
    jg sad_w16_align%1_ssse3

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    jle pixel_sad_16x%2_sse2
    shl r4d, 6 ; code size = 64
    shl r4d, 4 ; code size = 80
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
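; There is no align0 loop (the fully aligned case branched off above), so its
; virtual base address is extrapolated from the evenly spaced entry points:
; align1 + (align1 - align2), i.e. align1 minus one loop size.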
    lea r5, [sad_w16_addr]
    lea r5, [sad_w16_addr + r4]

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    and eax, 0x17|%1|(%4>>1)
    cmp eax, 0x10|%1|(%4>>1)
    jle pixel_sad_%1x%2_mmx2
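; The masked compare above inspects the reference pointer's offset within its
; cacheline: when that offset guarantees none of the block's 8-byte loads can
; straddle a line boundary, the plain mmx2 version is taken instead.
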
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    and eax, 0x17|%2|(%3>>1)
    cmp eax, 0x10|%2|(%3>>1)

%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp pixel_sad_x3_%1x%2_%4
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5

%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp pixel_sad_x4_%1x%2_%4
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5

%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1

; instantiate the aligned sads

%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64

%if ARCH_X86_64 == 0
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2

%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
SAD16_CACHELINE_LOOP_SSE2 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmx2, sse2

SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
SAD16_CACHELINE_LOOP_SSSE3 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3