1 ;*****************************************************************************
2 ;* sad-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
34 ;=============================================================================
36 ;=============================================================================
; Advance two rows of a 16-wide SAD accumulation per invocation.
; NOTE(review): the macro body is elided in this chunk; the two loads below
; (original lines 69-70) may belong to a later, narrower SAD_INC macro —
; they pack a 4-byte row from each pixel pointer (r0+r1, r2+r3) into the
; low dword of an MMX register for a subsequent psadbw. TODO confirm
; against the full file.
38 %macro SAD_INC_2x16P 0
69 punpckldq mm1, [r0+r1]
70 punpckldq mm2, [r2+r3]
77 ;-----------------------------------------------------------------------------
78 ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
79 ;-----------------------------------------------------------------------------
; Width/height-parameterized MMXEXT SAD: %1 = block width, %2 = block height.
; 4 args (pix1, stride1, pix2, stride2) in r0-r3; 4 GPRs used, none saved.
; Body elided in this chunk.
81 cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
100 ;=============================================================================
; SSE2 SAD functions (macro bodies largely elided in this chunk).
102 ;=============================================================================
; Reduce the partial psadbw accumulators into eax for the integer return —
; body not visible here; TODO confirm.
104 %macro SAD_END_SSE2 0
112 ;-----------------------------------------------------------------------------
113 ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
114 ;-----------------------------------------------------------------------------
; %1 = cpu-name suffix used to instantiate sse2/sse3 variants of the same body.
115 cglobal x264_pixel_sad_16x16_%1, 4,4
179 ;-----------------------------------------------------------------------------
180 ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
181 ;-----------------------------------------------------------------------------
182 cglobal x264_pixel_sad_16x8_%1, 4,4
; Instantiation-time remaps of the unaligned-load mnemonic (the surrounding
; %if/%else context is elided): one variant substitutes an aligned load ...
220 %define movdqu movdqa
; ... and another substitutes movups — presumably the aligned vs. generic
; builds of the macros above; TODO confirm against the full file.
222 %define movdqu movups
226 ;-----------------------------------------------------------------------------
227 ; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
228 ;-----------------------------------------------------------------------------
; Scores the encoded block against the three 16x16 intra predictions (V/H/DC)
; in one pass, writing the three SADs to res[]. Register roles per the
; original comments below. Body heavily elided in this chunk.
230 ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
231 ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
; 3 args (r0=fenc, r1=fdec, r2=res), 5 GPRs used.
233 cglobal x264_intra_sad_x3_16x16_%1,3,5
; Sum the 16 top-neighbor pixels (row above fdec) for the DC prediction.
236 psadbw mm0, [r1-FDEC_STRIDE+0]
237 psadbw mm1, [r1-FDEC_STRIDE+8]
241 mova m1, [pb_3 GLOBAL]
; Accumulate the left-neighbor column byte-by-byte; 'n' is presumably an
; assemble-time %rep/%assign counter elided from this view — TODO confirm.
245 movzx r4d, byte [r1-1+FDEC_STRIDE*n]
; Load the top row as the V (vertical) prediction.
253 mova m5, [r1-FDEC_STRIDE]
257 mova m1, [r1-FDEC_STRIDE+8]
; Walk fenc bottom-up: r3 starts at the last row offset and is decremented
; by FENC_STRIDE per iteration (see 'add r3d, -FENC_STRIDE' below).
263 mov r3d, 15*FENC_STRIDE
; Broadcast the row's left-neighbor byte as the H (horizontal) prediction.
265 SPLATB m6, r1+r3*2-1, m1
288 add r3d, -FENC_STRIDE
; Instantiation-time choice of byte-splat helper (surrounding %if context
; elided): MMX shuffle sequence vs. SSSE3 pshufb.
307 %define SPLATB SPLATB_MMX
311 %define SPLATB SPLATB_SSSE3
316 ;=============================================================================
; SAD x3: one fenc block scored against 3 candidate references in parallel.
; fenc is in r0 (fixed FENC_STRIDE); refs in r1-r3 share stride r4.
; Macro bodies largely elided in this chunk.
318 ;=============================================================================
320 %macro SAD_X3_START_1x8P 0
; Start a 4-wide x3 SAD: %1-%3 are the three MMX accumulators. Each
; punpckldq pairs two 4-byte rows into one register before psadbw.
343 %macro SAD_X3_START_2x4P 3
348 punpckldq mm3, [r0+FENC_STRIDE]
349 punpckldq %1, [r1+r4]
350 punpckldq %2, [r2+r4]
351 punpckldq %3, [r3+r4]
; Two rows of a 16-wide x3 SAD: each row is processed as two 8-byte halves.
357 %macro SAD_X3_2x16P 1
364 SAD_X3_1x8P FENC_STRIDE, r4
365 SAD_X3_1x8P FENC_STRIDE+8, r4+8
; fenc advances by 2 rows per macro invocation (refs advance via r4 — the
; matching 'lea' lines are elided here; TODO confirm).
366 add r0, 2*FENC_STRIDE
378 SAD_X3_1x8P FENC_STRIDE, r4
379 add r0, 2*FENC_STRIDE
387 SAD_X3_START_2x4P mm0, mm1, mm2
389 SAD_X3_START_2x4P mm4, mm5, mm6
394 add r0, 2*FENC_STRIDE
; SAD x4: same pattern as the x3 macros but with 4 candidate references —
; refs in r1-r4 share stride r5; mm7 holds the packed fenc rows.
; Macro bodies largely elided in this chunk.
400 %macro SAD_X4_START_1x8P 0
427 %macro SAD_X4_START_2x4P 0
; Pack two 4-byte fenc rows, then two rows from each of the 4 refs.
433 punpckldq mm7, [r0+FENC_STRIDE]
434 punpckldq mm0, [r1+r5]
435 punpckldq mm1, [r2+r5]
436 punpckldq mm2, [r3+r5]
437 punpckldq mm3, [r4+r5]
; Steady-state 4-wide iteration: mm4/mm5 are reused as scratch for two refs
; at a time before being folded into the accumulators (fold elided here).
444 %macro SAD_X4_INC_2x4P 0
448 punpckldq mm7, [r0+FENC_STRIDE]
449 punpckldq mm4, [r1+r5]
450 punpckldq mm5, [r2+r5]
457 punpckldq mm4, [r3+r5]
458 punpckldq mm5, [r4+r5]
; Two rows of a 16-wide x4 SAD, processed as two 8-byte halves per row.
465 %macro SAD_X4_2x16P 1
472 SAD_X4_1x8P FENC_STRIDE, r5
473 SAD_X4_1x8P FENC_STRIDE+8, r5+8
474 add r0, 2*FENC_STRIDE
487 SAD_X4_1x8P FENC_STRIDE, r5
488 add r0, 2*FENC_STRIDE
501 add r0, 2*FENC_STRIDE
531 ;-----------------------------------------------------------------------------
532 ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
533 ; uint8_t *pix2, int i_stride, int scores[3] )
534 ;-----------------------------------------------------------------------------
; Parameterized entry: %1 = number of refs (3 or 4), %2x%3 = block size.
; Arg/register count is %1+2 (fenc + %1 refs + stride; scores ptr handled in
; the elided body — TODO confirm).
536 cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
561 ;=============================================================================
; SSE2 versions of the multi-reference SADs; 16 pixels per row in one xmm.
; Macro bodies largely elided in this chunk.
563 ;=============================================================================
565 %macro SAD_X3_START_1x16P_SSE2 0
; %1/%2 are the fenc and ref row offsets (see invocations below).
575 %macro SAD_X3_1x16P_SSE2 2
; One unrolled 2-row step of the x3 loop; %1 presumably selects the
; first-iteration (START) form — TODO confirm.
588 %macro SAD_X3_2x16P_SSE2 1
590 SAD_X3_START_1x16P_SSE2
592 SAD_X3_1x16P_SSE2 0, 0
594 SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
595 add r0, 2*FENC_STRIDE
601 %macro SAD_X4_START_1x16P_SSE2 0
613 %macro SAD_X4_1x16P_SSE2 2
641 %macro SAD_X4_2x16P_SSE2 1
643 SAD_X4_START_1x16P_SSE2
645 SAD_X4_1x16P_SSE2 0, 0
647 SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
648 add r0, 2*FENC_STRIDE
; Reduce the per-ref accumulators and store the 3 (resp. 4) scores.
655 %macro SAD_X3_END_SSE2 0
675 %macro SAD_X4_END_SSE2 0
690 ;-----------------------------------------------------------------------------
691 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
692 ; uint8_t *pix2, int i_stride, int scores[3] )
693 ;-----------------------------------------------------------------------------
; %1 = ref count, %2x%3 = block size, %4 = cpu suffix (sse2/sse3 below).
695 cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
; Instantiate x3/x4 for 16x16 and 16x8 under both cpu suffixes.
703 SAD_X_SSE2 3, 16, 16, sse2
704 SAD_X_SSE2 3, 16, 8, sse2
705 SAD_X_SSE2 4, 16, 16, sse2
706 SAD_X_SSE2 4, 16, 8, sse2
709 SAD_X_SSE2 3, 16, 16, sse3
710 SAD_X_SSE2 3, 16, 8, sse3
711 SAD_X_SSE2 4, 16, 16, sse3
712 SAD_X_SSE2 4, 16, 8, sse3
717 ;=============================================================================
718 ; SAD cacheline split
719 ;=============================================================================
721 ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
722 ; unless the unaligned data spans the border between 2 cachelines, in which
723 ; case it's really slow. The exact numbers may differ, but all Intel cpus
724 ; have a large penalty for cacheline splits.
725 ; (8-byte alignment exactly half way between two cachelines is ok though.)
726 ; LDDQU was supposed to fix this, but it only works on Pentium 4.
727 ; So in the split case we load aligned data and explicitly perform the
728 ; alignment between registers. Like on archs that have only aligned loads,
729 ; except complicated by the fact that PALIGNR takes only an immediate, not
730 ; a variable alignment.
731 ; It is also possible to hoist the realignment to the macroblock level (keep
732 ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
733 ; needed for that method makes it often slower.
735 ; sad 16x16 costs on Core2:
736 ; good offsets: 49 cycles (50/64 of all mvs)
737 ; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
738 ; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
739 ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
741 ; computed jump assumes this loop is exactly 80 bytes
; Emit one aligned-load SAD loop specialized for a fixed misalignment %1;
; the caller jumps into the right copy via address arithmetic (see the
; 'code size' shifts below), so the loop size must stay exact.
742 %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
744 sad_w16_align%1_sse2:
; Load the aligned 16-byte window one cacheline past the ref pointer;
; the realignment/combine instructions are elided in this chunk.
746 movdqa xmm2, [r2+r3+16]
762 jg sad_w16_align%1_sse2
766 ; computed jump assumes this loop is exactly 64 bytes
; SSSE3 variant: palignr does the register-level realignment directly,
; making each specialized loop shorter (64 vs. 80 bytes).
767 %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
769 sad_w16_align%1_ssse3:
771 movdqa xmm2, [r2+r3+16]
772 palignr xmm1, [r2], %1
773 palignr xmm2, [r2+r3], %1
781 jg sad_w16_align%1_ssse3
; Dispatcher: %1 = cpu suffix, %2 = height. Declared 0,0 because it
; tail-jumps with the caller's register layout intact.
785 %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
786 cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
; If the ref pointer doesn't straddle a cacheline, fall through to the
; plain SSE2 SAD (no split penalty to avoid).
790 jle x264_pixel_sad_16x%2_sse2
; r4d = misalignment; scale by the per-alignment loop size to index the
; emitted loop copies (64 B for ssse3, 64+16 = 80 B for sse2).
795 shl r4d, 6 ; code size = 64
798 shl r4d, 4 ; code size = 80
; Back-compute the (virtual) align0 entry from the spacing of align1/align2.
800 %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
802 lea r5, [sad_w16_addr GLOBAL]
805 lea r5, [sad_w16_addr + r4 GLOBAL]
; Shared prologue for the MMX2 cacheline-split SADs: test whether the ref
; pointer's low bits put a %1-wide read across a %4-byte cacheline; if not,
; bail out to the ordinary SAD.
817 %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
; Bit-trick split test — 0x17|width|(cacheline>>1) masks exactly the bits
; that decide whether [ptr, ptr+width) crosses a line; NOTE(review): the
; elided lines between 'and' and 'cmp' presumably load eax from the ref
; pointer — TODO confirm.
819 and eax, 0x17|%1|(%4>>1)
820 cmp eax, 0x10|%1|(%4>>1)
821 jle x264_pixel_sad_%1x%2_mmxext
; mm6 = 64, presumably the shift count used to stitch the two aligned
; halves together (sw_64 is a constant defined elsewhere).
824 movd mm6, [sw_64 GLOBAL]
; 16-wide split-safe SAD: %1 = height, %2 = cacheline size.
833 %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
834 cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
835 SAD_CACHELINE_START_MMX2 16, %1, %1, %2
; 8-wide variant processes 2 rows per iteration (hence %1/2 iterations).
859 %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
860 cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
861 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
885 ; sad_x3/x4_cache64: check each mv.
886 ; if they're all within a cacheline, use normal sad_x3/x4.
887 ; otherwise, send them individually to sad_cache64.
; Same split test as SAD_CACHELINE_START_MMX2, applied to one candidate
; pointer (%1, a stack arg); the branch on the result is elided here.
888 %macro CHECK_SPLIT 3 ; pix, width, cacheline
890 and eax, 0x17|%2|(%3>>1)
891 cmp eax, 0x10|%2|(%3>>1)
; x3 wrapper: if no candidate splits a cacheline, tail-jump to the normal
; batched x3 SAD (%4); otherwise fall into the slow path that calls the
; single-block cacheline-safe SAD (%5) once per candidate.
895 %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
896 cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
897 CHECK_SPLIT r1m, %1, %3
898 CHECK_SPLIT r2m, %1, %3
899 CHECK_SPLIT r3m, %1, %3
900 jmp x264_pixel_sad_x3_%1x%2_%4
; Slow path: three calls per arch branch (the arg marshalling and the
; 32-/64-bit %if split are elided in this chunk).
910 call x264_pixel_sad_%1x%2_cache%3_%5
914 call x264_pixel_sad_%1x%2_cache%3_%5
918 call x264_pixel_sad_%1x%2_cache%3_%5
927 call x264_pixel_sad_%1x%2_cache%3_%5
931 call x264_pixel_sad_%1x%2_cache%3_%5
935 call x264_pixel_sad_%1x%2_cache%3_%5
; x4 wrapper: identical structure with a fourth candidate.
943 %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
944 cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
945 CHECK_SPLIT r1m, %1, %3
946 CHECK_SPLIT r2m, %1, %3
947 CHECK_SPLIT r3m, %1, %3
948 CHECK_SPLIT r4m, %1, %3
949 jmp x264_pixel_sad_x4_%1x%2_%4
960 call x264_pixel_sad_%1x%2_cache%3_%5
964 call x264_pixel_sad_%1x%2_cache%3_%5
968 call x264_pixel_sad_%1x%2_cache%3_%5
972 call x264_pixel_sad_%1x%2_cache%3_%5
981 call x264_pixel_sad_%1x%2_cache%3_%5
985 call x264_pixel_sad_%1x%2_cache%3_%5
989 call x264_pixel_sad_%1x%2_cache%3_%5
993 call x264_pixel_sad_%1x%2_cache%3_%5
; Convenience: instantiate both the x3 and x4 wrappers in one shot.
1001 %macro SADX34_CACHELINE_FUNC 5
1002 SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
1003 SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
1007 ; instantiate the aligned sads
; Cacheline-split instantiations. The %endif lines below close %ifndef
; ARCH_X86_64 blocks whose opening %ifndef is elided from this chunk:
; the 32-byte-cacheline (Athlon-era) and most MMX2 variants are 32-bit-only.
1010 SAD16_CACHELINE_FUNC_MMX2 8, 32
1011 SAD16_CACHELINE_FUNC_MMX2 16, 32
1012 SAD8_CACHELINE_FUNC_MMX2 4, 32
1013 SAD8_CACHELINE_FUNC_MMX2 8, 32
1014 SAD8_CACHELINE_FUNC_MMX2 16, 32
1015 SAD16_CACHELINE_FUNC_MMX2 8, 64
1016 SAD16_CACHELINE_FUNC_MMX2 16, 64
1017 %endif ; !ARCH_X86_64
; 8-wide 64-byte-cacheline MMX2 SADs are built on both arches.
1018 SAD8_CACHELINE_FUNC_MMX2 4, 64
1019 SAD8_CACHELINE_FUNC_MMX2 8, 64
1020 SAD8_CACHELINE_FUNC_MMX2 16, 64
; x3/x4 wrappers: args are width, height, cacheline, normal_ver, split_ver.
1023 SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
1024 SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
1025 SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
1026 SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
1027 SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
1028 SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
1029 %endif ; !ARCH_X86_64
1030 SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
1031 SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
; SSE2 16-wide cacheline dispatchers plus one specialized loop per possible
; misalignment (the %rep driving 'i' is elided here — TODO confirm range).
1034 SAD16_CACHELINE_FUNC sse2, 8
1035 SAD16_CACHELINE_FUNC sse2, 16
1038 SAD16_CACHELINE_LOOP_SSE2 i
1041 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
1042 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
1043 %endif ; !ARCH_X86_64
; SSSE3 variants use palignr-based loops; normal_ver stays sse2 since the
; non-split path gains nothing from ssse3 here.
1045 SAD16_CACHELINE_FUNC ssse3, 8
1046 SAD16_CACHELINE_FUNC ssse3, 16
1049 SAD16_CACHELINE_LOOP_SSSE3 i
1052 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
1053 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3