1 ;*****************************************************************************
2 ;* sad-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
34 ;=============================================================================
36 ;=============================================================================
38 %macro SAD_INC_2x16P 0
69 punpckldq mm1, [r0+r1]
70 punpckldq mm2, [r2+r3]
77 ;-----------------------------------------------------------------------------
78 ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
79 ;-----------------------------------------------------------------------------
81 cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
100 ;=============================================================================
102 ;=============================================================================
104 %macro SAD_END_SSE2 0
112 ;-----------------------------------------------------------------------------
113 ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
114 ;-----------------------------------------------------------------------------
115 cglobal x264_pixel_sad_16x16_%1, 4,4
179 ;-----------------------------------------------------------------------------
180 ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
181 ;-----------------------------------------------------------------------------
182 cglobal x264_pixel_sad_16x8_%1, 4,4
; Strength-reduce unaligned loads: after this define, every movdqu below
; assembles as movdqa (aligned load), for callers that guarantee alignment.
; NOTE(review): the %if/%macro guard that scopes this define is outside
; this excerpt -- confirm where it is %undef'd before relying on it.
221 %define movdqu movdqa
225 %macro SAD_INC_4x8P_SSE 1
248 ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
249 cglobal x264_pixel_sad_8x16_sse2, 4,4
257 ;-----------------------------------------------------------------------------
258 ; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
259 ;-----------------------------------------------------------------------------
261 ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
262 ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
264 cglobal x264_intra_sad_x3_16x16_%1,3,5
267 psadbw mm0, [r1-FDEC_STRIDE+0]
268 psadbw mm1, [r1-FDEC_STRIDE+8]
272 mova m1, [pb_3 GLOBAL]
276 movzx r4d, byte [r1-1+FDEC_STRIDE*n]
284 mova m5, [r1-FDEC_STRIDE]
288 mova m1, [r1-FDEC_STRIDE+8]
294 mov r3d, 15*FENC_STRIDE
296 SPLATB m6, r1+r3*2-1, m1
319 add r3d, -FENC_STRIDE
; Select the byte-splat helper used by the intra-SAD code (SPLATB m6, ... above).
; NOTE(review): these two defines presumably live in separate cpu-specific
; branches (MMX vs SSSE3) whose %if structure is elided from this excerpt --
; confirm against the full file before editing.
338 %define SPLATB SPLATB_MMX
342 %define SPLATB SPLATB_SSSE3
347 ;=============================================================================
349 ;=============================================================================
351 %macro SAD_X3_START_1x8P 0
374 %macro SAD_X3_START_2x4P 3
379 punpckldq mm3, [r0+FENC_STRIDE]
380 punpckldq %1, [r1+r4]
381 punpckldq %2, [r2+r4]
382 punpckldq %3, [r3+r4]
388 %macro SAD_X3_2x16P 1
395 SAD_X3_1x8P FENC_STRIDE, r4
396 SAD_X3_1x8P FENC_STRIDE+8, r4+8
397 add r0, 2*FENC_STRIDE
409 SAD_X3_1x8P FENC_STRIDE, r4
410 add r0, 2*FENC_STRIDE
418 SAD_X3_START_2x4P mm0, mm1, mm2
420 SAD_X3_START_2x4P mm4, mm5, mm6
425 add r0, 2*FENC_STRIDE
431 %macro SAD_X4_START_1x8P 0
458 %macro SAD_X4_START_2x4P 0
464 punpckldq mm7, [r0+FENC_STRIDE]
465 punpckldq mm0, [r1+r5]
466 punpckldq mm1, [r2+r5]
467 punpckldq mm2, [r3+r5]
468 punpckldq mm3, [r4+r5]
475 %macro SAD_X4_INC_2x4P 0
479 punpckldq mm7, [r0+FENC_STRIDE]
480 punpckldq mm4, [r1+r5]
481 punpckldq mm5, [r2+r5]
488 punpckldq mm4, [r3+r5]
489 punpckldq mm5, [r4+r5]
496 %macro SAD_X4_2x16P 1
503 SAD_X4_1x8P FENC_STRIDE, r5
504 SAD_X4_1x8P FENC_STRIDE+8, r5+8
505 add r0, 2*FENC_STRIDE
518 SAD_X4_1x8P FENC_STRIDE, r5
519 add r0, 2*FENC_STRIDE
532 add r0, 2*FENC_STRIDE
562 ;-----------------------------------------------------------------------------
563 ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
564 ; uint8_t *pix2, int i_stride, int scores[3] )
565 ;-----------------------------------------------------------------------------
567 cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
592 ;=============================================================================
594 ;=============================================================================
596 %macro SAD_X3_START_1x16P_SSE2 0
606 %macro SAD_X3_1x16P_SSE2 2
619 %macro SAD_X3_2x16P_SSE2 1
621 SAD_X3_START_1x16P_SSE2
623 SAD_X3_1x16P_SSE2 0, 0
625 SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
626 add r0, 2*FENC_STRIDE
632 %macro SAD_X3_START_2x8P_SSE2 0
637 movhps xmm7, [r0+FENC_STRIDE]
646 %macro SAD_X3_2x8P_SSE2 0
651 movhps xmm7, [r0+FENC_STRIDE]
663 %macro SAD_X4_START_2x8P_SSE2 0
669 movhps xmm7, [r0+FENC_STRIDE]
680 %macro SAD_X4_2x8P_SSE2 0
687 movhps xmm7, [r0+FENC_STRIDE]
701 movhps xmm7, [r0+FENC_STRIDE]
719 %macro SAD_X4_START_1x16P_SSE2 0
731 %macro SAD_X4_1x16P_SSE2 2
759 %macro SAD_X4_2x16P_SSE2 1
761 SAD_X4_START_1x16P_SSE2
763 SAD_X4_1x16P_SSE2 0, 0
765 SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
766 add r0, 2*FENC_STRIDE
773 %macro SAD_X3_2x8P_SSE2 1
775 SAD_X3_START_2x8P_SSE2
779 add r0, 2*FENC_STRIDE
785 %macro SAD_X4_2x8P_SSE2 1
787 SAD_X4_START_2x8P_SSE2
791 add r0, 2*FENC_STRIDE
798 %macro SAD_X3_END_SSE2 0
818 %macro SAD_X4_END_SSE2 0
833 ;-----------------------------------------------------------------------------
834 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
835 ; uint8_t *pix2, int i_stride, int scores[3] )
836 ;-----------------------------------------------------------------------------
838 cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
; Instantiate the multi-reference SAD functions.
; SAD_X_SSE2 arguments: (number of reference blocks, width, height, cpu suffix)
; -> emits x264_pixel_sad_x{3,4}_{W}x{H}_{cpu} (see the cglobal template above).
846 SAD_X_SSE2 3, 16, 16, sse2
847 SAD_X_SSE2 3, 16, 8, sse2
848 SAD_X_SSE2 3, 8, 16, sse2
849 SAD_X_SSE2 3, 8, 8, sse2
850 SAD_X_SSE2 3, 8, 4, sse2
851 SAD_X_SSE2 4, 16, 16, sse2
852 SAD_X_SSE2 4, 16, 8, sse2
853 SAD_X_SSE2 4, 8, 16, sse2
854 SAD_X_SSE2 4, 8, 8, sse2
; sse3 variants of the 16-wide functions only.
; NOTE(review): presumably these use lddqu for unaligned loads (cf. the
; cacheline-split commentary below) -- confirm in the elided macro body.
855 SAD_X_SSE2 4, 8, 4, sse2
858 SAD_X_SSE2 3, 16, 16, sse3
859 SAD_X_SSE2 3, 16, 8, sse3
860 SAD_X_SSE2 4, 16, 16, sse3
861 SAD_X_SSE2 4, 16, 8, sse3
866 ;=============================================================================
867 ; SAD cacheline split
868 ;=============================================================================
870 ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
871 ; unless the unaligned data spans the border between 2 cachelines, in which
872 ; case it's really slow. The exact numbers may differ, but all Intel cpus
873 ; have a large penalty for cacheline splits.
874 ; (8-byte alignment exactly half way between two cachelines is ok though.)
875 ; LDDQU was supposed to fix this, but it only works on Pentium 4.
876 ; So in the split case we load aligned data and explicitly perform the
877 ; alignment between registers. Like on archs that have only aligned loads,
878 ; except complicated by the fact that PALIGNR takes only an immediate, not
879 ; a variable alignment.
880 ; It is also possible to hoist the realignment to the macroblock level (keep
881 ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
882 ; needed for that method makes it often slower.
884 ; sad 16x16 costs on Core2:
885 ; good offsets: 49 cycles (50/64 of all mvs)
886 ; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
887 ; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
888 ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
890 ; computed jump assumes this loop is exactly 80 bytes
891 %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
893 sad_w16_align%1_sse2:
895 movdqa xmm2, [r2+r3+16]
911 jg sad_w16_align%1_sse2
915 ; computed jump assumes this loop is exactly 64 bytes
916 %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
918 sad_w16_align%1_ssse3:
920 movdqa xmm2, [r2+r3+16]
921 palignr xmm1, [r2], %1
922 palignr xmm2, [r2+r3], %1
930 jg sad_w16_align%1_ssse3
934 %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
935 cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
939 jle x264_pixel_sad_16x%2_sse2
944 shl r4d, 6 ; code size = 64
947 shl r4d, 4 ; code size = 80
949 %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
951 lea r5, [sad_w16_addr GLOBAL]
954 lea r5, [sad_w16_addr + r4 GLOBAL]
966 %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
968 and eax, 0x17|%1|(%4>>1)
969 cmp eax, 0x10|%1|(%4>>1)
970 jle x264_pixel_sad_%1x%2_mmxext
973 movd mm6, [sw_64 GLOBAL]
982 %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
983 cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
984 SAD_CACHELINE_START_MMX2 16, %1, %1, %2
1008 %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
1009 cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
1010 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
1034 ; sad_x3/x4_cache64: check each mv.
1035 ; if they're all within a cacheline, use normal sad_x3/x4.
1036 ; otherwise, send them individually to sad_cache64.
1037 %macro CHECK_SPLIT 3 ; pix, width, cacheline
1039 and eax, 0x17|%2|(%3>>1)
1040 cmp eax, 0x10|%2|(%3>>1)
1044 %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
1045 cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
1046 CHECK_SPLIT r1m, %1, %3
1047 CHECK_SPLIT r2m, %1, %3
1048 CHECK_SPLIT r3m, %1, %3
1049 jmp x264_pixel_sad_x3_%1x%2_%4
1059 call x264_pixel_sad_%1x%2_cache%3_%5
1063 call x264_pixel_sad_%1x%2_cache%3_%5
1067 call x264_pixel_sad_%1x%2_cache%3_%5
1076 call x264_pixel_sad_%1x%2_cache%3_%5
1080 call x264_pixel_sad_%1x%2_cache%3_%5
1084 call x264_pixel_sad_%1x%2_cache%3_%5
1092 %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
1093 cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
1094 CHECK_SPLIT r1m, %1, %3
1095 CHECK_SPLIT r2m, %1, %3
1096 CHECK_SPLIT r3m, %1, %3
1097 CHECK_SPLIT r4m, %1, %3
1098 jmp x264_pixel_sad_x4_%1x%2_%4
1109 call x264_pixel_sad_%1x%2_cache%3_%5
1113 call x264_pixel_sad_%1x%2_cache%3_%5
1117 call x264_pixel_sad_%1x%2_cache%3_%5
1121 call x264_pixel_sad_%1x%2_cache%3_%5
1130 call x264_pixel_sad_%1x%2_cache%3_%5
1134 call x264_pixel_sad_%1x%2_cache%3_%5
1138 call x264_pixel_sad_%1x%2_cache%3_%5
1142 call x264_pixel_sad_%1x%2_cache%3_%5
1150 %macro SADX34_CACHELINE_FUNC 5
1151 SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
1152 SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
1156 ; instantiate the aligned sads
; SAD{16,8}_CACHELINE_FUNC_MMX2 arguments: (height, cacheline size in bytes).
; These emit x264_pixel_sad_{16,8}x{H}_cache{32,64}_mmxext, which fall back to
; the plain mmxext sad when the access does not actually split a cacheline.
1159 SAD16_CACHELINE_FUNC_MMX2 8, 32
1160 SAD16_CACHELINE_FUNC_MMX2 16, 32
1161 SAD8_CACHELINE_FUNC_MMX2 4, 32
1162 SAD8_CACHELINE_FUNC_MMX2 8, 32
1163 SAD8_CACHELINE_FUNC_MMX2 16, 32
1164 SAD16_CACHELINE_FUNC_MMX2 8, 64
1165 SAD16_CACHELINE_FUNC_MMX2 16, 64
; NOTE(review): the %if matching this %endif (presumably %ifndef ARCH_X86_64)
; is outside this excerpt -- the functions above are 32-bit-only.
1166 %endif ; !ARCH_X86_64
1167 SAD8_CACHELINE_FUNC_MMX2 4, 64
1168 SAD8_CACHELINE_FUNC_MMX2 8, 64
1169 SAD8_CACHELINE_FUNC_MMX2 16, 64
; SADX34_CACHELINE_FUNC arguments:
; (width, height, cacheline, normal version suffix, split version suffix).
; Emits both the sad_x3 and sad_x4 cacheline-checking wrappers.
1172 SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
1173 SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
1174 SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
1175 SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
1176 SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
1177 SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
; NOTE(review): second 32-bit-only guard closes here; matching %if is elided.
1178 %endif ; !ARCH_X86_64
1179 SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
1180 SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
; SSE2 cacheline-split sad dispatchers: SAD16_CACHELINE_FUNC (cpu, height).
1183 SAD16_CACHELINE_FUNC sse2, 8
1184 SAD16_CACHELINE_FUNC sse2, 16
; Emit one copy of the aligned-load sad loop per possible misalignment.
; NOTE(review): `i` is presumably bound by an %assign/%rep loop elided from
; this excerpt (one instantiation per alignment 1..15) -- confirm in full file.
1187 SAD16_CACHELINE_LOOP_SSE2 i
1190 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
1191 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
; NOTE(review): matching %if for this %endif is outside this excerpt.
1192 %endif ; !ARCH_X86_64
; SSSE3 variants use palignr for the between-register realignment
; (see SAD16_CACHELINE_LOOP_SSSE3 above), normal path stays sse2.
1194 SAD16_CACHELINE_FUNC ssse3, 8
1195 SAD16_CACHELINE_FUNC ssse3, 16
1198 SAD16_CACHELINE_LOOP_SSSE3 i
1201 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
1202 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3