;*****************************************************************************
;* sad-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt
;*          Laurent Aimar
;*          Alex Izvorski
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pb_3:  times 16 db 3
sw_64: dd 64

SECTION .text

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_2x16P 0
    movq mm1, [r0]
    movq mm2, [r0+8]
    movq mm3, [r0+r1]
    movq mm4, [r0+r1+8]
    psadbw mm1, [r2]
    psadbw mm2, [r2+8]
    psadbw mm3, [r2+r3]
    psadbw mm4, [r2+r3+8]
    lea r0, [r0+2*r1]
    paddw mm1, mm2
    paddw mm3, mm4
    lea r2, [r2+2*r3]
    paddw mm0, mm1
    paddw mm0, mm3
%endmacro

%macro SAD_INC_2x8P 0
    movq mm1, [r0]
    movq mm2, [r0+r1]
    psadbw mm1, [r2]
    psadbw mm2, [r2+r3]
    lea r0, [r0+2*r1]
    paddw mm0, mm1
    paddw mm0, mm2
    lea r2, [r2+2*r3]
%endmacro

%macro SAD_INC_2x4P 0
    movd mm1, [r0]
    movd mm2, [r2]
    punpckldq mm1, [r0+r1]
    punpckldq mm2, [r2+r3]
    psadbw mm1, mm2
    paddw mm0, mm1
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
    pxor mm0, mm0
%rep %2/2
    SAD_INC_2x%1P
%endrep
    movd eax, mm0
    RET
%endmacro

SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
SAD 8, 4
SAD 4, 8
SAD 4, 4
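
; For reference, a minimal C sketch of what the x264_pixel_sad_%1x%2_*
; functions above compute: the sum of absolute differences between a WxH
; block in pix1 and a WxH block in pix2, each with its own stride, matching
; the argument order of the prototype above. The helper name sad_ref_wxh is
; illustrative only and is not part of x264's API.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static int sad_ref_wxh( uint8_t *pix1, int i_stride1,
;                           uint8_t *pix2, int i_stride2, int w, int h )
;   {
;       int sum = 0;
;       for( int y = 0; y < h; y++ )
;       {
;           for( int x = 0; x < w; x++ )
;               sum += abs( pix1[x] - pix2[x] );  // PSADBW does 8 of these at once
;           pix1 += i_stride1;
;           pix2 += i_stride2;
;       }
;       return sum;
;   }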

;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_END_SSE2 0
    movhlps m1, m0
    paddw m0, m1
    movd eax, m0
    RET
%endmacro

%macro SAD_W16 1
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_%1, 4,4
    movdqu m0, [r2]
    movdqu m1, [r2+r3]
    lea r2, [r2+2*r3]
    movdqu m2, [r2]
    movdqu m3, [r2+r3]
    lea r2, [r2+2*r3]
    psadbw m0, [r0]
    psadbw m1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu m4, [r2]
    paddw m0, m1
    psadbw m2, [r0]
    psadbw m3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu m5, [r2+r3]
    lea r2, [r2+2*r3]
    paddw m2, m3
    movdqu m6, [r2]
    movdqu m7, [r2+r3]
    lea r2, [r2+2*r3]
    paddw m0, m2
    psadbw m4, [r0]
    psadbw m5, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu m1, [r2]
    paddw m4, m5
    psadbw m6, [r0]
    psadbw m7, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu m2, [r2+r3]
    lea r2, [r2+2*r3]
    paddw m6, m7
    movdqu m3, [r2]
    paddw m0, m4
    movdqu m4, [r2+r3]
    lea r2, [r2+2*r3]
    paddw m0, m6
    psadbw m1, [r0]
    psadbw m2, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu m5, [r2]
    paddw m1, m2
    psadbw m3, [r0]
    psadbw m4, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu m6, [r2+r3]
    lea r2, [r2+2*r3]
    paddw m3, m4
    movdqu m7, [r2]
    paddw m0, m1
    movdqu m1, [r2+r3]
    paddw m0, m3
    psadbw m5, [r0]
    psadbw m6, [r0+r1]
    lea r0, [r0+2*r1]
    paddw m5, m6
    psadbw m7, [r0]
    psadbw m1, [r0+r1]
    paddw m7, m1
    paddw m0, m5
    paddw m0, m7
    SAD_END_SSE2

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_%1, 4,4
    movdqu m0, [r2]
    movdqu m2, [r2+r3]
    lea r2, [r2+2*r3]
    movdqu m3, [r2]
    movdqu m4, [r2+r3]
    psadbw m0, [r0]
    psadbw m2, [r0+r1]
    lea r0, [r0+2*r1]
    psadbw m3, [r0]
    psadbw m4, [r0+r1]
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    paddw m0, m2
    paddw m3, m4
    paddw m0, m3
    movdqu m1, [r2]
    movdqu m2, [r2+r3]
    lea r2, [r2+2*r3]
    movdqu m3, [r2]
    movdqu m4, [r2+r3]
    psadbw m1, [r0]
    psadbw m2, [r0+r1]
    lea r0, [r0+2*r1]
    psadbw m3, [r0]
    psadbw m4, [r0+r1]
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    paddw m1, m2
    paddw m3, m4
    paddw m0, m1
    paddw m0, m3
    SAD_END_SSE2
%endmacro

INIT_XMM
SAD_W16 sse2
%define movdqu lddqu
SAD_W16 sse3
%define movdqu movdqa
SAD_W16 sse2_aligned
%undef movdqu

%macro SAD_INC_4x8P_SSE 1
    movq m1, [r0]
    movq m2, [r0+r1]
    lea r0, [r0+2*r1]
    movq m3, [r2]
    movq m4, [r2+r3]
    lea r2, [r2+2*r3]
    movhps m1, [r0]
    movhps m2, [r0+r1]
    movhps m3, [r2]
    movhps m4, [r2+r3]
    lea r0, [r0+2*r1]
    psadbw m1, m3
    psadbw m2, m4
    lea r2, [r2+2*r3]
%if %1
    paddw m0, m1
%else
    SWAP m0, m1
%endif
    paddw m0, m2
%endmacro

; Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal x264_pixel_sad_8x16_sse2, 4,4
    SAD_INC_4x8P_SSE 0
    SAD_INC_4x8P_SSE 1
    SAD_INC_4x8P_SSE 1
    SAD_INC_4x8P_SSE 1
    SAD_END_SSE2
    RET

;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

;xmm7: DC prediction  xmm6: H prediction  xmm5: V prediction
;xmm4: DC pred score  xmm3: H pred score  xmm2: V pred score
%macro INTRA_SAD16 1
cglobal x264_intra_sad_x3_16x16_%1,3,5
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r1-FDEC_STRIDE+0]
    psadbw mm1, [r1-FDEC_STRIDE+8]
    paddw mm0, mm1
    movd r3d, mm0
%ifidn %1, ssse3
    mova m1, [pb_3 GLOBAL]
%endif
%assign n 0
%rep 16
    movzx r4d, byte [r1-1+FDEC_STRIDE*n]
    add r3d, r4d
%assign n n+1
%endrep
    add r3d, 16
    shr r3d, 5
    imul r3d, 0x01010101
    movd m7, r3d
    mova m5, [r1-FDEC_STRIDE]
%if mmsize==16
    pshufd m7, m7, 0
%else
    mova m1, [r1-FDEC_STRIDE+8]
    punpckldq m7, m7
%endif
    pxor m4, m4
    pxor m3, m3
    pxor m2, m2
    mov r3d, 15*FENC_STRIDE
.vloop:
    SPLATB m6, r1+r3*2-1, m1
    mova m0, [r0+r3]
    psadbw m0, m7
    paddw m4, m0
    mova m0, [r0+r3]
    psadbw m0, m5
    paddw m2, m0
%if mmsize==8
    mova m0, [r0+r3]
    psadbw m0, m6
    paddw m3, m0
    mova m0, [r0+r3+8]
    psadbw m0, m7
    paddw m4, m0
    mova m0, [r0+r3+8]
    psadbw m0, m1
    paddw m2, m0
    psadbw m6, [r0+r3+8]
    paddw m3, m6
%else
    psadbw m6, [r0+r3]
    paddw m3, m6
%endif
    add r3d, -FENC_STRIDE
    jge .vloop
%if mmsize==16
    pslldq m3, 4
    por m3, m2
    movhlps m1, m3
    paddw m3, m1
    movq [r2+0], m3
    movhlps m1, m4
    paddw m4, m1
%else
    movd [r2+0], m2
    movd [r2+4], m3
%endif
    movd [r2+8], m4
    RET
%endmacro

INIT_MMX
%define SPLATB SPLATB_MMX
INTRA_SAD16 mmxext
INIT_XMM
INTRA_SAD16 sse2
%define SPLATB SPLATB_SSSE3
INTRA_SAD16 ssse3
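
; Reference C sketch of the intra_sad_x3_16x16 kernel above, assuming the
; result ordering implied by the final stores (res[0]=V, res[1]=H, res[2]=DC)
; and the usual x264 strides FENC_STRIDE=16, FDEC_STRIDE=32. The helper name
; intra_sad_x3_16x16_ref is illustrative only.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   #define FENC_STRIDE 16
;   #define FDEC_STRIDE 32
;
;   static void intra_sad_x3_16x16_ref( uint8_t *fenc, uint8_t *fdec, int res[3] )
;   {
;       // DC predictor: rounded average of the 16 top and 16 left neighbours
;       int dc = 16;
;       for( int i = 0; i < 16; i++ )
;           dc += fdec[i - FDEC_STRIDE] + fdec[i*FDEC_STRIDE - 1];
;       dc >>= 5;
;       int sad_v = 0, sad_h = 0, sad_dc = 0;
;       for( int y = 0; y < 16; y++ )
;           for( int x = 0; x < 16; x++ )
;           {
;               int e = fenc[y*FENC_STRIDE + x];
;               sad_v  += abs( e - fdec[x - FDEC_STRIDE] );   // V: top row repeated
;               sad_h  += abs( e - fdec[y*FDEC_STRIDE - 1] ); // H: left column repeated
;               sad_dc += abs( e - dc );                      // DC: flat predictor
;           }
;       res[0] = sad_v;
;       res[1] = sad_h;
;       res[2] = sad_dc;
;   }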

;=============================================================================
; SAD x3/x4 MMX
;=============================================================================

%macro SAD_X3_START_1x8P 0
    movq mm3, [r0]
    movq mm0, [r1]
    movq mm1, [r2]
    movq mm2, [r3]
    psadbw mm0, mm3
    psadbw mm1, mm3
    psadbw mm2, mm3
%endmacro

%macro SAD_X3_1x8P 2
    movq mm3, [r0+%1]
    movq mm4, [r1+%2]
    movq mm5, [r2+%2]
    movq mm6, [r3+%2]
    psadbw mm4, mm3
    psadbw mm5, mm3
    psadbw mm6, mm3
    paddw mm0, mm4
    paddw mm1, mm5
    paddw mm2, mm6
%endmacro

%macro SAD_X3_START_2x4P 3
    movd mm3, [r0]
    movd %1, [r1]
    movd %2, [r2]
    movd %3, [r3]
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]
    psadbw %1, mm3
    psadbw %2, mm3
    psadbw %3, mm3
%endmacro

%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x8P
%else
    SAD_X3_1x8P 0, 0
%endif
    SAD_X3_1x8P 8, 8
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r4]
    lea r2, [r2+2*r4]
    lea r3, [r3+2*r4]
%endmacro

%macro SAD_X3_2x8P 1
%if %1
    SAD_X3_START_1x8P
%else
    SAD_X3_1x8P 0, 0
%endif
    SAD_X3_1x8P FENC_STRIDE, r4
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r4]
    lea r2, [r2+2*r4]
    lea r3, [r3+2*r4]
%endmacro

%macro SAD_X3_2x4P 1
%if %1
    SAD_X3_START_2x4P mm0, mm1, mm2
%else
    SAD_X3_START_2x4P mm4, mm5, mm6
    paddw mm0, mm4
    paddw mm1, mm5
    paddw mm2, mm6
%endif
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r4]
    lea r2, [r2+2*r4]
    lea r3, [r3+2*r4]
%endmacro

%macro SAD_X4_START_1x8P 0
    movq mm7, [r0]
    movq mm0, [r1]
    movq mm1, [r2]
    movq mm2, [r3]
    movq mm3, [r4]
    psadbw mm0, mm7
    psadbw mm1, mm7
    psadbw mm2, mm7
    psadbw mm3, mm7
%endmacro

%macro SAD_X4_1x8P 2
    movq mm7, [r0+%1]
    movq mm4, [r1+%2]
    movq mm5, [r2+%2]
    movq mm6, [r3+%2]
    psadbw mm4, mm7
    psadbw mm5, mm7
    psadbw mm6, mm7
    psadbw mm7, [r4+%2]
    paddw mm0, mm4
    paddw mm1, mm5
    paddw mm2, mm6
    paddw mm3, mm7
%endmacro

%macro SAD_X4_START_2x4P 0
    movd mm7, [r0]
    movd mm0, [r1]
    movd mm1, [r2]
    movd mm2, [r3]
    movd mm3, [r4]
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]
    psadbw mm0, mm7
    psadbw mm1, mm7
    psadbw mm2, mm7
    psadbw mm3, mm7
%endmacro

%macro SAD_X4_INC_2x4P 0
    movd mm7, [r0]
    movd mm4, [r1]
    movd mm5, [r2]
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]
    psadbw mm4, mm7
    psadbw mm5, mm7
    paddw mm0, mm4
    paddw mm1, mm5
    movd mm4, [r3]
    movd mm5, [r4]
    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]
    psadbw mm4, mm7
    psadbw mm5, mm7
    paddw mm2, mm4
    paddw mm3, mm5
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x8P
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P 8, 8
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r5]
    lea r2, [r2+2*r5]
    lea r3, [r3+2*r5]
    lea r4, [r4+2*r5]
%endmacro

%macro SAD_X4_2x8P 1
%if %1
    SAD_X4_START_1x8P
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P FENC_STRIDE, r5
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r5]
    lea r2, [r2+2*r5]
    lea r3, [r3+2*r5]
    lea r4, [r4+2*r5]
%endmacro

%macro SAD_X4_2x4P 1
%if %1
    SAD_X4_START_2x4P
%else
    SAD_X4_INC_2x4P
%endif
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r5]
    lea r2, [r2+2*r5]
    lea r3, [r3+2*r5]
    lea r4, [r4+2*r5]
%endmacro

%macro SAD_X3_END 0
%ifdef ARCH_X86_64
    movd [r5+0], mm0
    movd [r5+4], mm1
    movd [r5+8], mm2
%else
    mov r0, r5m
    movd [r0+0], mm0
    movd [r0+4], mm1
    movd [r0+8], mm2
%endif
    RET
%endmacro

%macro SAD_X4_END 0
    mov r0, r6m
    movd [r0+0], mm0
    movd [r0+4], mm1
    movd [r0+8], mm2
    movd [r0+12], mm3
    RET
%endmacro
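
; Reference C sketch of the sad_x3/x4 interface implemented below: one
; encode-order block (fixed stride FENC_STRIDE) is compared against three
; (or four) reference candidates that share a single stride, writing one SAD
; per candidate. sad_x3_ref and sad_ref_wxh are illustrative names;
; sad_ref_wxh is the scalar helper sketched earlier in this file.
;
;   static void sad_x3_ref( int w, int h, uint8_t *fenc,
;                           uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
;                           int i_stride, int scores[3] )
;   {
;       uint8_t *pix[3] = { pix0, pix1, pix2 };
;       for( int i = 0; i < 3; i++ )
;           scores[i] = sad_ref_wxh( fenc, FENC_STRIDE, pix[i], i_stride, w, h );
;   }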

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                      uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4

;=============================================================================
; SAD x3/x4 XMM
;=============================================================================

%macro SAD_X3_START_1x16P_SSE2 0
    movdqa xmm3, [r0]
    movdqu xmm0, [r1]
    movdqu xmm1, [r2]
    movdqu xmm2, [r3]
    psadbw xmm0, xmm3
    psadbw xmm1, xmm3
    psadbw xmm2, xmm3
%endmacro

%macro SAD_X3_1x16P_SSE2 2
    movdqa xmm3, [r0+%1]
    movdqu xmm4, [r1+%2]
    movdqu xmm5, [r2+%2]
    movdqu xmm6, [r3+%2]
    psadbw xmm4, xmm3
    psadbw xmm5, xmm3
    psadbw xmm6, xmm3
    paddw xmm0, xmm4
    paddw xmm1, xmm5
    paddw xmm2, xmm6
%endmacro

%macro SAD_X3_2x16P_SSE2 1
%if %1
    SAD_X3_START_1x16P_SSE2
%else
    SAD_X3_1x16P_SSE2 0, 0
%endif
    SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r4]
    lea r2, [r2+2*r4]
    lea r3, [r3+2*r4]
%endmacro

%macro SAD_X3_START_2x8P_SSE2 0
    movq xmm7, [r0]
    movq xmm0, [r1]
    movq xmm1, [r2]
    movq xmm2, [r3]
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm0, [r1+r4]
    movhps xmm1, [r2+r4]
    movhps xmm2, [r3+r4]
    psadbw xmm0, xmm7
    psadbw xmm1, xmm7
    psadbw xmm2, xmm7
%endmacro

%macro SAD_X3_2x8P_SSE2 0
    movq xmm7, [r0]
    movq xmm3, [r1]
    movq xmm4, [r2]
    movq xmm5, [r3]
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm3, [r1+r4]
    movhps xmm4, [r2+r4]
    movhps xmm5, [r3+r4]
    psadbw xmm3, xmm7
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    paddw xmm0, xmm3
    paddw xmm1, xmm4
    paddw xmm2, xmm5
%endmacro

%macro SAD_X4_START_2x8P_SSE2 0
    movq xmm7, [r0]
    movq xmm0, [r1]
    movq xmm1, [r2]
    movq xmm2, [r3]
    movq xmm3, [r4]
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm0, [r1+r5]
    movhps xmm1, [r2+r5]
    movhps xmm2, [r3+r5]
    movhps xmm3, [r4+r5]
    psadbw xmm0, xmm7
    psadbw xmm1, xmm7
    psadbw xmm2, xmm7
    psadbw xmm3, xmm7
%endmacro

%macro SAD_X4_2x8P_SSE2 0
    movq xmm7, [r0]
    movq xmm4, [r1]
    movq xmm5, [r2]
%ifdef ARCH_X86_64
    movq xmm6, [r3]
    movq xmm8, [r4]
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm4, [r1+r5]
    movhps xmm5, [r2+r5]
    movhps xmm6, [r3+r5]
    movhps xmm8, [r4+r5]
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    psadbw xmm6, xmm7
    psadbw xmm8, xmm7
    paddw xmm0, xmm4
    paddw xmm1, xmm5
    paddw xmm2, xmm6
    paddw xmm3, xmm8
%else
    movhps xmm7, [r0+FENC_STRIDE]
    movhps xmm4, [r1+r5]
    movhps xmm5, [r2+r5]
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    paddw xmm0, xmm4
    paddw xmm1, xmm5
    movq xmm6, [r3]
    movq xmm4, [r4]
    movhps xmm6, [r3+r5]
    movhps xmm4, [r4+r5]
    psadbw xmm6, xmm7
    psadbw xmm4, xmm7
    paddw xmm2, xmm6
    paddw xmm3, xmm4
%endif
%endmacro

%macro SAD_X4_START_1x16P_SSE2 0
    movdqa xmm7, [r0]
    movdqu xmm0, [r1]
    movdqu xmm1, [r2]
    movdqu xmm2, [r3]
    movdqu xmm3, [r4]
    psadbw xmm0, xmm7
    psadbw xmm1, xmm7
    psadbw xmm2, xmm7
    psadbw xmm3, xmm7
%endmacro

%macro SAD_X4_1x16P_SSE2 2
    movdqa xmm7, [r0+%1]
    movdqu xmm4, [r1+%2]
    movdqu xmm5, [r2+%2]
    movdqu xmm6, [r3+%2]
%ifdef ARCH_X86_64
    movdqu xmm8, [r4+%2]
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    psadbw xmm6, xmm7
    psadbw xmm8, xmm7
    paddw xmm0, xmm4
    paddw xmm1, xmm5
    paddw xmm2, xmm6
    paddw xmm3, xmm8
%else
    psadbw xmm4, xmm7
    psadbw xmm5, xmm7
    paddw xmm0, xmm4
    psadbw xmm6, xmm7
    movdqu xmm4, [r4+%2]
    paddw xmm1, xmm5
    psadbw xmm4, xmm7
    paddw xmm2, xmm6
    paddw xmm3, xmm4
%endif
%endmacro

%macro SAD_X4_2x16P_SSE2 1
%if %1
    SAD_X4_START_1x16P_SSE2
%else
    SAD_X4_1x16P_SSE2 0, 0
%endif
    SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r5]
    lea r2, [r2+2*r5]
    lea r3, [r3+2*r5]
    lea r4, [r4+2*r5]
%endmacro

%macro SAD_X3_2x8P_SSE2 1
%if %1
    SAD_X3_START_2x8P_SSE2
%else
    SAD_X3_2x8P_SSE2
%endif
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r4]
    lea r2, [r2+2*r4]
    lea r3, [r3+2*r4]
%endmacro

%macro SAD_X4_2x8P_SSE2 1
%if %1
    SAD_X4_START_2x8P_SSE2
%else
    SAD_X4_2x8P_SSE2
%endif
    add r0, 2*FENC_STRIDE
    lea r1, [r1+2*r5]
    lea r2, [r2+2*r5]
    lea r3, [r3+2*r5]
    lea r4, [r4+2*r5]
%endmacro

%macro SAD_X3_END_SSE2 0
    movhlps xmm4, xmm0
    movhlps xmm5, xmm1
    movhlps xmm6, xmm2
    paddw xmm0, xmm4
    paddw xmm1, xmm5
    paddw xmm2, xmm6
%ifdef ARCH_X86_64
    movd [r5+0], xmm0
    movd [r5+4], xmm1
    movd [r5+8], xmm2
%else
    mov r0, r5m
    movd [r0+0], xmm0
    movd [r0+4], xmm1
    movd [r0+8], xmm2
%endif
    RET
%endmacro

%macro SAD_X4_END_SSE2 0
    mov r0, r6m
    psllq xmm1, 32
    psllq xmm3, 32
    paddw xmm0, xmm1
    paddw xmm2, xmm3
    movhlps xmm1, xmm0
    movhlps xmm3, xmm2
    paddw xmm0, xmm1
    paddw xmm2, xmm3
    movq [r0+0], xmm0
    movq [r0+8], xmm2
    RET
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                    uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
    SAD_X%1_2x%2P_SSE2 1
%rep %3/2-1
    SAD_X%1_2x%2P_SSE2 0
%endrep
    SAD_X%1_END_SSE2
%endmacro

SAD_X_SSE2 3, 16, 16, sse2
SAD_X_SSE2 3, 16, 8, sse2
SAD_X_SSE2 3, 8, 16, sse2
SAD_X_SSE2 3, 8, 8, sse2
SAD_X_SSE2 3, 8, 4, sse2
SAD_X_SSE2 4, 16, 16, sse2
SAD_X_SSE2 4, 16, 8, sse2
SAD_X_SSE2 4, 8, 16, sse2
SAD_X_SSE2 4, 8, 8, sse2
SAD_X_SSE2 4, 8, 4, sse2

%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3
SAD_X_SSE2 3, 16, 8, sse3
SAD_X_SSE2 4, 16, 16, sse3
SAD_X_SSE2 4, 16, 8, sse3
%undef movdqu

;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus
; have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. Like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method makes it often slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs, amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs, amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
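
; A C sketch of the dispatch test used by the cacheline-split wrappers below:
; a width-wide unaligned load starting at pix stays inside one cacheline iff
; its offset within that line is at most cacheline-width. The asm folds this
; into a single and/cmp on masked low address bits, with the extra twist that
; a 16-byte load split exactly on an 8-byte boundary (the half-way case noted
; above) is also treated as non-split, hence bit 3 is masked out there.
; needs_split_path is an illustrative name only.
;
;   #include <stdint.h>
;
;   static int needs_split_path( uint8_t *pix, int width, int cacheline )
;   {
;       int offset = (int)((uintptr_t)pix & (cacheline - 1));
;       return offset > cacheline - width;  // load would cross into the next line
;   }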

; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
ALIGN 16
sad_w16_align%1_sse2:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    movdqa xmm3, [r2]
    movdqa xmm4, [r2+r3]
    pslldq xmm1, 16-%1
    pslldq xmm2, 16-%1
    psrldq xmm3, %1
    psrldq xmm4, %1
    por xmm1, xmm3
    por xmm2, xmm4
    psadbw xmm1, [r0]
    psadbw xmm2, [r0+r1]
    paddw xmm0, xmm1
    paddw xmm0, xmm2
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    dec r4
    jg sad_w16_align%1_sse2
    ret
%endmacro

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
ALIGN 16
sad_w16_align%1_ssse3:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw xmm1, [r0]
    psadbw xmm2, [r0+r1]
    paddw xmm0, xmm1
    paddw xmm0, xmm2
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    dec r4
    jg sad_w16_align%1_ssse3
    ret
%endmacro

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
    mov eax, r2m
    and eax, 0x37
    cmp eax, 0x30
    jle x264_pixel_sad_16x%2_sse2
    PROLOGUE 4,6
    mov r4d, r2d
    and r4d, 15
%ifidn %1, ssse3
    shl r4d, 6  ; code size = 64
%else
    lea r4, [r4*5]
    shl r4d, 4  ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
    lea r5, [sad_w16_addr GLOBAL]
    add r5, r4
%else
    lea r5, [sad_w16_addr + r4 GLOBAL]
%endif
    and r2, ~15
    mov r4d, %2/2
    pxor xmm0, xmm0
    call r5
    movhlps xmm1, xmm0
    paddw xmm0, xmm1
    movd eax, xmm0
    RET
%endmacro

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    mov eax, r2m
    and eax, 0x17|%1|(%4>>1)
    cmp eax, 0x10|%1|(%4>>1)
    jle x264_pixel_sad_%1x%2_mmxext
    and eax, 7
    shl eax, 3
    movd mm6, [sw_64 GLOBAL]
    movd mm7, eax
    psubw mm6, mm7
    PROLOGUE 4,5
    and r2, ~7
    mov r4d, %3
    pxor mm0, mm0
%endmacro

%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
    movq mm1, [r2]
    movq mm2, [r2+8]
    movq mm3, [r2+16]
    movq mm4, mm2
    psrlq mm1, mm7
    psllq mm2, mm6
    psllq mm3, mm6
    psrlq mm4, mm7
    por mm1, mm2
    por mm3, mm4
    psadbw mm1, [r0]
    psadbw mm3, [r0+8]
    paddw mm0, mm1
    paddw mm0, mm3
    add r2, r3
    add r0, r1
    dec r4
    jg .loop
    movd eax, mm0
    RET
%endmacro

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
    movq mm1, [r2+8]
    movq mm2, [r2+r3+8]
    movq mm3, [r2]
    movq mm4, [r2+r3]
    psllq mm1, mm6
    psllq mm2, mm6
    psrlq mm3, mm7
    psrlq mm4, mm7
    por mm1, mm3
    por mm2, mm4
    psadbw mm1, [r0]
    psadbw mm2, [r0+r1]
    paddw mm0, mm1
    paddw mm0, mm2
    lea r2, [r2+2*r3]
    lea r0, [r0+2*r1]
    dec r4
    jg .loop
    movd eax, mm0
    RET
%endmacro

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
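
; A C sketch of the strategy described above (all names illustrative, reusing
; the helpers sketched earlier in this file): if any candidate pointer would
; cross a cacheline, fall back to one single-block SAD call per candidate
; instead of the fused x3 routine. In the asm the per-candidate fallback is
; the _cacheNN_ single SAD, which realigns; the plain scalar reference stands
; in for it here.
;
;   static void sad_x3_cache_ref( int w, int h, int cacheline, uint8_t *fenc,
;                                 uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
;                                 int i_stride, int scores[3] )
;   {
;       uint8_t *pix[3] = { pix0, pix1, pix2 };
;       int split = 0;
;       for( int i = 0; i < 3; i++ )
;           split |= needs_split_path( pix[i], w, cacheline );
;       if( !split )
;           sad_x3_ref( w, h, fenc, pix0, pix1, pix2, i_stride, scores );
;       else
;           for( int i = 0; i < 3; i++ )
;               scores[i] = sad_ref_wxh( fenc, FENC_STRIDE, pix[i], i_stride, w, h );
;   }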

%macro CHECK_SPLIT 3 ; pix, width, cacheline
    mov eax, %1
    and eax, 0x17|%2|(%3>>1)
    cmp eax, 0x10|%2|(%3>>1)
    jg .split
%endmacro

%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp x264_pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
    push r3
    push r2
    mov r2, r1
    mov r1, FENC_STRIDE
    mov r3, r4
    mov r10, r0
    mov r11, r5
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11], eax
    pop r2
    mov r0, r10
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11+4], eax
    pop r2
    mov r0, r10
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11+8], eax
%else
    push edi
    mov edi, [esp+28]
    push dword [esp+24]
    push dword [esp+16]
    push dword 16
    push dword [esp+20]
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+32]
    mov [edi], eax
    mov [esp+8], ecx
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+36]
    mov [edi+4], eax
    mov [esp+8], ecx
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [edi+8], eax
    add esp, 16
    pop edi
%endif
    ret
%endmacro

%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp x264_pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
    mov r11, r6m
    push r4
    push r3
    push r2
    mov r2, r1
    mov r1, FENC_STRIDE
    mov r3, r5
    mov r10, r0
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11], eax
    pop r2
    mov r0, r10
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11+4], eax
    pop r2
    mov r0, r10
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11+8], eax
    pop r2
    mov r0, r10
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [r11+12], eax
%else
    push edi
    mov edi, [esp+32]
    push dword [esp+28]
    push dword [esp+16]
    push dword 16
    push dword [esp+20]
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+32]
    mov [edi], eax
    mov [esp+8], ecx
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+36]
    mov [edi+4], eax
    mov [esp+8], ecx
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+40]
    mov [edi+8], eax
    mov [esp+8], ecx
    call x264_pixel_sad_%1x%2_cache%3_%5
    mov [edi+12], eax
    add esp, 16
    pop edi
%endif
    ret
%endmacro

%macro SADX34_CACHELINE_FUNC 5
    SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
    SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
%endmacro

; instantiate the aligned sads

%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC_MMX2 8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2 4, 32
SAD8_CACHELINE_FUNC_MMX2 8, 32
SAD8_CACHELINE_FUNC_MMX2 16, 32
SAD16_CACHELINE_FUNC_MMX2 8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2 4, 64
SAD8_CACHELINE_FUNC_MMX2 8, 64
SAD8_CACHELINE_FUNC_MMX2 16, 64
%ifndef ARCH_X86_64
SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
%endif ; !ARCH_X86_64
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3