;*****************************************************************************
-;* sad-a.asm: h264 encoder library
+;* sad-a.asm: x86 sad functions
;*****************************************************************************
-;* Copyright (C) 2003-2008 x264 project
+;* Copyright (C) 2003-2015 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;* Laurent Aimar <fenrir@via.ecp.fr>
;* Fiona Glaser <fiona@x264.com>
+;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-pb_3: times 16 db 3
-sw_64: dd 64
+SECTION_RODATA 32
+
+pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
+hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
SECTION .text
+cextern pb_3
+cextern pb_shuf8x8c
+cextern pw_8
+cextern sw_64
+
;=============================================================================
; SAD MMX
;=============================================================================
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
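; Reference sketch (C, not part of the build): each pixel_sad_WxH returns the
; plain sum of absolute differences over a WxH block, roughly
;   int sad( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < H; y++, pix1 += i_pix1, pix2 += i_pix2 )
;           for( int x = 0; x < W; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }
; with W and H standing for the block size baked into each variant.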
%macro SAD 2
-cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
+cglobal pixel_sad_%1x%2_mmx2, 4,4
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
SAD 8, 16
SAD 8, 8
SAD 8, 4
+SAD 4, 16
SAD 4, 8
SAD 4, 4
RET
%endmacro
-%macro SAD_W16 1
+%macro SAD_W16 0
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1, 4,4,8
- movdqu m0, [r2]
- movdqu m1, [r2+r3]
+cglobal pixel_sad_16x16, 4,4,8
+ movu m0, [r2]
+ movu m1, [r2+r3]
lea r2, [r2+2*r3]
- movdqu m2, [r2]
- movdqu m3, [r2+r3]
+ movu m2, [r2]
+ movu m3, [r2+r3]
lea r2, [r2+2*r3]
psadbw m0, [r0]
psadbw m1, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m4, [r2]
+ movu m4, [r2]
paddw m0, m1
psadbw m2, [r0]
psadbw m3, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m5, [r2+r3]
+ movu m5, [r2+r3]
lea r2, [r2+2*r3]
paddw m2, m3
- movdqu m6, [r2]
- movdqu m7, [r2+r3]
+ movu m6, [r2]
+ movu m7, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m2
psadbw m4, [r0]
psadbw m5, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m1, [r2]
+ movu m1, [r2]
paddw m4, m5
psadbw m6, [r0]
psadbw m7, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m2, [r2+r3]
+ movu m2, [r2+r3]
lea r2, [r2+2*r3]
paddw m6, m7
- movdqu m3, [r2]
+ movu m3, [r2]
paddw m0, m4
- movdqu m4, [r2+r3]
+ movu m4, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m6
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m5, [r2]
+ movu m5, [r2]
paddw m1, m2
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
- movdqu m6, [r2+r3]
+ movu m6, [r2+r3]
lea r2, [r2+2*r3]
paddw m3, m4
- movdqu m7, [r2]
+ movu m7, [r2]
paddw m0, m1
- movdqu m1, [r2+r3]
+ movu m1, [r2+r3]
paddw m0, m3
psadbw m5, [r0]
psadbw m6, [r0+r1]
SAD_END_SSE2
;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_%1, 4,4
- movdqu m0, [r2]
- movdqu m2, [r2+r3]
+cglobal pixel_sad_16x8, 4,4
+ movu m0, [r2]
+ movu m2, [r2+r3]
lea r2, [r2+2*r3]
- movdqu m3, [r2]
- movdqu m4, [r2+r3]
+ movu m3, [r2]
+ movu m4, [r2+r3]
psadbw m0, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
paddw m0, m2
paddw m3, m4
paddw m0, m3
- movdqu m1, [r2]
- movdqu m2, [r2+r3]
+ movu m1, [r2]
+ movu m2, [r2+r3]
lea r2, [r2+2*r3]
- movdqu m3, [r2]
- movdqu m4, [r2+r3]
+ movu m3, [r2]
+ movu m4, [r2+r3]
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
SAD_END_SSE2
%endmacro
-INIT_XMM
-SAD_W16 sse2
-%define movdqu lddqu
-SAD_W16 sse3
-%define movdqu movdqa
-SAD_W16 sse2_aligned
-%undef movdqu
+INIT_XMM sse2
+SAD_W16
+INIT_XMM sse3
+SAD_W16
+INIT_XMM sse2, aligned
+SAD_W16
%macro SAD_INC_4x8P_SSE 1
movq m1, [r0]
psadbw m1, m3
psadbw m2, m4
lea r2, [r2+2*r3]
-%if %1
- paddw m0, m1
-%else
- SWAP m0, m1
-%endif
+ ACCUM paddw, 0, 1, %1
paddw m0, m2
%endmacro
+INIT_XMM
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
-cglobal x264_pixel_sad_8x16_sse2, 4,4
+cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 0
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
RET
;-----------------------------------------------------------------------------
-; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; int pixel_vsad( pixel *src, intptr_t stride, int height );
+;-----------------------------------------------------------------------------
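+; Reference sketch (C, not part of the build): vsad accumulates the SAD
+; between each 16-pixel row and the row directly below it, roughly
+;   int vsad( pixel *src, intptr_t stride, int height )
+;   {
+;       int score = 0;
+;       for( int y = 1; y < height; y++, src += stride )
+;           for( int x = 0; x < 16; x++ )
+;               score += abs( src[x] - src[x+stride] );
+;       return score;
+;   }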
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+cglobal pixel_vsad_mmx2, 3,3
+ mova m0, [r0]
+ mova m1, [r0+8]
+ mova m2, [r0+r1]
+ mova m3, [r0+r1+8]
+ lea r0, [r0+r1*2]
+ psadbw m0, m2
+ psadbw m1, m3
+ paddw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m4, [r0]
+ mova m5, [r0+8]
+ mova m6, [r0+r1]
+ mova m7, [r0+r1+8]
+ lea r0, [r0+r1*2]
+ psadbw m2, m4
+ psadbw m3, m5
+ psadbw m4, m6
+ psadbw m5, m7
+ ;max sum: 31*16*255(pixel_max)=126480
+ paddd m0, m2
+ paddd m0, m3
+ paddd m0, m4
+ paddd m0, m5
+ mova m2, m6
+ mova m3, m7
+ sub r2d, 2
+ jg .loop
+.end:
+ movd eax, m0
+ RET
+%endif
+
+INIT_XMM
+cglobal pixel_vsad_sse2, 3,3
+ mova m0, [r0]
+ mova m1, [r0+r1]
+ lea r0, [r0+r1*2]
+ psadbw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m2, [r0]
+ mova m3, [r0+r1]
+ lea r0, [r0+r1*2]
+ psadbw m1, m2
+ psadbw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+ sub r2d, 2
+ jg .loop
+.end:
+ movhlps m1, m0
+ ;max sum: 31*16*255(pixel_max)=126480
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
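+; Reference sketch (C, not part of the build): res[0], res[1] and res[2]
+; receive the SAD of fenc against the vertical, horizontal and DC predictions
+; built from the decoded neighbours in fdec; the DC level is the rounded mean
+; of the 4 top and 4 left neighbours, roughly
+;   int dc = 4;
+;   for( int i = 0; i < 4; i++ )
+;       dc += fdec[i - FDEC_STRIDE] + fdec[i*FDEC_STRIDE - 1];
+;   dc >>= 3;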
+
+cglobal intra_sad_x3_4x4_mmx2, 3,3
+ pxor mm7, mm7
+ movd mm0, [r1-FDEC_STRIDE]
+ movd mm1, [r0+FENC_STRIDE*0]
+ movd mm2, [r0+FENC_STRIDE*2]
+ punpckldq mm0, mm0
+ punpckldq mm1, [r0+FENC_STRIDE*1]
+ punpckldq mm2, [r0+FENC_STRIDE*3]
+ movq mm6, mm0
+ movq mm3, mm1
+ psadbw mm3, mm0
+ psadbw mm0, mm2
+ paddw mm0, mm3
+ movd [r2], mm0 ;V prediction cost
+ movd mm3, [r1+FDEC_STRIDE*0-4]
+ movd mm0, [r1+FDEC_STRIDE*1-4]
+ movd mm4, [r1+FDEC_STRIDE*2-4]
+ movd mm5, [r1+FDEC_STRIDE*3-4]
+ punpcklbw mm3, mm0
+ punpcklbw mm4, mm5
+ movq mm5, mm3
+ punpckhwd mm5, mm4
+ punpckhdq mm5, mm6
+ psadbw mm5, mm7
+ punpckhbw mm3, mm3
+ punpckhbw mm4, mm4
+ punpckhwd mm3, mm3
+ punpckhwd mm4, mm4
+ psraw mm5, 2
+ pavgw mm5, mm7
+ punpcklbw mm5, mm5
+ pshufw mm5, mm5, 0 ;DC prediction
+ movq mm6, mm5
+ psadbw mm5, mm1
+ psadbw mm6, mm2
+ psadbw mm1, mm3
+ psadbw mm2, mm4
+ paddw mm5, mm6
+ paddw mm1, mm2
+ movd [r2+8], mm5 ;DC prediction cost
+ movd [r2+4], mm1 ;H prediction cost
+ RET
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3] );
+;-----------------------------------------------------------------------------
+
+;m0 = DC
+;m6 = V
+;m7 = H
+;m1 = DC score
+;m2 = V score
+;m3 = H score
+;m5 = pixel row
+;m4 = temp
+
+%macro INTRA_SAD_HVDC_ITER 2
+ movq m5, [r0+FENC_STRIDE*%1]
+ movq m4, m5
+ psadbw m4, m0
+ ACCUM paddw, 1, 4, %1
+ movq m4, m5
+ psadbw m4, m6
+ ACCUM paddw, 2, 4, %1
+ pshufw m4, m7, %2
+ psadbw m5, m4
+ ACCUM paddw, 3, 5, %1
+%endmacro
+
+INIT_MMX
+cglobal intra_sad_x3_8x8_mmx2, 3,3
+ movq m7, [r1+7]
+ pxor m0, m0
+ movq m6, [r1+16] ;V prediction
+ pxor m1, m1
+ psadbw m0, m7
+ psadbw m1, m6
+ paddw m0, m1
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ punpcklbw m0, m0
+ pshufw m0, m0, q0000 ;DC prediction
+ punpckhbw m7, m7
+ INTRA_SAD_HVDC_ITER 0, q3333
+ INTRA_SAD_HVDC_ITER 1, q2222
+ INTRA_SAD_HVDC_ITER 2, q1111
+ INTRA_SAD_HVDC_ITER 3, q0000
+ movq m7, [r1+7]
+ punpcklbw m7, m7
+ INTRA_SAD_HVDC_ITER 4, q3333
+ INTRA_SAD_HVDC_ITER 5, q2222
+ INTRA_SAD_HVDC_ITER 6, q1111
+ INTRA_SAD_HVDC_ITER 7, q0000
+ movd [r2+0], m2
+ movd [r2+4], m3
+ movd [r2+8], m1
+ RET
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+%macro INTRA_SAD_HV_ITER 1
+%if cpuflag(ssse3)
+ movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
+ movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
+ pshufb m1, m7
+ pshufb m3, m7
+%else
+ movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
+ movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
+ punpckhbw m1, m1
+ punpckhbw m3, m3
+ pshufw m1, m1, q3333
+ pshufw m3, m3, q3333
+%endif
+ movq m4, [r0 + FENC_STRIDE*(%1+0)]
+ movq m5, [r0 + FENC_STRIDE*(%1+1)]
+ psadbw m1, m4
+ psadbw m3, m5
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m1, m3
+ paddw m4, m5
+ ACCUM paddw, 0, 1, %1
+ ACCUM paddw, 2, 4, %1
+%endmacro
+
+%macro INTRA_SAD_8x8C 0
+cglobal intra_sad_x3_8x8c, 3,3
+ movq m6, [r1 - FDEC_STRIDE]
+ add r1, FDEC_STRIDE*4
+%if cpuflag(ssse3)
+ movq m7, [pb_3]
+%endif
+ INTRA_SAD_HV_ITER 0
+ INTRA_SAD_HV_ITER 2
+ INTRA_SAD_HV_ITER 4
+ INTRA_SAD_HV_ITER 6
+ movd [r2+4], m0
+ movd [r2+8], m2
+ pxor m7, m7
+ movq m2, [r1 + FDEC_STRIDE*-4 - 8]
+ movq m4, [r1 + FDEC_STRIDE*-2 - 8]
+ movq m3, [r1 + FDEC_STRIDE* 0 - 8]
+ movq m5, [r1 + FDEC_STRIDE* 2 - 8]
+ punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
+ punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
+ punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
+ punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
+ punpckhbw m2, m4
+ punpckhbw m3, m5
+ psrlq m2, 32
+ psrlq m3, 32
+ psadbw m2, m7 ; s2
+ psadbw m3, m7 ; s3
+ movq m1, m6
+ SWAP 0, 6
+ punpckldq m0, m7
+ punpckhdq m1, m7
+ psadbw m0, m7 ; s0
+ psadbw m1, m7 ; s1
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpckldq m0, m2 ;s0 s1 s2 s3
+ pshufw m3, m0, q3312 ;s2,s1,s3,s3
+ pshufw m0, m0, q1310 ;s0,s1,s3,s1
+ paddw m0, m3
+ psrlw m0, 2
+ pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
+%if cpuflag(ssse3)
+ movq2dq xmm0, m0
+ pshufb xmm0, [pb_shuf8x8c]
+ movq xmm1, [r0+FENC_STRIDE*0]
+ movq xmm2, [r0+FENC_STRIDE*1]
+ movq xmm3, [r0+FENC_STRIDE*2]
+ movq xmm4, [r0+FENC_STRIDE*3]
+ movhps xmm1, [r0+FENC_STRIDE*4]
+ movhps xmm2, [r0+FENC_STRIDE*5]
+ movhps xmm3, [r0+FENC_STRIDE*6]
+ movhps xmm4, [r0+FENC_STRIDE*7]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movd [r2], xmm1
+%else
+ packuswb m0, m0
+ punpcklbw m0, m0
+ movq m1, m0
+ punpcklbw m0, m0 ; 4x dc0 4x dc1
+ punpckhbw m1, m1 ; 4x dc2 4x dc3
+ movq m2, [r0+FENC_STRIDE*0]
+ movq m3, [r0+FENC_STRIDE*1]
+ movq m4, [r0+FENC_STRIDE*2]
+ movq m5, [r0+FENC_STRIDE*3]
+ movq m6, [r0+FENC_STRIDE*4]
+ movq m7, [r0+FENC_STRIDE*5]
+ psadbw m2, m0
+ psadbw m3, m0
+ psadbw m4, m0
+ psadbw m5, m0
+ movq m0, [r0+FENC_STRIDE*6]
+ psadbw m6, m1
+ psadbw m7, m1
+ psadbw m0, m1
+ psadbw m1, [r0+FENC_STRIDE*7]
+ paddw m2, m3
+ paddw m4, m5
+ paddw m6, m7
+ paddw m0, m1
+ paddw m2, m4
+ paddw m6, m0
+ paddw m2, m6
+ movd [r2], m2
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmx2
+INTRA_SAD_8x8C
+INIT_MMX ssse3
+INTRA_SAD_8x8C
+
+INIT_YMM avx2
+cglobal intra_sad_x3_8x8c, 3,3,7
+ vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred
+ add r1, FDEC_STRIDE*4-1
+ pxor xm5, xm5
+ punpckldq xm3, xm2, xm5 ; V0 _ V1 _
+ movd xm0, [r1 + FDEC_STRIDE*-1 - 3]
+ movd xm1, [r1 + FDEC_STRIDE* 3 - 3]
+ pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0
+ pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0
+ pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1
+ pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1
+ pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2
+ pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2
+ punpcklqdq xm0, xm1 ; H0 _ H1 _
+ vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1
+ pshufb xm0, [hpred_shuf] ; H00224466 H11335577
+ psadbw m3, m5 ; s0 s1 s2 s3
+ vpermq m4, m3, q3312 ; s2 s1 s3 s3
+ vpermq m3, m3, q1310 ; s0 s1 s3 s1
+ paddw m3, m4
+ psrlw m3, 2
+ pavgw m3, m5 ; s0+s2 s1 s3 s1+s3
+ pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _
+ vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V
+ vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V
+ vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V
+ vpermq m0, m0, q3120 ; H00224466 _ H11335577 _
+ movddup m2, [r0+FENC_STRIDE*0]
+ movddup m4, [r0+FENC_STRIDE*2]
+ pshuflw m3, m0, q0000
+ psadbw m3, m2
+ psadbw m2, m1
+ pshuflw m5, m0, q1111
+ psadbw m5, m4
+ psadbw m4, m1
+ paddw m2, m4
+ paddw m3, m5
+ movddup m4, [r0+FENC_STRIDE*4]
+ pshuflw m5, m0, q2222
+ psadbw m5, m4
+ psadbw m4, m6
+ paddw m2, m4
+ paddw m3, m5
+ movddup m4, [r0+FENC_STRIDE*6]
+ pshuflw m5, m0, q3333
+ psadbw m5, m4
+ psadbw m4, m6
+ paddw m2, m4
+ paddw m3, m5
+ vextracti128 xm0, m2, 1
+ vextracti128 xm1, m3, 1
+ paddw xm2, xm0 ; DC V
+ paddw xm3, xm1 ; H
+ pextrd [r2+8], xm2, 2 ; V
+ movd [r2+4], xm3 ; H
+ movd [r2+0], xm2 ; DC
+ RET
+
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
-%macro INTRA_SAD16 1-2 0
-cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
+%macro INTRA_SAD16 0
+cglobal intra_sad_x3_16x16, 3,5,8
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1-FDEC_STRIDE+0]
psadbw mm1, [r1-FDEC_STRIDE+8]
paddw mm0, mm1
movd r3d, mm0
-%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+%if cpuflag(ssse3)
+ mova m1, [pb_3]
%endif
-%assign n 0
+%assign x 0
%rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*n]
+ movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
+%if (x&3)==3 && x!=15
+ add r1, FDEC_STRIDE*4
+%endif
add r3d, r4d
-%assign n n+1
+%assign x x+1
%endrep
+ sub r1, FDEC_STRIDE*12
add r3d, 16
shr r3d, 5
imul r3d, 0x01010101
pxor m2, m2
mov r3d, 15*FENC_STRIDE
.vloop:
- SPLATB m6, r1+r3*2-1, m1
+ SPLATB_LOAD m6, r1+r3*2-1, m1
mova m0, [r0+r3]
psadbw m0, m7
paddw m4, m0
RET
%endmacro
-INIT_MMX
-%define SPLATB SPLATB_MMX
-INTRA_SAD16 mmxext
-INIT_XMM
-INTRA_SAD16 sse2, 8
-%define SPLATB SPLATB_SSSE3
-INTRA_SAD16 ssse3, 8
-
-
+INIT_MMX mmx2
+INTRA_SAD16
+INIT_XMM sse2
+INTRA_SAD16
+INIT_XMM ssse3
+INTRA_SAD16
+
+INIT_YMM avx2
+cglobal intra_sad_x3_16x16, 3,5,6
+ pxor xm0, xm0
+ psadbw xm0, [r1-FDEC_STRIDE]
+ movhlps xm1, xm0
+ paddw xm0, xm1
+ movd r3d, xm0
+%assign x 0
+%rep 16
+ movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
+%if (x&3)==3 && x!=15
+ add r1, FDEC_STRIDE*4
+%endif
+ add r3d, r4d
+%assign x x+1
+%endrep
+ sub r1, FDEC_STRIDE*12
+ add r3d, 16
+ shr r3d, 5
+ movd xm5, r3d
+ vpbroadcastb xm5, xm5
+ vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
+
+ pxor m4, m4 ; DC / V accumulator
+ pxor xm3, xm3 ; H accumulator
+ mov r3d, 15*FENC_STRIDE
+.vloop:
+ vpbroadcastb xm2, [r1+r3*2-1]
+ vbroadcasti128 m0, [r0+r3]
+ psadbw m1, m0, m5
+ psadbw xm0, xm2
+ paddw m4, m1
+ paddw xm3, xm0
+ add r3d, -FENC_STRIDE
+ jge .vloop
+ punpckhqdq m5, m4, m4
+ movhlps xm2, xm3
+ paddw m4, m5 ; DC / V
+ paddw xm3, xm2 ; H
+ vextracti128 xm2, m4, 1
+ movd [r2+0], xm2
+ movd [r2+4], xm3
+ movd [r2+8], xm4
+ RET
;=============================================================================
; SAD x3/x4 MMX
%endmacro
%macro SAD_X3_END 0
-%ifdef UNIX64
+%if UNIX64
movd [r5+0], mm0
movd [r5+4], mm1
movd [r5+8], mm2
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
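; Reference sketch (C, not part of the build): the x3/x4 variants batch
; several SADs against one fenc block, reusing the scalar sad() sketch above:
;   for( int i = 0; i < 3; i++ )   /* or 4 for the x4 variants */
;       scores[i] = sad( fenc, FENC_STRIDE, pix[i], i_stride );
; fenc uses the fixed FENC_STRIDE while all candidate blocks share i_stride.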
%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
-%ifdef WIN64
- %assign i %1+1
- movsxd r %+ i, r %+ i %+ d
-%endif
+cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
SAD_X%1_END
%endmacro
+INIT_MMX
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0
- movdqa xmm3, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
+ mova m2, [r0]
+%if cpuflag(avx)
+ psadbw m0, m2, [r1]
+ psadbw m1, m2, [r2]
+ psadbw m2, [r3]
+%else
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m3, [r3]
+ psadbw m0, m2
+ psadbw m1, m2
+ psadbw m2, m3
+%endif
%endmacro
%macro SAD_X3_1x16P_SSE2 2
- movdqa xmm3, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm6, xmm3
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
-%endmacro
-
-%macro SAD_X3_2x16P_SSE2 1
-%if %1
+ mova m3, [r0+%1]
+%if cpuflag(avx)
+ psadbw m4, m3, [r1+%2]
+ psadbw m5, m3, [r2+%2]
+ psadbw m3, [r3+%2]
+%else
+ movu m4, [r1+%2]
+ movu m5, [r2+%2]
+ movu m6, [r3+%2]
+ psadbw m4, m3
+ psadbw m5, m3
+ psadbw m3, m6
+%endif
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m3
+%endmacro
+
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+%endif
+
+%macro SAD_X3_4x16P_SSE2 2
+%if %1==0
+ lea t0, [r4*3]
SAD_X3_START_1x16P_SSE2
%else
- SAD_X3_1x16P_SSE2 0, 0
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
+%endif
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
+ SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
%endif
- SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
%endmacro
%macro SAD_X3_START_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm0, [r1]
- movq xmm1, [r2]
- movq xmm2, [r3]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm0, [r1+r4]
- movhps xmm1, [r2+r4]
- movhps xmm2, [r3+r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
-%endmacro
-
-%macro SAD_X3_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm3, [r1]
- movq xmm4, [r2]
- movq xmm5, [r3]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm3, [r1+r4]
- movhps xmm4, [r2+r4]
- movhps xmm5, [r3+r4]
- psadbw xmm3, xmm7
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm3
- paddw xmm1, xmm4
- paddw xmm2, xmm5
+ movq m3, [r0]
+ movq m0, [r1]
+ movq m1, [r2]
+ movq m2, [r3]
+ movhps m3, [r0+FENC_STRIDE]
+ movhps m0, [r1+r4]
+ movhps m1, [r2+r4]
+ movhps m2, [r3+r4]
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
+%endmacro
+
+%macro SAD_X3_2x8P_SSE2 4
+ movq m6, [r0+%1]
+ movq m3, [r1+%2]
+ movq m4, [r2+%2]
+ movq m5, [r3+%2]
+ movhps m6, [r0+%3]
+ movhps m3, [r1+%4]
+ movhps m4, [r2+%4]
+ movhps m5, [r3+%4]
+ psadbw m3, m6
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
%endmacro
%macro SAD_X4_START_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm0, [r1]
- movq xmm1, [r2]
- movq xmm2, [r3]
- movq xmm3, [r4]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm0, [r1+r5]
- movhps xmm1, [r2+r5]
- movhps xmm2, [r3+r5]
- movhps xmm3, [r4+r5]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
-%endmacro
-
-%macro SAD_X4_2x8P_SSE2 0
- movq xmm7, [r0]
- movq xmm4, [r1]
- movq xmm5, [r2]
-%ifdef ARCH_X86_64
- movq xmm6, [r3]
- movq xmm8, [r4]
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm4, [r1+r5]
- movhps xmm5, [r2+r5]
- movhps xmm6, [r3+r5]
- movhps xmm8, [r4+r5]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
-%else
- movhps xmm7, [r0+FENC_STRIDE]
- movhps xmm4, [r1+r5]
- movhps xmm5, [r2+r5]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- movq xmm6, [r3]
- movq xmm4, [r4]
- movhps xmm6, [r3+r5]
- movhps xmm4, [r4+r5]
- psadbw xmm6, xmm7
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
-%endif
+ movq m4, [r0]
+ movq m0, [r1]
+ movq m1, [r2]
+ movq m2, [r3]
+ movq m3, [r4]
+ movhps m4, [r0+FENC_STRIDE]
+ movhps m0, [r1+r5]
+ movhps m1, [r2+r5]
+ movhps m2, [r3+r5]
+ movhps m3, [r4+r5]
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+%endmacro
+
+%macro SAD_X4_2x8P_SSE2 4
+ movq m6, [r0+%1]
+ movq m4, [r1+%2]
+ movq m5, [r2+%2]
+ movhps m6, [r0+%3]
+ movhps m4, [r1+%4]
+ movhps m5, [r2+%4]
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m0, m4
+ paddw m1, m5
+ movq m4, [r3+%2]
+ movq m5, [r4+%2]
+ movhps m4, [r3+%4]
+ movhps m5, [r4+%4]
+ psadbw m4, m6
+ psadbw m5, m6
+ paddw m2, m4
+ paddw m3, m5
%endmacro
%macro SAD_X4_START_1x16P_SSE2 0
- movdqa xmm7, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- movdqu xmm3, [r4]
- psadbw xmm0, xmm7
- psadbw xmm1, xmm7
- psadbw xmm2, xmm7
- psadbw xmm3, xmm7
+ mova m3, [r0]
+%if cpuflag(avx)
+ psadbw m0, m3, [r1]
+ psadbw m1, m3, [r2]
+ psadbw m2, m3, [r3]
+ psadbw m3, [r4]
+%else
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m2, [r3]
+ movu m4, [r4]
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
+ psadbw m3, m4
+%endif
%endmacro
%macro SAD_X4_1x16P_SSE2 2
- movdqa xmm7, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
-%ifdef ARCH_X86_64
- movdqu xmm8, [r4+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm8, xmm7
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm8
+ mova m6, [r0+%1]
+%if cpuflag(avx)
+ psadbw m4, m6, [r1+%2]
+ psadbw m5, m6, [r2+%2]
+%else
+ movu m4, [r1+%2]
+ movu m5, [r2+%2]
+ psadbw m4, m6
+ psadbw m5, m6
+%endif
+ paddw m0, m4
+ paddw m1, m5
+%if cpuflag(avx)
+ psadbw m4, m6, [r3+%2]
+ psadbw m5, m6, [r4+%2]
%else
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- paddw xmm0, xmm4
- psadbw xmm6, xmm7
- movdqu xmm4, [r4+%2]
- paddw xmm1, xmm5
- psadbw xmm4, xmm7
- paddw xmm2, xmm6
- paddw xmm3, xmm4
+ movu m4, [r3+%2]
+ movu m5, [r4+%2]
+ psadbw m4, m6
+ psadbw m5, m6
%endif
+ paddw m2, m4
+ paddw m3, m5
%endmacro
-%macro SAD_X4_2x16P_SSE2 1
-%if %1
+%macro SAD_X4_4x16P_SSE2 2
+%if %1==0
+ lea r6, [r5*3]
SAD_X4_START_1x16P_SSE2
%else
- SAD_X4_1x16P_SSE2 0, 0
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
+%endif
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
+ SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
%endif
- SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
%endmacro
-%macro SAD_X3_2x8P_SSE2 1
-%if %1
+%macro SAD_X3_4x8P_SSE2 2
+%if %1==0
+ lea t0, [r4*3]
SAD_X3_START_2x8P_SSE2
%else
- SAD_X3_2x8P_SSE2
+ SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
+%endif
+ SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
%endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
%endmacro
-%macro SAD_X4_2x8P_SSE2 1
-%if %1
+%macro SAD_X4_4x8P_SSE2 2
+%if %1==0
+ lea r6, [r5*3]
SAD_X4_START_2x8P_SSE2
%else
- SAD_X4_2x8P_SSE2
+ SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
%endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
%endmacro
%macro SAD_X3_END_SSE2 0
- movhlps xmm4, xmm0
- movhlps xmm5, xmm1
- movhlps xmm6, xmm2
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
-%ifdef UNIX64
- movd [r5+0], xmm0
- movd [r5+4], xmm1
- movd [r5+8], xmm2
+ movifnidn r5, r5mp
+%if cpuflag(ssse3)
+ packssdw m0, m1
+ packssdw m2, m2
+ phaddd m0, m2
+ mova [r5], m0
%else
- mov r0, r5mp
- movd [r0+0], xmm0
- movd [r0+4], xmm1
- movd [r0+8], xmm2
+ movhlps m3, m0
+ movhlps m4, m1
+ movhlps m5, m2
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ movd [r5+0], m0
+ movd [r5+4], m1
+ movd [r5+8], m2
%endif
RET
%endmacro
%macro SAD_X4_END_SSE2 0
- mov r0, r6mp
- psllq xmm1, 32
- psllq xmm3, 32
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- paddw xmm0, xmm1
- paddw xmm2, xmm3
- movq [r0+0], xmm0
- movq [r0+8], xmm2
+ mov r0, r6mp
+%if cpuflag(ssse3)
+ packssdw m0, m1
+ packssdw m2, m3
+ phaddd m0, m2
+ mova [r0], m0
+%else
+ psllq m1, 32
+ psllq m3, 32
+ paddw m0, m1
+ paddw m2, m3
+ movhlps m1, m0
+ movhlps m3, m2
+ paddw m0, m1
+ paddw m2, m3
+ movq [r0+0], m0
+ movq [r0+8], m2
+%endif
RET
%endmacro
-%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
- movdqa xmm2, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm2
- psadbw xmm2, [r3]
-%endmacro
-
-%macro SAD_X3_1x16P_SSE2_MISALIGN 2
- movdqa xmm3, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- psadbw xmm4, xmm3
- psadbw xmm5, xmm3
- psadbw xmm3, [r3+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm3
-%endmacro
-
-%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
- movdqa xmm3, [r0]
- movdqu xmm0, [r1]
- movdqu xmm1, [r2]
- movdqu xmm2, [r3]
- psadbw xmm0, xmm3
- psadbw xmm1, xmm3
- psadbw xmm2, xmm3
- psadbw xmm3, [r4]
-%endmacro
-
-%macro SAD_X4_1x16P_SSE2_MISALIGN 2
- movdqa xmm7, [r0+%1]
- movdqu xmm4, [r1+%2]
- movdqu xmm5, [r2+%2]
- movdqu xmm6, [r3+%2]
- psadbw xmm4, xmm7
- psadbw xmm5, xmm7
- psadbw xmm6, xmm7
- psadbw xmm7, [r4+%2]
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm6
- paddw xmm3, xmm7
-%endmacro
-
-%macro SAD_X3_2x16P_SSE2_MISALIGN 1
-%if %1
- SAD_X3_START_1x16P_SSE2_MISALIGN
+%macro SAD_X4_START_2x8P_SSSE3 0
+ movddup m4, [r0]
+ movq m0, [r1]
+ movq m1, [r3]
+ movhps m0, [r2]
+ movhps m1, [r4]
+ movddup m5, [r0+FENC_STRIDE]
+ movq m2, [r1+r5]
+ movq m3, [r3+r5]
+ movhps m2, [r2+r5]
+ movhps m3, [r4+r5]
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m5
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+%endmacro
+
+%macro SAD_X4_2x8P_SSSE3 4
+ movddup m6, [r0+%1]
+ movq m2, [r1+%2]
+ movq m3, [r3+%2]
+ movhps m2, [r2+%2]
+ movhps m3, [r4+%2]
+ movddup m7, [r0+%3]
+ movq m4, [r1+%4]
+ movq m5, [r3+%4]
+ movhps m4, [r2+%4]
+ movhps m5, [r4+%4]
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+%endmacro
+
+%macro SAD_X4_4x8P_SSSE3 2
+%if %1==0
+ lea r6, [r5*3]
+ SAD_X4_START_2x8P_SSSE3
%else
- SAD_X3_1x16P_SSE2_MISALIGN 0, 0
+ SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
%endif
- SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
%endmacro
-%macro SAD_X4_2x16P_SSE2_MISALIGN 1
-%if %1
- SAD_X4_START_1x16P_SSE2_MISALIGN
+%macro SAD_X4_END_SSSE3 0
+ mov r0, r6mp
+ packssdw m0, m1
+ mova [r0], m0
+ RET
+%endmacro
+
+%macro SAD_X3_START_2x16P_AVX2 0
+ movu m3, [r0] ; assumes FENC_STRIDE == 16
+ movu xm0, [r1]
+ movu xm1, [r2]
+ movu xm2, [r3]
+ vinserti128 m0, m0, [r1+r4], 1
+ vinserti128 m1, m1, [r2+r4], 1
+ vinserti128 m2, m2, [r3+r4], 1
+ psadbw m0, m3
+ psadbw m1, m3
+ psadbw m2, m3
+%endmacro
+
+%macro SAD_X3_2x16P_AVX2 3
+ movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
+ movu xm4, [r1+%2]
+ movu xm5, [r2+%2]
+ movu xm6, [r3+%2]
+ vinserti128 m4, m4, [r1+%3], 1
+ vinserti128 m5, m5, [r2+%3], 1
+ vinserti128 m6, m6, [r3+%3], 1
+ psadbw m4, m3
+ psadbw m5, m3
+ psadbw m6, m3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+%endmacro
+
+%macro SAD_X3_4x16P_AVX2 2
+%if %1==0
+ lea t0, [r4*3]
+ SAD_X3_START_2x16P_AVX2
%else
- SAD_X4_1x16P_SSE2_MISALIGN 0, 0
+ SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
+%endif
+ SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
%endif
- SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
+%endif
+%endmacro
+
+%macro SAD_X4_START_2x16P_AVX2 0
+ vbroadcasti128 m4, [r0]
+ vbroadcasti128 m5, [r0+FENC_STRIDE]
+ movu xm0, [r1]
+ movu xm1, [r2]
+ movu xm2, [r1+r5]
+ movu xm3, [r2+r5]
+ vinserti128 m0, m0, [r3], 1
+ vinserti128 m1, m1, [r4], 1
+ vinserti128 m2, m2, [r3+r5], 1
+ vinserti128 m3, m3, [r4+r5], 1
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m5
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+%endmacro
+
+%macro SAD_X4_2x16P_AVX2 4
+ vbroadcasti128 m6, [r0+%1]
+ vbroadcasti128 m7, [r0+%3]
+ movu xm2, [r1+%2]
+ movu xm3, [r2+%2]
+ movu xm4, [r1+%4]
+ movu xm5, [r2+%4]
+ vinserti128 m2, m2, [r3+%2], 1
+ vinserti128 m3, m3, [r4+%2], 1
+ vinserti128 m4, m4, [r3+%4], 1
+ vinserti128 m5, m5, [r4+%4], 1
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m4
+ paddw m1, m5
+%endmacro
+
+%macro SAD_X4_4x16P_AVX2 2
+%if %1==0
+ lea r6, [r5*3]
+ SAD_X4_START_2x16P_AVX2
+%else
+ SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
+%endif
+%endmacro
+
+%macro SAD_X3_END_AVX2 0
+ movifnidn r5, r5mp
+ packssdw m0, m1 ; 0 0 1 1 0 0 1 1
+ packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
+ phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1 ; 0 1 2 _
+ mova [r5], xm0
+ RET
+%endmacro
+
+%macro SAD_X4_END_AVX2 0
+ mov r0, r6mp
+ packssdw m0, m1 ; 0 0 1 1 2 2 3 3
+ vextracti128 xm1, m0, 1
+ phaddd xm0, xm1 ; 0 1 2 3
+ mova [r0], xm0
+ RET
%endmacro
;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
-%ifdef WIN64
- %assign i %1+1
- movsxd r %+ i, r %+ i %+ d
-%endif
- SAD_X%1_2x%2P_SSE2 1
-%rep %3/2-1
- SAD_X%1_2x%2P_SSE2 0
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
+%assign x 0
+%rep %3/4
+ SAD_X%1_4x%2P_SSE2 x, %3/4
+%assign x x+1
%endrep
SAD_X%1_END_SSE2
%endmacro
-%macro SAD_X_SSE2_MISALIGN 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
-%ifdef WIN64
- %assign i %1+1
- movsxd r %+ i, r %+ i %+ d
-%endif
- SAD_X%1_2x%2P_SSE2_MISALIGN 1
-%rep %3/2-1
- SAD_X%1_2x%2P_SSE2_MISALIGN 0
+INIT_XMM sse2
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 3, 8, 16, 7
+SAD_X_SSE2 3, 8, 8, 7
+SAD_X_SSE2 3, 8, 4, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+SAD_X_SSE2 4, 8, 16, 7
+SAD_X_SSE2 4, 8, 8, 7
+SAD_X_SSE2 4, 8, 4, 7
+
+INIT_XMM sse3
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+
+%macro SAD_X_SSSE3 3
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
+%assign x 0
+%rep %3/4
+ SAD_X%1_4x%2P_SSSE3 x, %3/4
+%assign x x+1
%endrep
- SAD_X%1_END_SSE2
+ SAD_X%1_END_SSSE3
%endmacro
-SAD_X_SSE2 3, 16, 16, sse2
-SAD_X_SSE2 3, 16, 8, sse2
-SAD_X_SSE2 3, 8, 16, sse2
-SAD_X_SSE2 3, 8, 8, sse2
-SAD_X_SSE2 3, 8, 4, sse2
-SAD_X_SSE2 4, 16, 16, sse2
-SAD_X_SSE2 4, 16, 8, sse2
-SAD_X_SSE2 4, 8, 16, sse2
-SAD_X_SSE2 4, 8, 8, sse2
-SAD_X_SSE2 4, 8, 4, sse2
-
-SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
-SAD_X_SSE2_MISALIGN 3, 16, 8, sse2
-SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
-SAD_X_SSE2_MISALIGN 4, 16, 8, sse2
-
-%define movdqu lddqu
-SAD_X_SSE2 3, 16, 16, sse3
-SAD_X_SSE2 3, 16, 8, sse3
-SAD_X_SSE2 4, 16, 16, sse3
-SAD_X_SSE2 4, 16, 8, sse3
-%undef movdqu
-
+INIT_XMM ssse3
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+SAD_X_SSSE3 4, 8, 16
+SAD_X_SSSE3 4, 8, 8
+SAD_X_SSSE3 4, 8, 4
+
+INIT_XMM avx
+SAD_X_SSE2 3, 16, 16, 6
+SAD_X_SSE2 3, 16, 8, 6
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+
+%macro SAD_X_AVX2 4
+cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
+%assign x 0
+%rep %3/4
+ SAD_X%1_4x%2P_AVX2 x, %3/4
+%assign x x+1
+%endrep
+ SAD_X%1_END_AVX2
+%endmacro
+INIT_YMM avx2
+SAD_X_AVX2 3, 16, 16, 7
+SAD_X_AVX2 3, 16, 8, 7
+SAD_X_AVX2 4, 16, 16, 8
+SAD_X_AVX2 4, 16, 8, 8
;=============================================================================
; SAD cacheline split
%endmacro
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal x264_pixel_sad_16x%2_cache64_%1
+cglobal pixel_sad_16x%2_cache64_%1
mov eax, r2m
and eax, 0x37
cmp eax, 0x30
- jle x264_pixel_sad_16x%2_sse2
+ jle pixel_sad_16x%2_sse2
PROLOGUE 4,6
mov r4d, r2d
and r4d, 15
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
- lea r5, [sad_w16_addr GLOBAL]
+ lea r5, [sad_w16_addr]
add r5, r4
%else
- lea r5, [sad_w16_addr + r4 GLOBAL]
+ lea r5, [sad_w16_addr + r4]
%endif
and r2, ~15
mov r4d, %2/2
mov eax, r2m
and eax, 0x17|%1|(%4>>1)
cmp eax, 0x10|%1|(%4>>1)
- jle x264_pixel_sad_%1x%2_mmxext
+ jle pixel_sad_%1x%2_mmx2
and eax, 7
shl eax, 3
- movd mm6, [sw_64 GLOBAL]
+ movd mm6, [sw_64]
movd mm7, eax
psubw mm6, mm7
PROLOGUE 4,5
%endmacro
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_16x%1_cache%2_mmxext
+cglobal pixel_sad_16x%1_cache%2_mmx2
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
movq mm1, [r2]
%endmacro
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_8x%1_cache%2_mmxext
+cglobal pixel_sad_8x%1_cache%2_mmx2
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
movq mm1, [r2+8]
%endmacro
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
+cglobal pixel_sad_x3_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
- jmp x264_pixel_sad_x3_%1x%2_%4
+ jmp pixel_sad_x3_%1x%2_%4
.split:
-%ifdef ARCH_X86_64
- PROLOGUE 6,7
-%ifdef WIN64
- movsxd r4, r4d
- sub rsp, 8
-%endif
+%if ARCH_X86_64
+ PROLOGUE 6,9
push r3
push r2
+%if WIN64
+ movsxd r4, r4d
+ sub rsp, 40 ; shadow space and alignment
+%endif
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r4
- mov r10, r0
- mov r11, r5
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
-%ifdef WIN64
- mov r2, [rsp]
+ mov r7, r0
+ mov r8, r5
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8], eax
+%if WIN64
+ mov r2, [rsp+40+0*8]
%else
pop r2
%endif
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
-%ifdef WIN64
- mov r2, [rsp+8]
+ mov r0, r7
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8+4], eax
+%if WIN64
+ mov r2, [rsp+40+1*8]
%else
pop r2
%endif
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
-%ifdef WIN64
- add rsp, 24
+ mov r0, r7
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8+8], eax
+%if WIN64
+ add rsp, 40+2*8
%endif
RET
%else
push dword [esp+16]
push dword 16
push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+32]
mov [edi], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+36]
mov [edi+4], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [edi+8], eax
add esp, 16
pop edi
%endmacro
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
+cglobal pixel_sad_x4_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
CHECK_SPLIT r4m, %1, %3
- jmp x264_pixel_sad_x4_%1x%2_%4
+ jmp pixel_sad_x4_%1x%2_%4
.split:
-%ifdef ARCH_X86_64
- PROLOGUE 6,7
- mov r11, r6mp
-%ifdef WIN64
- movsxd r5, r5d
-%endif
+%if ARCH_X86_64
+ PROLOGUE 6,9
+ mov r8, r6mp
push r4
push r3
push r2
+%if WIN64
+ sub rsp, 32 ; shadow space
+%endif
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r5
- mov r10, r0
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
-%ifdef WIN64
- mov r2, [rsp]
+ mov r7, r0
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8], eax
+%if WIN64
+ mov r2, [rsp+32+0*8]
%else
pop r2
%endif
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
-%ifdef WIN64
- mov r2, [rsp+8]
+ mov r0, r7
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8+4], eax
+%if WIN64
+ mov r2, [rsp+32+1*8]
%else
pop r2
%endif
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
-%ifdef WIN64
- mov r2, [rsp+16]
+ mov r0, r7
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8+8], eax
+%if WIN64
+ mov r2, [rsp+32+2*8]
%else
pop r2
%endif
- mov r0, r10
- call x264_pixel_sad_%1x%2_cache%3_%5
- mov [r11+12], eax
-%ifdef WIN64
- add rsp, 24
+ mov r0, r7
+ call pixel_sad_%1x%2_cache%3_%5
+ mov [r8+12], eax
+%if WIN64
+ add rsp, 32+3*8
%endif
RET
%else
push dword [esp+16]
push dword 16
push dword [esp+20]
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+32]
mov [edi], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+36]
mov [edi+4], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov ecx, [esp+40]
mov [edi+8], eax
mov [esp+8], ecx
- call x264_pixel_sad_%1x%2_cache%3_%5
+ call pixel_sad_%1x%2_cache%3_%5
mov [edi+12], eax
add esp, 16
pop edi
; instantiate the aligned sads
-%ifndef ARCH_X86_64
+INIT_MMX
+%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC_MMX2 8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2 4, 32
SAD8_CACHELINE_FUNC_MMX2 8, 64
SAD8_CACHELINE_FUNC_MMX2 16, 64
-%ifndef ARCH_X86_64
-SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext, mmxext
+%if ARCH_X86_64 == 0
+SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
+SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmxext, sse2
+SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16