;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
-;* Copyright (C) 2003-2011 x264 project
+;* Copyright (C) 2003-2014 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
%include "x86util.asm"
SECTION_RODATA 32
+hmul_16p: times 16 db 1
+ times 8 db 1, -1
+hmul_8p: times 8 db 1
+ times 4 db 1, -1
+ times 8 db 1
+ times 4 db 1, -1
mask_ff: times 16 db 0xff
times 16 db 0
+mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
+mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
+mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
%if BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
%else
ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
%endif
-mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1
-mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
-mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-hmul_8p: times 8 db 1
- times 4 db 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
pb_pppm: times 4 db 1,1,1,-1
intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
+ALIGN 32
intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
sw_f0: dq 0xfff0, 0
-sq_0f: dq 0xffffffff, 0
pd_f0: times 4 dd 0xffff0000
+pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
+
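+; Each of the 256 16-byte entries below is a pshufb control that gathers the
+; word indices of the set bits of an 8-bit candidate mask to the front of the
+; register; ads_mvs_ssse3 uses it to compact matching mv indices in one shuffle.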
+ads_mvs_shuffle:
+%macro ADS_MVS_SHUFFLE 8
+ %assign y x
+ %rep 8
+ %rep 7
+ %rotate (~y)&1
+ %assign y y>>((~y)&1)
+ %endrep
+ db %1*2, %1*2+1
+ %rotate 1
+ %assign y y>>1
+ %endrep
+%endmacro
+%assign x 0
+%rep 256
+ ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
+%assign x x+1
+%endrep
+
SECTION .text
cextern pb_0
cextern pw_1
cextern pw_8
cextern pw_16
-cextern pw_64
+cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
+cextern pd_1
cextern hsub_mul
+cextern popcnt_table
;=============================================================================
; SSD
;=============================================================================
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
+; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
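; A minimal C sketch of the value these routines compute (reference semantics,
; not the optimized implementation; W/H stand for the block size):
;     int ssd = 0;
;     for( int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < W; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             ssd += d * d;
;         }
;     return ssd;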
%macro SSD_ONE 2
-cglobal pixel_ssd_%1x%2, 4,5,6
- mov r4, %1*%2/mmsize
+cglobal pixel_ssd_%1x%2, 4,7,6
+ FIX_STRIDES r1, r3
+%if mmsize == %1*2
+ %define offset0_1 r1
+ %define offset0_2 r1*2
+ %define offset0_3 r5
+ %define offset1_1 r3
+ %define offset1_2 r3*2
+ %define offset1_3 r6
+ lea r5, [3*r1]
+ lea r6, [3*r3]
+%elif mmsize == %1
+ %define offset0_1 mmsize
+ %define offset0_2 r1
+ %define offset0_3 r1+mmsize
+ %define offset1_1 mmsize
+ %define offset1_2 r3
+ %define offset1_3 r3+mmsize
+%elif mmsize == %1/2
+ %define offset0_1 mmsize
+ %define offset0_2 mmsize*2
+ %define offset0_3 mmsize*3
+ %define offset1_1 mmsize
+ %define offset1_2 mmsize*2
+ %define offset1_3 mmsize*3
+%endif
+ %assign %%n %2/(2*mmsize/%1)
+%if %%n > 1
+ mov r4d, %%n
+%endif
pxor m0, m0
-.loop
+.loop:
mova m1, [r0]
-%if %1 <= mmsize/2
- mova m3, [r0+r1*2]
- %define offset r3*2
- %define num_rows 2
-%else
- mova m3, [r0+mmsize]
- %define offset mmsize
- %define num_rows 1
-%endif
- lea r0, [r0+r1*2*num_rows]
+ mova m2, [r0+offset0_1]
+ mova m3, [r0+offset0_2]
+ mova m4, [r0+offset0_3]
psubw m1, [r2]
- psubw m3, [r2+offset]
- lea r2, [r2+r3*2*num_rows]
+ psubw m2, [r2+offset1_1]
+ psubw m3, [r2+offset1_2]
+ psubw m4, [r2+offset1_3]
+%if %%n > 1
+ lea r0, [r0+r1*(%2/%%n)]
+ lea r2, [r2+r3*(%2/%%n)]
+%endif
pmaddwd m1, m1
+ pmaddwd m2, m2
pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
paddd m0, m1
paddd m0, m3
- dec r4
+%if %%n > 1
+ dec r4d
jg .loop
+%endif
HADDD m0, m5
- movd eax, m0
- RET
-%endmacro
-
-%macro SSD_16_MMX 2
-cglobal pixel_ssd_%1x%2, 4,5
- mov r4, %1*%2/mmsize/2
- pxor m0, m0
-.loop
- mova m1, [r0]
- mova m2, [r2]
- mova m3, [r0+mmsize]
- mova m4, [r2+mmsize]
- mova m5, [r0+mmsize*2]
- mova m6, [r2+mmsize*2]
- mova m7, [r0+mmsize*3]
- psubw m1, m2
- psubw m3, m4
- mova m2, [r2+mmsize*3]
- psubw m5, m6
- pmaddwd m1, m1
- psubw m7, m2
- pmaddwd m3, m3
- pmaddwd m5, m5
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- pmaddwd m7, m7
- paddd m1, m3
- paddd m5, m7
- paddd m0, m1
- paddd m0, m5
- dec r4
- jg .loop
- HADDD m0, m7
- movd eax, m0
+ movd eax, xm0
RET
%endmacro
SSD_ONE 8, 4
SSD_ONE 8, 8
SSD_ONE 8, 16
-SSD_16_MMX 16, 8
-SSD_16_MMX 16, 16
+SSD_ONE 16, 8
+SSD_ONE 16, 16
INIT_XMM sse2
SSD_ONE 8, 4
SSD_ONE 8, 8
SSD_ONE 8, 16
SSD_ONE 16, 8
SSD_ONE 16, 16
+INIT_YMM avx2
+SSD_ONE 16, 8
+SSD_ONE 16, 16
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro SSD_LOAD_FULL 5
mova m1, [t0+%1]
mova m2, [t2+%2]
punpcklbw m%2, m%4
%endmacro
+%macro LOAD_AVX2 5
+ mova xm%1, %3
+ vinserti128 m%1, m%1, %4, 1
+%if %5
+ lea t0, [t0+2*t1]
+%endif
+%endmacro
+
+%macro JOIN_AVX2 7
+ mova xm%2, %5
+ vinserti128 m%2, m%2, %6, 1
+%if %7
+ lea t2, [t2+2*t3]
+%endif
+ SBUTTERFLY bw, %1, %2, %3
+%endmacro
+
%macro SSD_LOAD_HALF 5
LOAD 1, 2, [t0+%1], [t0+%3], 1
JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SSD 2
%if %1 != %2
%else
.startloop:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3
PROLOGUE 0,0,8
%else
%endif
dec al
jg .loop
+%if mmsize==32
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1
+ HADDD xm0, xm1
+ movd eax, xm0
+%else
HADDD m0, m1
movd eax, m0
+%endif
RET
%endif
%endmacro
SSD 16, 8
SSD 8, 16
SSD 8, 4
+%define LOAD LOAD_AVX2
+%define JOIN JOIN_AVX2
+INIT_YMM avx2
+SSD 16, 16
+SSD 16, 8
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
; in the following equation: (mmsize in bits)
;
;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
; distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
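; A hedged C sketch of the semantics (pixuv* is interleaved U/V; the U and V
; sums of squared differences are returned separately):
;     *ssd_u = *ssd_v = 0;
;     for( int y = 0; y < height; y++, pixuv1 += stride1, pixuv2 += stride2 )
;         for( int x = 0; x < width; x++ )
;         {
;             int du = pixuv1[2*x]   - pixuv2[2*x];
;             int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
;             *ssd_u += du*du;
;             *ssd_v += dv*dv;
;         }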
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7,7
shl r4d, 2
psubw m1, [r2+r6+mmsize]
PSHUFLW m0, m0, q3120
PSHUFLW m1, m1, q3120
-%if mmsize==16
+%if mmsize >= 16
pshufhw m0, m0, q3120
pshufhw m1, m1, q3120
%endif
paddd m3, m1
add r6, 2*mmsize
jl .loopx
-%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
- ; equation above, putting the width limit at 8208
+%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
+ jz .no_overread
+ psubd m3, m1
+.no_overread:
+%endif
+%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
+ ; equation above, putting the width limit at 8208
punpckhdq m0, m2, m6
punpckhdq m1, m3, m6
punpckldq m2, m6
jg .loopy
mov r3, r6m
mov r4, r7m
-%if mmsize==16
- movq [r3], m4
- movhps [r4], m4
+%if mmsize == 32
+ vextracti128 xm0, m4, 1
+ paddq xm4, xm0
+%endif
+%if mmsize >= 16
+ movq [r3], xm4
+ movhps [r4], xm4
%else ; fixup for mmx2
SBUTTERFLY dq, 4, 5, 0
mova m0, m4
%endmacro ; SSD_NV12
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; This implementation can potentially overflow on image widths >= 11008 (or
;-----------------------------------------------------------------------------
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7
- shl r4d, 1
+ add r4d, r4d
add r0, r4
add r2, r4
pxor m3, m3
mov r6, r4
neg r6
.loopx:
- mova m0, [r0+r6]
+%if mmsize == 32 ; only 16-byte alignment is guaranteed
+ movu m2, [r0+r6]
+ movu m1, [r2+r6]
+%else
+ mova m2, [r0+r6]
mova m1, [r2+r6]
- psubusb m0, m1
- psubusb m1, [r0+r6]
+%endif
+ psubusb m0, m2, m1
+ psubusb m1, m2
por m0, m1
psrlw m2, m0, 8
pand m0, m5
paddd m4, m2
add r6, mmsize
jl .loopx
+%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
+ jz .no_overread
+ pcmpeqb xm1, xm1
+ pandn m0, m1, m0 ; zero the lower half
+ pandn m2, m1, m2
+ psubd m3, m0
+ psubd m4, m2
+.no_overread:
+%endif
add r0, r1
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
- mova m5, [sq_0f]
HADDD m3, m0
HADDD m4, m0
- pand m3, m5
- pand m4, m5
- movq [r3], m3
- movq [r4], m4
+ pxor xm0, xm0
+ punpckldq xm3, xm0
+ punpckldq xm4, xm0
+ movq [r3], xm3
+ movq [r4], xm4
RET
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH
SSD_NV12
INIT_XMM avx
SSD_NV12
+INIT_YMM avx2
+SSD_NV12
;=============================================================================
; variance
%macro VAR_START 1
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
-%else
+%elif mmsize < 32
pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro
%macro VAR_END 2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
HADDUW m5, m2
%else
%else ; !HIGH_BIT_DEPTH
HADDW m5, m2
%endif ; HIGH_BIT_DEPTH
- movd eax, m5
HADDD m6, m1
+%if ARCH_X86_64
+ punpckldq m5, m6
+ movq rax, m5
+%else
+ movd eax, m5
movd edx, m6
-%ifdef ARCH_X86_64
- shl rdx, 32
- add rax, rdx
%endif
RET
%endmacro
%macro VAR_2ROW 2
mov r2d, %2
.loop:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
mova m3, [r0+%1]
%else
add r0, r1
%endif
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
punpcklbw m3, m7
punpckhbw m4, m7
%endif ; !HIGH_BIT_DEPTH
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_var_wxh( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
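; A minimal C sketch of the packed return value (pixel sum in the low 32 bits,
; sum of squared pixels in the high 32 bits, as assembled by VAR_END):
;     uint32_t sum = 0, sqr = 0;
;     for( int y = 0; y < h; y++, pix += stride )
;         for( int x = 0; x < w; x++ )
;         {
;             sum += pix[x];
;             sqr += pix[x] * pix[x];
;         }
;     return sum + ((uint64_t)sqr << 32);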
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
VAR_2ROW r1, 4
VAR_END 8, 8
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
VAR
INIT_XMM xop
VAR
+
+INIT_YMM avx2
+cglobal pixel_var_16x16, 2,4,7
+ VAR_START 0
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ pmovzxbw m0, [r0]
+ pmovzxbw m3, [r0+r1]
+ pmovzxbw m1, [r0+r1*2]
+ pmovzxbw m4, [r0+r3]
+ lea r0, [r0+r1*4]
+ VAR_CORE
+ dec r2d
+ jg .loop
+ vextracti128 xm0, m5, 1
+ vextracti128 xm1, m6, 1
+ paddw xm5, xm0
+ paddd xm6, xm1
+ HADDW xm5, xm2
+ HADDD xm6, xm1
+%if ARCH_X86_64
+ punpckldq xm5, xm6
+ movq rax, xm5
+%else
+ movd eax, xm5
+ movd edx, xm6
+%endif
+ RET
%endif ; !HIGH_BIT_DEPTH
-%macro VAR2_END 1
- HADDW m5, m7
- movd r1d, m5
+%macro VAR2_END 3
+ HADDW %2, xm1
+ movd r1d, %2
imul r1d, r1d
- HADDD m6, m1
+ HADDD %3, xm1
shr r1d, %1
- movd eax, m6
- mov [r4], eax
+ movd eax, %3
+ movd [r4], %3
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
+; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
;-----------------------------------------------------------------------------
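; A hedged C sketch for the 8x8 case (8x16 uses shift 7 instead of 6); *ssd
; receives the raw SSD and the return value is the variance of the difference:
;     int sum = 0, sqr = 0;
;     for( int y = 0; y < 8; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < 8; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             sum += d;
;             sqr += d * d;
;         }
;     *ssd = sqr;
;     return sqr - ((int64_t)sum * sum >> 6);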
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
VAR_START 0
mov r5d, %1
.loop:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
psubw m0, [r2]
add r2, r3
dec r5d
jg .loop
- VAR2_END %2
+ VAR2_END %2, m5, m6
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
VAR2_8x8_MMX 8, 6
VAR2_8x8_MMX 16, 7
VAR_START 1
mov r5d, %1/2
.loop:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1*2]
mova m2, [r2]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
- VAR2_END %2
+ VAR2_END %2, m5, m6
%endmacro
INIT_XMM sse2
VAR2_8x8_SSE2 8, 6
VAR2_8x8_SSE2 16, 7
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
lea r2, [r2+r3*2]
dec r5d
jg .loop
- VAR2_END %2
+ VAR2_END %2, m5, m6
%endmacro
INIT_XMM ssse3
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
+%macro VAR2_8x8_AVX2 2
+cglobal pixel_var2_8x%1, 5,6,6
+ pxor m3, m3 ; sum
+ pxor m4, m4 ; sum squared
+ mova m5, [hsub_mul]
+ mov r5d, %1/4
+.loop:
+ movq xm0, [r0]
+ movq xm1, [r2]
+ vinserti128 m0, m0, [r0+r1], 1
+ vinserti128 m1, m1, [r2+r3], 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ punpcklbw m0, m1
+ movq xm1, [r0]
+ movq xm2, [r2]
+ vinserti128 m1, m1, [r0+r1], 1
+ vinserti128 m2, m2, [r2+r3], 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ punpcklbw m1, m2
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ paddw m3, m0
+ paddw m3, m1
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ paddd m4, m0
+ paddd m4, m1
+ dec r5d
+ jg .loop
+ vextracti128 xm0, m3, 1
+ vextracti128 xm1, m4, 1
+ paddw xm3, xm0
+ paddd xm4, xm1
+ VAR2_END %2, xm3, xm4
+%endmacro
+
+INIT_YMM avx2
+VAR2_8x8_AVX2 8, 6
+VAR2_8x8_AVX2 16, 7
+
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
%if cpuflag(sse4)
; just use shufps on anything post conroe
shufps %1, %2, 0
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
punpcklqdq %1, %2
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
-%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
+%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro
+%macro LOAD_SUMSUB_16x2P_AVX2 9
+; 2*dst, 2*tmp, mul, 4*ptr
+ vbroadcasti128 m%1, [%6]
+ vbroadcasti128 m%3, [%7]
+ vbroadcasti128 m%2, [%8]
+ vbroadcasti128 m%4, [%9]
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
+%endmacro
+
+%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
+; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
+ LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
+ LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
+ mova xm%3, %6
+ mova xm%4, %8
+ mova xm%1, %5
+ mova xm%2, %7
+ vpermq m%3, m%3, q0011
+ vpermq m%4, m%4, q0011
+ vpermq m%1, m%1, q0011
+ vpermq m%2, m%2, q0011
+%endmacro
+
+%macro LOAD_SUMSUB8_16x2P_AVX2 9
+; 2*dst, 2*tmp, mul, 4*ptr
+ LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
+%endmacro
+
+%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
+; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
+ LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
+ LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
+
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
- %xdefine %%n n%1
+ %xdefine %%n nn%1
%assign offset %2*SIZEOF_PIXEL
LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
SWAP %%n, 4
%endmacro
+; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
-%ifidn %1, sse2
+%if %1
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
HADAMARD4_V %2, %3, %4, %5, %6
%else
SWAP %8, %2
%endif
-%ifidn %1, sse2
+%if %1
paddw m%8, m%4
%else
HADAMARD 1, max, %3, %5, %6, %7
%endmacro
%macro SATD_END_MMX 0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
movd eax, m0
%else ; !HIGH_BIT_DEPTH
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
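; What SATD computes, as a plain C sketch for one 4x4 block (pix1/stride1 and
; pix2/stride2 name the four arguments; larger sizes sum 4x4 subblocks). The
; assembly reaches the same value with packed Hadamard butterflies plus the
; identity |a+b| + |a-b| = 2*max(|a|,|b|), which folds in the final halving:
;     int d[4][4], t[4], sum = 0;
;     for( int y = 0; y < 4; y++ )
;         for( int x = 0; x < 4; x++ )
;             d[y][x] = pix1[x+y*stride1] - pix2[x+y*stride2];
;     for( int y = 0; y < 4; y++ )
;     {   /* 4-point Hadamard on each row */
;         t[0] = d[y][0]+d[y][1]; t[1] = d[y][0]-d[y][1];
;         t[2] = d[y][2]+d[y][3]; t[3] = d[y][2]-d[y][3];
;         d[y][0] = t[0]+t[2]; d[y][2] = t[0]-t[2];
;         d[y][1] = t[1]+t[3]; d[y][3] = t[1]-t[3];
;     }
;     for( int x = 0; x < 4; x++ )
;     {   /* ... then on each column, accumulating absolute coefficients */
;         t[0] = d[0][x]+d[1][x]; t[1] = d[0][x]-d[1][x];
;         t[2] = d[2][x]+d[3][x]; t[3] = d[2][x]-d[3][x];
;         sum += abs(t[0]+t[2]) + abs(t[0]-t[2]) + abs(t[1]+t[3]) + abs(t[1]-t[3]);
;     }
;     return sum >> 1;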
INIT_MMX mmx2
cglobal pixel_satd_16x4_internal
paddw m0, m1
ret
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
SATD_START_MMX
SATD_MxN_MMX 8, 16, 8
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
cglobal pixel_satd_16x16, 4,6
SATD_START_MMX
pxor m0, m0
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
-%macro SATD_START_SSE2 2
-%if cpuflag(ssse3)
+%macro SATD_START_SSE2 2-3 0
+ FIX_STRIDES r1, r3
+%if HIGH_BIT_DEPTH && %3
+ pxor %2, %2
+%elif cpuflag(ssse3) && notcpuflag(atom)
+%if mmsize==32
+ mova %2, [hmul_16p]
+%else
mova %2, [hmul_8p]
+%endif
%endif
lea r4, [3*r1]
lea r5, [3*r3]
pxor %1, %1
%endmacro
-%macro SATD_END_SSE2 1
- HADDW %1, m7
+%macro SATD_END_SSE2 1-2
+%if HIGH_BIT_DEPTH
+ HADDUW %1, xm0
+%if %0 == 2
+ paddd %1, %2
+%endif
+%else
+ HADDW %1, xm7
+%endif
movd eax, %1
RET
%endmacro
+%macro SATD_ACCUM 3
+%if HIGH_BIT_DEPTH
+ HADDUW %1, %2
+ paddd %3, %1
+ pxor %1, %1
+%endif
+%endmacro
+
%macro BACKUP_POINTERS 0
-%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+%if ARCH_X86_64
+%if WIN64
+ PUSH r7
+%endif
+ mov r6, r0
+ mov r7, r2
%endif
%endmacro
%macro RESTORE_AND_INC_POINTERS 0
-%ifdef ARCH_X86_64
- lea r0, [r10+8]
- lea r2, [r11+8]
+%if ARCH_X86_64
+ lea r0, [r6+8*SIZEOF_PIXEL]
+ lea r2, [r7+8*SIZEOF_PIXEL]
+%if WIN64
+ POP r7
+%endif
%else
mov r0, r0mp
mov r2, r2mp
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
%endif
%endmacro
-%macro SATD_4x8_SSE 2
+%macro SATD_4x8_SSE 3
+%if HIGH_BIT_DEPTH
+ movh m0, [r0+0*r1]
+ movh m4, [r2+0*r3]
+ movh m1, [r0+1*r1]
+ movh m5, [r2+1*r3]
+ movhps m0, [r0+4*r1]
+ movhps m4, [r2+4*r3]
+ movh m2, [r0+2*r1]
+ movh m6, [r2+2*r3]
+ psubw m0, m4
+ movh m3, [r0+r4]
+ movh m4, [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ movhps m1, [r0+1*r1]
+ movhps m5, [r2+1*r3]
+ movhps m2, [r0+2*r1]
+ movhps m6, [r2+2*r3]
+ psubw m1, m5
+ movhps m3, [r0+r4]
+ movhps m4, [r2+r5]
+ psubw m2, m6
+ psubw m3, m4
+%else ; !HIGH_BIT_DEPTH
movd m4, [r2]
movd m5, [r2+r3]
movd m6, [r2+2*r3]
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
-%if cpuflag(ssse3) && %1==1
+%if %1==0 && %2==1
mova m3, [hmul_4p]
DIFFOP 0, 4, 1, 5, 3
%else
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
-%if cpuflag(ssse3) && %1==1
+%if %1==0 && %2==1
mova m4, [hmul_4p]
DIFFOP 2, 6, 3, 5, 4
%else
DIFFOP 2, 6, 3, 5, 7
%endif
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
+%endif ; HIGH_BIT_DEPTH
+ SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
-%if cpuflag(ssse3)
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
+
+%if vertical==0 || HIGH_BIT_DEPTH
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
-%if cpuflag(ssse3)
+%if vertical==0
mova m7, [hmul_4p]
%endif
- SATD_4x8_SSE 0, swap
+ SATD_4x8_SSE vertical, 0, swap
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_4x16, 4, 6, 8
SATD_START_MMX
-%if cpuflag(ssse3)
+%if vertical==0
mova m7, [hmul_4p]
%endif
- SATD_4x8_SSE 0, swap
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- SATD_4x8_SSE 1, add
+ SATD_4x8_SSE vertical, 0, swap
+ lea r0, [r0+r1*2*SIZEOF_PIXEL]
+ lea r2, [r2+r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_8x8_internal
- LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+ SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
- LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+ SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
ret
-%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
+; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
+%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
- ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2?
- SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
- SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
+ ; always use horizontal mode here
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
+ SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
ret
cglobal pixel_satd_16x8, 4,6,12
SATD_START_SSE2 m10, m7
-%if notcpuflag(ssse3)
+%if vertical
mova m7, [pw_00ff]
%endif
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x16, 4,6,12
SATD_START_SSE2 m10, m7
-%if notcpuflag(ssse3)
+%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal
SATD_END_SSE2 m6
cglobal pixel_satd_16x16, 4,6,8
- SATD_START_SSE2 m6, m7
+ SATD_START_SSE2 m6, m7, 1
BACKUP_POINTERS
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ SATD_END_SSE2 m6, m7
%endif
cglobal pixel_satd_8x16, 4,6,8
%endmacro ; SATDS_SSE2
%macro SA8D_INTER 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define lh m10
%define rh m0
%else
%define lh m0
%define rh [esp+48]
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
paddd lh, rh
%else
%endmacro
%macro SA8D 0
-%ifdef HIGH_BIT_DEPTH
- %define vertical 1
-%else ; sse2 doesn't seem to like the horizontal way of doing things
- %define vertical (cpuflags == cpuflags_sse2)
-%endif
+; sse2 doesn't seem to like the horizontal way of doing things
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
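; sa8d is the 8x8 analogue of satd: the sum of absolute coefficients of an 8x8
; Hadamard transform of the difference block, normalized as (sum + 2) >> 2.
; (Hedged note: the amax butterflies below already fold in one halving, so the
; epilogue only needs the final "add eax,1 / shr eax,1".)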
cglobal pixel_sa8d_8x8_internal
- lea r10, [r0+4*r1]
- lea r11, [r2+4*r3]
+ lea r6, [r0+4*r1]
+ lea r7, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
- LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
+ LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
SAVE_MM_PERMUTATION
ret
-cglobal pixel_sa8d_8x8, 4,6,12
+cglobal pixel_sa8d_8x8, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
mova m7, [hmul_8p]
%endif
call pixel_sa8d_8x8_internal
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
shr eax, 1
RET
-cglobal pixel_sa8d_16x16, 4,6,12
+cglobal pixel_sa8d_16x16, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
call pixel_sa8d_8x8_internal ; pix[0]
add r2, 8*SIZEOF_PIXEL
add r0, 8*SIZEOF_PIXEL
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova m10, m0
call pixel_sa8d_8x8_internal ; pix[8*stride]
SA8D_INTER
SWAP 0, 10
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd eax, m0
lea r4, [3*r1]
lea r5, [3*r3]
call pixel_sa8d_8x8_internal
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
%endif
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
SA8D_INTER
%else ; !HIGH_BIT_DEPTH
paddusw m0, [esp+64-mmsize]
%endif ; !ARCH_X86_64
%endmacro ; SA8D
+;=============================================================================
+; SA8D_SATD
+;=============================================================================
+
+; %1: vertical/horizontal mode
+; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
+; m10: satd result
+; m6, m11-15: tmp regs
+%macro SA8D_SATD_8x4 5
+%if %1
+ LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+ HADAMARD 0, sumsub, %2, %3, 6
+ HADAMARD 0, sumsub, %4, %5, 6
+ SBUTTERFLY wd, %2, %3, 6
+ SBUTTERFLY wd, %4, %5, 6
+ HADAMARD2_2D %2, %4, %3, %5, 6, dq
+
+ mova m12, m%2
+ mova m13, m%3
+ mova m14, m%4
+ mova m15, m%5
+ HADAMARD 0, sumsub, %2, %3, 6
+ HADAMARD 0, sumsub, %4, %5, 6
+ SBUTTERFLY qdq, 12, 13, 6
+ HADAMARD 0, amax, 12, 13, 6
+ SBUTTERFLY qdq, 14, 15, 6
+ paddw m10, m12
+ HADAMARD 0, amax, 14, 15, 6
+ paddw m10, m14
+%else
+ LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+ HADAMARD4_V %2, %3, %4, %5, 6
+
+ pabsw m12, m%2 ; doing the abs first is a slight advantage
+ pabsw m14, m%4
+ pabsw m13, m%3
+ pabsw m15, m%5
+ HADAMARD 1, max, 12, 14, 6, 11
+ paddw m10, m12
+ HADAMARD 1, max, 13, 15, 6, 11
+ paddw m10, m13
+%endif
+%endmacro ; SA8D_SATD_8x4
+
+; %1: add spilled regs?
+; %2: spill regs?
+%macro SA8D_SATD_ACCUM 2
+%if HIGH_BIT_DEPTH
+ pmaddwd m10, [pw_1]
+ HADDUWD m0, m1
+%if %1
+ paddd m10, temp1
+ paddd m0, temp0
+%endif
+%if %2
+ mova temp1, m10
+ pxor m10, m10
+%endif
+%elif %1
+ paddw m0, temp0
+%endif
+%if %2
+ mova temp0, m0
+%endif
+%endmacro
+
+%macro SA8D_SATD 0
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
+cglobal pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_8x4 vertical, 0, 1, 2, 3
+ SA8D_SATD_8x4 vertical, 4, 5, 8, 9
+
+%if vertical ; sse2-style
+ HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
+ HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
+%else ; complete sa8d
+ SUMSUB_BADC w, 0, 4, 1, 5, 12
+ HADAMARD 2, sumsub, 0, 4, 12, 11
+ HADAMARD 2, sumsub, 1, 5, 12, 11
+ SUMSUB_BADC w, 2, 8, 3, 9, 12
+ HADAMARD 2, sumsub, 2, 8, 12, 11
+ HADAMARD 2, sumsub, 3, 9, 12, 11
+ HADAMARD 1, amax, 0, 4, 12, 11
+ HADAMARD 1, amax, 1, 5, 12, 4
+ HADAMARD 1, amax, 2, 8, 12, 4
+ HADAMARD 1, amax, 3, 9, 12, 4
+%endif
+
+ ; create sa8d sub results
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+
+ SAVE_MM_PERMUTATION
+ ret
+
+;-------------------------------------------------------------------------------
+; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
+;-------------------------------------------------------------------------------
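+; A hedged sketch of the return packing, as assembled by the epilogue below:
+;     return ((uint64_t)satd << 32) | ((sa8d + 1) >> 1);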
+cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
+ %define temp0 [rsp+0*mmsize]
+ %define temp1 [rsp+1*mmsize]
+ FIX_STRIDES r1, r3
+%if vertical==0
+ mova m7, [hmul_8p]
+%endif
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ pxor m10, m10
+
+%if mmsize==32
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 0, 1
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 0
+ vextracti128 xm1, m0, 1
+ vextracti128 xm2, m10, 1
+ paddw xm0, xm1
+ paddw xm10, xm2
+%else
+ lea r6, [r2+8*SIZEOF_PIXEL]
+ lea r7, [r0+8*SIZEOF_PIXEL]
+
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 0, 1
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 1
+
+ mov r0, r7
+ mov r2, r6
+
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 1
+ call pixel_sa8d_satd_8x8_internal
+ SA8D_SATD_ACCUM 1, 0
+%endif
+
+; xop already has fast horizontal sums
+%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
+ pmaddwd xm10, [pw_1]
+ HADDUWD xm0, xm1
+ phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2
+ pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
+ paddd xm0, xm1 ; sa8d sa8d satd satd
+ movd r0d, xm0
+ pextrd eax, xm0, 2
+%else
+%if HIGH_BIT_DEPTH
+ HADDD xm0, xm1
+ HADDD xm10, xm2
+%else
+ HADDUW xm0, xm1
+ HADDW xm10, xm2
+%endif
+ movd r0d, xm0
+ movd eax, xm10
+%endif
+ add r0d, 1
+ shl rax, 32
+ shr r0d, 1
+ or rax, r0
+ RET
+%endmacro ; SA8D_SATD
+
;=============================================================================
; INTRA SATD
;=============================================================================
; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
+%if HIGH_BIT_DEPTH
+ mova m0, [r0+0*FENC_STRIDEB]
+ mova m1, [r0+1*FENC_STRIDEB]
+ mova m2, [r0+2*FENC_STRIDEB]
+ mova m3, [r0+3*FENC_STRIDEB]
+%else
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
movd m1, [r0+1*FENC_STRIDE]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
+%endif
HADAMARD4_2D 0, 1, 2, 3, 4
SAVE_MM_PERMUTATION
ret
%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
%ifidn %1, top
- movd %3, [r1+%2-FDEC_STRIDE]
+%if HIGH_BIT_DEPTH
+ mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
+%else
+ movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
pxor %5, %5
punpcklbw %3, %5
+%endif
%else ; left
%ifnidn %2, 0
- shl %2d, 5 ; log(FDEC_STRIDE)
+ shl %2d, 5 ; log(FDEC_STRIDEB)
%endif
- movd %3, [r1+%2-4+1*FDEC_STRIDE]
- pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0
- pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2
- pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3
+ movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
+ pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
+%if HIGH_BIT_DEPTH == 0
psrlw %3, 8
+%endif
%ifnidn %2, 0
shr %2d, 5
%endif
%8 %3, %6
%endmacro
-%macro CLEAR_SUMS 0
-%ifdef ARCH_X86_64
- mov qword [sums+0], 0
- mov qword [sums+8], 0
- mov qword [sums+16], 0
-%else
- pxor m7, m7
- movq [sums+0], m7
- movq [sums+8], m7
- movq [sums+16], m7
-%endif
-%endmacro
-
; in: m1..m3
; out: m7
; clobber: m4..m6
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_4x4, 3,3
-%ifdef ARCH_X86_64
+%if UNIX64
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define left_1d rsp-16 ; size 8
%else
- ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
- SUB esp, 16
- %define top_1d esp+8
- %define left_1d esp
+ ; WIN64: stack is 16 byte aligned because abi says so
+ ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
+ SUB rsp, 16
+ %define top_1d rsp+8
+ %define left_1d rsp
%endif
call hadamard_load
movd [r2+0], m0 ; i4x4_v satd
movd [r2+4], m4 ; i4x4_h satd
movd [r2+8], m5 ; i4x4_dc satd
-%ifndef ARCH_X86_64
- ADD esp, 16
+%if UNIX64 == 0
+ ADD rsp, 16
%endif
RET
-%ifdef ARCH_X86_64
- %define t0 r10
- %define t2 r11
-%else
- %define t0 r0
- %define t2 r2
-%endif
-
;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_16x16, 0,5
- %assign stack_pad 88 + ((stack_offset+88+gprsize)&15)
+ %assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, stack_pad
-%define sums rsp+64 ; size 24
+%define sums rsp+64 ; size 56
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
movifnidn r1, r1mp
- CLEAR_SUMS
+
+ pxor m7, m7
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
+%if HIGH_BIT_DEPTH
+ mova [sums+24], m7
+ mova [sums+32], m7
+ mova [sums+40], m7
+ mova [sums+48], m7
+%endif
; 1D hadamards
- mov t0d, 12
- movd m6, [pw_64]
+ mov r3d, 12
+ movd m6, [pw_32]
.loop_edge:
- SCALAR_HADAMARD left, t0, m0, m1
- SCALAR_HADAMARD top, t0, m1, m2, m3
- paddw m6, m0
- paddw m6, m1
- sub t0d, 4
+ SCALAR_HADAMARD left, r3, m0, m1
+ SCALAR_HADAMARD top, r3, m1, m2, m3
+ pavgw m0, m1
+ paddw m6, m0
+ sub r3d, 4
jge .loop_edge
- psrlw m6, 3
- pand m6, [sw_f0] ; dc
+ psrlw m6, 2
+ pand m6, [sw_f0] ; dc
; 2D hadamards
- movifnidn r0, r0mp
- mov r3, -4
+ movifnidn r0, r0mp
+ mov r3, -4
.loop_y:
- mov r4, -4
+ mov r4, -4
.loop_x:
call hadamard_load
SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
pavgw m4, m7
pavgw m5, m7
- paddw m0, [sums+0] ; i16x16_v satd
- paddw m4, [sums+8] ; i16x16_h satd
+ paddw m0, [sums+ 0] ; i16x16_v satd
+ paddw m4, [sums+ 8] ; i16x16_h satd
paddw m5, [sums+16] ; i16x16_dc satd
- movq [sums+0], m0
- movq [sums+8], m4
- movq [sums+16], m5
+ mova [sums+ 0], m0
+ mova [sums+ 8], m4
+ mova [sums+16], m5
- add r0, 4
+ add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
- add r0, 4*FENC_STRIDE-16
+%if HIGH_BIT_DEPTH
+ psrld m7, m4, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m4, m7
+ psrld m7, m0, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m0, m7
+ paddd m4, [sums+32]
+ paddd m0, [sums+24]
+ mova [sums+32], m4
+ mova [sums+24], m0
+ pxor m7, m7
+ punpckhwd m3, m5, m7
+ punpcklwd m5, m7
+ paddd m3, [sums+48]
+ paddd m5, [sums+40]
+ mova [sums+48], m3
+ mova [sums+40], m5
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
+%endif
+ add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
inc r3
jl .loop_y
; horizontal sum
movifnidn r2, r2mp
- movq m2, [sums+16]
- movq m1, [sums+8]
- movq m0, [sums+0]
- movq m7, m2
- SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+%if HIGH_BIT_DEPTH
+ mova m1, m5
+ paddd m5, m3
+ HADDD m5, m7 ; DC satd
+ HADDD m4, m7 ; H satd
+ HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
+ psrld m0, 1
+ psrlq m1, 32 ; DC[1]
+ paddd m0, m3 ; DC[2]
+ psrlq m3, 32 ; DC[3]
+ paddd m0, m1
+ paddd m0, m3
+%else
+ mova m7, m5
+ SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
psrld m0, 1
pslld m7, 16
psrld m7, 16
- paddd m0, m2
+ paddd m0, m5
psubd m0, m7
- movd [r2+8], m2 ; i16x16_dc satd
- movd [r2+4], m1 ; i16x16_h satd
- movd [r2+0], m0 ; i16x16_v satd
- ADD rsp, stack_pad
+%endif
+ movd [r2+8], m5 ; i16x16_dc satd
+ movd [r2+4], m4 ; i16x16_h satd
+ movd [r2+0], m0 ; i16x16_v satd
+ ADD rsp, stack_pad
RET
+%if ARCH_X86_64
+ %define t0 r6
+%else
+ %define t0 r2
+%endif
+
;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
movifnidn r1, r1mp
- CLEAR_SUMS
+ pxor m7, m7
+ mova [sums+ 0], m7
+ mova [sums+ 8], m7
+ mova [sums+16], m7
; 1D hadamards
- mov t0d, 4
+ mov r3d, 4
.loop_edge:
- SCALAR_HADAMARD left, t0, m0, m1
- SCALAR_HADAMARD top, t0, m0, m1, m2
- sub t0d, 4
+ SCALAR_HADAMARD left, r3, m0, m1
+ SCALAR_HADAMARD top, r3, m0, m1, m2
+ sub r3d, 4
jge .loop_edge
; dc
- movzx t2d, word [left_1d+0]
+ movzx t0d, word [left_1d+0]
movzx r3d, word [top_1d+0]
movzx r4d, word [left_1d+8]
movzx r5d, word [top_1d+8]
- lea t2d, [t2 + r3 + 16]
+ lea t0d, [t0 + r3 + 16]
lea r3d, [r4 + r5 + 16]
- shr t2d, 1
+ shr t0d, 1
shr r3d, 1
add r4d, 8
add r5d, 8
- and t2d, -16 ; tl
+ and t0d, -16 ; tl
and r3d, -16 ; br
and r4d, -16 ; bl
and r5d, -16 ; tr
- mov [dc_1d+ 0], t2d ; tl
+ mov [dc_1d+ 0], t0d ; tl
mov [dc_1d+ 4], r5d ; tr
mov [dc_1d+ 8], r4d ; bl
mov [dc_1d+12], r3d ; br
movq [sums+8], m4
movq [sums+0], m5
- add r0, 4
+ add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
- add r0, 4*FENC_STRIDE-8
+ add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
add r5, 8
inc r3
jl .loop_y
movq m1, [sums+8]
movq m2, [sums+16]
movq m7, m0
+%if HIGH_BIT_DEPTH
+ psrlq m7, 16
+ HADDW m7, m3
+ SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
+ psrld m2, 1
+ paddd m2, m7
+%else
psrlq m7, 15
paddw m2, m7
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m2, 1
+%endif
movd [r2+0], m0 ; i8x8c_dc satd
movd [r2+4], m1 ; i8x8c_h satd
movd [r2+8], m2 ; i8x8c_v satd
%assign pad 0xc0-gprsize-(stack_offset&15)
%define pred_buf rsp
sub rsp, pad
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
INTRA_X9_PRED intrax9a, m8
%else
INTRA_X9_PRED intrax9a, [rsp+0xa0]
paddd m2, m3
paddd m4, m5
paddd m6, m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
SWAP 7, 8
pxor m8, m8
%define %%zero m8
RET
%endif ; cpuflag
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
psubw m1, m9
psubw m2, m10
psubw m3, m11
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
pshufd m1, m0, q0032
psubw m2, [fenc_buf+0x20]
.satd_8x4b:
psubw m3, [fenc_buf+0x30]
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
pshufd m1, m0, q0032
%define fenc13 m5
%define fenc46 m6
%define fenc57 m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define tmp m8
%assign padbase 0x0
%else
ADD rsp, pad
RET
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
; out: [tmp]=hadamard4, m0=satd
INIT_MMX mmx2
cglobal hadamard_ac_4x4
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1]
mova m2, [r0+r1*2]
ABSW2 m1, m3, m1, m3, m4, m5
HADAMARD 0, max, 0, 2, 4, 5
HADAMARD 0, max, 1, 3, 4, 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pmaddwd m0, m7
pmaddwd m1, m7
paddd m6, m0
ret
%macro AC_PREP 2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pmaddwd %1, %2
%endif
%endmacro
%macro AC_PADD 3
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
AC_PREP %2, %3
paddd %1, %2
%else
cglobal hadamard_ac_8x8
mova m6, [mask_ac4]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m7, [pw_1]
%else
pxor m7, m7
AC_PADD m5, m0, m7
sub r3, 40
mova [rsp+gprsize+8], m5 ; save satd
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pxor m6, m6
%endif
%rep 3
ABSW2 m1, m3, m1, m3, m4, m5
ABSW2 m0, m2, m0, m2, m4, m5
HADAMARD 0, max, 1, 3, 4, 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pand m0, [mask_ac4]
pmaddwd m1, m7
pmaddwd m0, m7
%macro HADAMARD_AC_WXH_SUM_MMX 2
mova m1, [rsp+1*mmsize]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if %1*%2 >= 128
paddd m0, [rsp+2*mmsize]
paddd m1, [rsp+3*mmsize]
movd edx, m0
movd eax, m1
shr edx, 1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
HADAMARD_AC_WXH_MMX 8, 8
%macro LOAD_INC_8x4W_SSE2 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movu m%1, [r0]
movu m%2, [r0+r1]
movu m%3, [r0+r1*2]
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal hadamard_ac_8x8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define spill0 m8
%define spill1 m9
%define spill2 m10
%else
%define spill0 [rsp+gprsize]
- %define spill1 [rsp+gprsize+16]
- %define spill2 [rsp+gprsize+32]
+ %define spill1 [rsp+gprsize+mmsize]
+ %define spill2 [rsp+gprsize+mmsize*2]
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%define vertical 1
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
%define vertical 0
;LOAD_INC loads sumsubs
mova m7, [hmul_8p]
AC_PADD m1, m2, [pw_1]
ABSW m2, m7, m7
AC_PADD m1, m3, [pw_1]
- mova m3, m7
AC_PADD m1, m2, [pw_1]
- mova m2, m6
+ paddw m3, m7, spill2
psubw m7, spill2
- paddw m3, spill2
- mova [rsp+gprsize+32], m1 ; save satd
- mova m1, m5
+ mova [rsp+gprsize+mmsize*2], m1 ; save satd
+ paddw m2, m6, spill1
psubw m6, spill1
- paddw m2, spill1
+ paddw m1, m5, spill0
psubw m5, spill0
- paddw m1, spill0
%assign %%x 2
%if vertical
%assign %%x 4
AC_PREP m2, [pw_1]
AC_PADD m2, m3, [pw_1]
AC_PADD m2, m1, [pw_1]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
paddd m2, m2
%else
paddw m2, m2
ABSW m0, m0, m7
AC_PADD m2, m4, [pw_1]
AC_PADD m2, m0, [pw_1]
- mova [rsp+gprsize+16], m2 ; save sa8d
+ mova [rsp+gprsize+mmsize], m2 ; save sa8d
SWAP 0, 2
SAVE_MM_PERMUTATION
ret
HADAMARD_AC_WXH_SSE2 16, 16
-HADAMARD_AC_WXH_SSE2 8, 16
HADAMARD_AC_WXH_SSE2 16, 8
+%if mmsize <= 16
+HADAMARD_AC_WXH_SSE2 8, 16
HADAMARD_AC_WXH_SSE2 8, 8
+%endif
%endmacro ; HADAMARD_AC_SSE2
%macro HADAMARD_AC_WXH_SUM_SSE2 2
mova m1, [rsp+2*mmsize]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if %1*%2 >= 128
paddd m0, [rsp+3*mmsize]
paddd m1, [rsp+4*mmsize]
paddd m1, [rsp+8*mmsize]
psrld m0, 1
%endif
- HADDD m0, m2
- HADDD m1, m3
+ HADDD xm0, xm2
+ HADDD xm1, xm3
%else ; !HIGH_BIT_DEPTH
-%if %1*%2 >= 128
+%if %1*%2*16/mmsize >= 128
paddusw m0, [rsp+3*mmsize]
paddusw m1, [rsp+4*mmsize]
%endif
-%if %1*%2 == 256
+%if %1*%2*16/mmsize == 256
paddusw m0, [rsp+5*mmsize]
paddusw m1, [rsp+6*mmsize]
paddusw m0, [rsp+7*mmsize]
paddusw m1, [rsp+8*mmsize]
psrlw m0, 1
%endif
- HADDUW m0, m2
- HADDW m1, m3
+%if mmsize==32
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ paddusw xm0, xm2
+ paddusw xm1, xm3
+%endif
+ HADDUW xm0, xm2
+ HADDW xm1, xm3
%endif ; HIGH_BIT_DEPTH
%endmacro
; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
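; (hadamard_ac = satd/sa8d evaluated on the source block itself rather than a
; difference, with the DC coefficient of each transform masked out via mask_ac*;
; the pair is packed into one 64-bit return value as documented above.)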
%macro HADAMARD_AC_WXH_SSE2 2
-cglobal pixel_hadamard_ac_%1x%2, 2,3,11
- %assign pad 16-gprsize-(stack_offset&15)
+cglobal pixel_hadamard_ac_%1x%2, 2,4,11
%define ysub r1
FIX_STRIDES r1
- sub rsp, 48+pad
- lea r2, [r1*3]
+ mov r3, rsp
+ and rsp, ~(mmsize-1)
+ sub rsp, mmsize*3
+ lea r2, [r1*3]
call hadamard_ac_8x8
%if %2==16
%define ysub r2
- lea r0, [r0+r1*4]
- sub rsp, 32
+ lea r0, [r0+r1*4]
+ sub rsp, mmsize*2
call hadamard_ac_8x8
%endif
-%if %1==16
+%if %1==16 && mmsize <= 16
neg ysub
- sub rsp, 32
- lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
+ sub rsp, mmsize*2
+ lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
neg ysub
call hadamard_ac_8x8
%if %2==16
- lea r0, [r0+r1*4]
- sub rsp, 32
+ lea r0, [r0+r1*4]
+ sub rsp, mmsize*2
call hadamard_ac_8x8
%endif
%endif
HADAMARD_AC_WXH_SUM_SSE2 %1, %2
- movd edx, m0
- movd eax, m1
- shr edx, 2 - (%1*%2 >> 8)
+ movd edx, xm0
+ movd eax, xm1
+ shr edx, 2 - (%1*%2*16/mmsize >> 8)
shr eax, 1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
- add rsp, 16+%1*%2/2+pad
+ mov rsp, r3
RET
%endmacro ; HADAMARD_AC_WXH_SSE2
; instantiate satds
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
cextern pixel_sa8d_8x8_internal_mmx2
INIT_MMX mmx2
SA8D
INIT_XMM sse2
SA8D
SATDS_SSE2
-%ifndef HIGH_BIT_DEPTH
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%if HIGH_BIT_DEPTH == 0
INTRA_SA8D_SSE2
+%endif
INIT_MMX mmx2
INTRA_X3_MMX
-%endif
INIT_XMM sse2
HADAMARD_AC_SSE2
+%if HIGH_BIT_DEPTH == 0
+INIT_XMM ssse3,atom
+SATDS_SSE2
+SA8D
+HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%endif
+
%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
-%ifndef HIGH_BIT_DEPTH
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
%endif
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_MMX ssse3
INTRA_X3_MMX
%endif
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
-%ifndef HIGH_BIT_DEPTH
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
%endif
+; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
+; it's effectively free.
+%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
INIT_XMM avx
SATDS_SSE2
SA8D
-%ifndef HIGH_BIT_DEPTH
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
%endif
INIT_XMM xop
SATDS_SSE2
SA8D
-%ifndef HIGH_BIT_DEPTH
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
%endif
HADAMARD_AC_SSE2
+
+%if HIGH_BIT_DEPTH == 0
+%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
+%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
+%define TRANS TRANS_SSE4
+INIT_YMM avx2
+HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+
+%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
+ movq xm%1, [r0]
+ movq xm%3, [r2]
+ movq xm%2, [r0+r1]
+ movq xm%4, [r2+r3]
+ vinserti128 m%1, m%1, [r0+4*r1], 1
+ vinserti128 m%3, m%3, [r2+4*r3], 1
+ vinserti128 m%2, m%2, [r0+r4], 1
+ vinserti128 m%4, m%4, [r2+r5], 1
+ punpcklqdq m%1, m%1
+ punpcklqdq m%3, m%3
+ punpcklqdq m%2, m%2
+ punpcklqdq m%4, m%4
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+
+ movq xm%3, [r0]
+ movq xm%5, [r2]
+ movq xm%4, [r0+r1]
+ movq xm%6, [r2+r3]
+ vinserti128 m%3, m%3, [r0+4*r1], 1
+ vinserti128 m%5, m%5, [r2+4*r3], 1
+ vinserti128 m%4, m%4, [r0+r4], 1
+ vinserti128 m%6, m%6, [r2+r5], 1
+ punpcklqdq m%3, m%3
+ punpcklqdq m%5, m%5
+ punpcklqdq m%4, m%4
+ punpcklqdq m%6, m%6
+ DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
+%endmacro
+
+%macro SATD_START_AVX2 2-3 0
+ FIX_STRIDES r1, r3
+%if %3
+ mova %2, [hmul_8p]
+ lea r4, [5*r1]
+ lea r5, [5*r3]
+%else
+ mova %2, [hmul_16p]
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+%endif
+ pxor %1, %1
+%endmacro
+
+%define TRANS TRANS_SSE4
+INIT_YMM avx2
+cglobal pixel_satd_16x8_internal
+ LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ ret
+
+cglobal pixel_satd_16x16, 4,6,8
+ SATD_START_AVX2 m6, m7
+ call pixel_satd_16x8_internal
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+pixel_satd_16x8_internal2:
+ call pixel_satd_16x8_internal
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+cglobal pixel_satd_16x8, 4,6,8
+ SATD_START_AVX2 m6, m7
+ jmp pixel_satd_16x8_internal2
+
+cglobal pixel_satd_8x8_internal
+ LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ ret
+
+cglobal pixel_satd_8x16, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_satd_8x8_internal
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call pixel_satd_8x8_internal
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+cglobal pixel_satd_8x8, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_satd_8x8_internal
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+cglobal pixel_sa8d_8x8_internal
+ LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
+ HADAMARD4_V 0, 1, 2, 3, 4
+ HADAMARD 8, sumsub, 0, 1, 4, 5
+ HADAMARD 8, sumsub, 2, 3, 4, 5
+ HADAMARD 2, sumsub, 0, 1, 4, 5
+ HADAMARD 2, sumsub, 2, 3, 4, 5
+ HADAMARD 1, amax, 0, 1, 4, 5
+ HADAMARD 1, amax, 2, 3, 4, 5
+ paddw m6, m0
+ paddw m6, m2
+ ret
+
+cglobal pixel_sa8d_8x8, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_sa8d_8x8_internal
+ vextracti128 xm1, m6, 1
+ paddw xm6, xm1
+ HADDW xm6, xm1
+ movd eax, xm6
+ add eax, 1
+ shr eax, 1
+ RET
+
+cglobal intra_sad_x9_8x8, 5,7,8
+ %define pred(i,j) [rsp+i*0x40+j*0x20]
+
+ mov r6, rsp
+ and rsp, ~31
+ sub rsp, 0x240
+ movu m5, [r0+0*FENC_STRIDE]
+ movu m6, [r0+4*FENC_STRIDE]
+ punpcklqdq m5, [r0+2*FENC_STRIDE]
+ punpcklqdq m6, [r0+6*FENC_STRIDE]
+
+ ; save instruction size: avoid 4-byte memory offsets
+ lea r0, [intra8x9_h1+128]
+ %define off(m) (r0+m-(intra8x9_h1+128))
+
+ vpbroadcastq m0, [r2+16]
+ psadbw m4, m0, m5
+ psadbw m2, m0, m6
+ mova pred(0,0), m0
+ mova pred(0,1), m0
+ paddw m4, m2
+
+ vpbroadcastq m1, [r2+7]
+ pshufb m3, m1, [off(intra8x9_h1)]
+ pshufb m2, m1, [off(intra8x9_h3)]
+ mova pred(1,0), m3
+ mova pred(1,1), m2
+ psadbw m3, m5
+ psadbw m2, m6
+ paddw m3, m2
+
+ lea r5, [rsp+0x100]
+ %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
+
+ ; combine the first two
+ pslldq m3, 2
+ por m4, m3
+
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ pshufb m0, m2
+ mova pred(2,0), m0
+ mova pred(2,1), m0
+ psadbw m3, m0, m5
+ psadbw m2, m0, m6
+ paddw m3, m2
+
+ pslldq m3, 4
+ por m4, m3
+
+ vbroadcasti128 m0, [r2+16]
+ vbroadcasti128 m2, [r2+17]
+ pslldq m1, m0, 1
+ pavgb m3, m0, m2
+ PRED4x4_LOWPASS m0, m1, m2, m0, m7
+ pshufb m1, m0, [off(intra8x9_ddl1)]
+ pshufb m2, m0, [off(intra8x9_ddl3)]
+ mova pred(3,0), m1
+ mova pred(3,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 6
+ por m4, m1
+ vextracti128 xm1, m4, 1
+ paddw xm4, xm1
+ mova [r4], xm4
+
+ ; for later
+ vinserti128 m7, m3, xm0, 1
+
+ vbroadcasti128 m2, [r2+8]
+ vbroadcasti128 m0, [r2+7]
+ vbroadcasti128 m1, [r2+6]
+ pavgb m3, m2, m0
+ PRED4x4_LOWPASS m0, m1, m2, m0, m4
+ pshufb m1, m0, [off(intra8x9_ddr1)]
+ pshufb m2, m0, [off(intra8x9_ddr3)]
+ mova pred(4,0), m1
+ mova pred(4,1), m2
+ psadbw m4, m1, m5
+ psadbw m2, m6
+ paddw m4, m2
+
+ add r0, 256
+ add r5, 0xC0
+ %define off(m) (r0+m-(intra8x9_h1+256+128))
+ %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
+
+ vpblendd m2, m3, m0, 11110011b
+ pshufb m1, m2, [off(intra8x9_vr1)]
+ pshufb m2, m2, [off(intra8x9_vr3)]
+ mova pred(5,0), m1
+ mova pred(5,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 2
+ por m4, m1
+
+ psrldq m2, m3, 4
+ pblendw m2, m0, q3330
+ punpcklbw m0, m3
+ pshufb m1, m2, [off(intra8x9_hd1)]
+ pshufb m2, m0, [off(intra8x9_hd3)]
+ mova pred(6,0), m1
+ mova pred(6,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 4
+ por m4, m1
+
+ pshufb m1, m7, [off(intra8x9_vl1)]
+ pshufb m2, m7, [off(intra8x9_vl3)]
+ mova pred(7,0), m1
+ mova pred(7,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 6
+ por m4, m1
+ vextracti128 xm1, m4, 1
+ paddw xm4, xm1
+ mova xm3, [r4]
+ SBUTTERFLY qdq, 3, 4, 7
+ paddw xm3, xm4
+
+ pslldq m1, m0, 1
+ vpbroadcastd m0, [r2+7]
+ palignr m0, m1, 1
+ pshufb m1, m0, [off(intra8x9_hu1)]
+ pshufb m2, m0, [off(intra8x9_hu3)]
+ mova pred(8,0), m1
+ mova pred(8,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+ vextracti128 xm2, m1, 1
+ paddw xm1, xm2
+ movhlps xm2, xm1
+ paddw xm1, xm2
+ movd r2d, xm1
+
+ paddw xm3, [r3]
+ mova [r4], xm3
+ add r2w, word [r3+16]
+ mov [r4+16], r2w
+
+ phminposuw xm3, xm3
+ movd r3d, xm3
+ add r2d, 8<<16
+ cmp r3w, r2w
+ cmovg r3d, r2d
+
+ mov r2d, r3d
+ shr r3, 16
+ shl r3, 6
+ add r1, 4*FDEC_STRIDE
+ mova xm0, [rsp+r3+0x00]
+ mova xm1, [rsp+r3+0x10]
+ mova xm2, [rsp+r3+0x20]
+ mova xm3, [rsp+r3+0x30]
+ movq [r1+FDEC_STRIDE*-4], xm0
+ movhps [r1+FDEC_STRIDE*-2], xm0
+ movq [r1+FDEC_STRIDE*-3], xm1
+ movhps [r1+FDEC_STRIDE*-1], xm1
+ movq [r1+FDEC_STRIDE* 0], xm2
+ movhps [r1+FDEC_STRIDE* 2], xm2
+ movq [r1+FDEC_STRIDE* 1], xm3
+ movhps [r1+FDEC_STRIDE* 3], xm3
+ mov rsp, r6
+ mov eax, r2d
+ RET
+%endif ; HIGH_BIT_DEPTH
+
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
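; A minimal C sketch of the per-call output: for two horizontally adjacent 4x4
; blocks, sums[z] = { sum(pix1), sum(pix2), sum(pix1^2 + pix2^2), sum(pix1*pix2) }:
;     for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x+y*stride1], b = pix2[x+y*stride2];
;                 s1 += a; s2 += b; ss += a*a + b*b; s12 += a*b;
;             }
;         sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;     }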
%macro SSIM_ITER 1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
movdqu m6, [r2+(%1&1)*r3]
%else
pmaddwd m7, m5, m6
pmaddwd m5, m5
pmaddwd m6, m6
-%if %1==0
- SWAP 3, 5
- SWAP 4, 7
-%else
- paddd m3, m5
- paddd m4, m7
-%endif
+ ACCUM paddd, 3, 5, %1
+ ACCUM paddd, 4, 7, %1
paddd m3, m6
%endmacro
punpckhdq m5, m3, m4
punpckldq m3, m4
-%ifdef UNIX64
+%if UNIX64
%define t0 r4
%else
%define t0 rax
addps m0, m4
pshuflw m4, m0, q0032
addss m0, m4
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
movd r0m, m0
fld dword r0m
%endif
INIT_XMM avx
SSIM
+;-----------------------------------------------------------------------------
+; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+;-----------------------------------------------------------------------------
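+; A minimal C sketch (absolute sum of differences over an 8-wide block, i.e.
+; the bias between the two blocks rather than the distortion):
+;     int sum = 0;
+;     for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
+;         for( int x = 0; x < 8; x++ )
+;             sum += pix1[x] - pix2[x];
+;     return abs( sum );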
+%macro ASD8 0
+cglobal pixel_asd8, 5,5
+ pxor m0, m0
+ pxor m1, m1
+.loop:
+%if HIGH_BIT_DEPTH
+ paddw m0, [r0]
+ paddw m1, [r2]
+ paddw m0, [r0+2*r1]
+ paddw m1, [r2+2*r3]
+ lea r0, [r0+4*r1]
+ paddw m0, [r0]
+ paddw m1, [r2+4*r3]
+ lea r2, [r2+4*r3]
+ paddw m0, [r0+2*r1]
+ paddw m1, [r2+2*r3]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%else
+ movq m2, [r0]
+ movq m3, [r2]
+ movhps m2, [r0+r1]
+ movhps m3, [r2+r3]
+ lea r0, [r0+2*r1]
+ psadbw m2, m1
+ psadbw m3, m1
+ movq m4, [r0]
+ movq m5, [r2+2*r3]
+ lea r2, [r2+2*r3]
+ movhps m4, [r0+r1]
+ movhps m5, [r2+r3]
+ lea r0, [r0+2*r1]
+ paddw m0, m2
+ psubw m0, m3
+ psadbw m4, m1
+ psadbw m5, m1
+ lea r2, [r2+2*r3]
+ paddw m0, m4
+ psubw m0, m5
+%endif
+ sub r4d, 4
+ jg .loop
+%if HIGH_BIT_DEPTH
+ psubw m0, m1
+ HADDW m0, m1
+ ABSD m1, m0
+%else
+ movhlps m1, m0
+ paddw m0, m1
+ ABSW m1, m0
+%endif
+ movd eax, m1
+ RET
+%endmacro
+
+INIT_XMM sse2
+ASD8
+INIT_XMM ssse3
+ASD8
+%if HIGH_BIT_DEPTH
+INIT_XMM xop
+ASD8
+%endif
+
;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_START 0
-%ifdef WIN64
- movsxd r5, r5d
+%if UNIX64
+ movsxd r5, r5d
+%else
+ mov r5d, r5m
%endif
- mov r0d, r5d
- lea r6, [r4+r5+15]
- and r6, ~15;
+ mov r0d, r5d
+ lea r6, [r4+r5+(mmsize-1)]
+ and r6, ~(mmsize-1)
shl r2d, 1
%endmacro
add r1, 8*%1
add r3, 8*%1
add r6, 4*%1
- sub r0d, 4*%1
+ sub r0d, 4*%1
jg .loop
WIN64_RESTORE_XMM rsp
- jmp ads_mvs
+%if mmsize==32
+ vzeroupper
+%endif
+ lea r6, [r4+r5+(mmsize-1)]
+ and r6, ~(mmsize-1)
+%if cpuflag(ssse3)
+ jmp ads_mvs_ssse3
+%else
+ jmp ads_mvs_mmx
+%endif
%endmacro
;-----------------------------------------------------------------------------
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
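; A rough C equivalent of the full ads4 path; the routines below only produce
; the per-candidate thresholded byte masks, and pixel_ads_mvs then turns those
; masks into the mvs[] list:
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;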
INIT_MMX mmx2
-cglobal pixel_ads4, 6,7
- movq mm6, [r0]
- movq mm4, [r0+8]
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, q2222
- pshufw mm5, mm4, 0
- pshufw mm4, mm4, q2222
+cglobal pixel_ads4, 5,7
+ mova m6, [r0]
+ mova m4, [r0+8]
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
+ pshufw m5, m4, 0
+ pshufw m4, m4, q2222
ADS_START
.loop:
- movq mm0, [r1]
- movq mm1, [r1+16]
- psubw mm0, mm7
- psubw mm1, mm6
- ABSW mm0, mm0, mm2
- ABSW mm1, mm1, mm3
- movq mm2, [r1+r2]
- movq mm3, [r1+r2+16]
- psubw mm2, mm5
- psubw mm3, mm4
- paddw mm0, mm1
- ABSW mm2, mm2, mm1
- ABSW mm3, mm3, mm1
- paddw mm0, mm2
- paddw mm0, mm3
- pshufw mm1, r6m, 0
- paddusw mm0, [r3]
- psubusw mm1, mm0
- packsswb mm1, mm1
- movd [r6], mm1
+ movu m0, [r1]
+ movu m1, [r1+16]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ movu m2, [r1+r2]
+ movu m3, [r1+r2+16]
+ psubw m2, m5
+ psubw m3, m4
+ paddw m0, m1
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ pshufw m1, r6m, 0
+ paddusw m0, [r3]
+ psubusw m1, m0
+ packsswb m1, m1
+ movd [r6], m1
ADS_END 1
-cglobal pixel_ads2, 6,7
- movq mm6, [r0]
- pshufw mm5, r6m, 0
- pshufw mm7, mm6, 0
- pshufw mm6, mm6, q2222
+cglobal pixel_ads2, 5,7
+ mova m6, [r0]
+ pshufw m5, r6m, 0
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
ADS_START
.loop:
- movq mm0, [r1]
- movq mm1, [r1+r2]
- psubw mm0, mm7
- psubw mm1, mm6
- ABSW mm0, mm0, mm2
- ABSW mm1, mm1, mm3
- paddw mm0, mm1
- paddusw mm0, [r3]
- movq mm4, mm5
- psubusw mm4, mm0
- packsswb mm4, mm4
- movd [r6], mm4
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ paddw m0, m1
+ paddusw m0, [r3]
+ mova m4, m5
+ psubusw m4, m0
+ packsswb m4, m4
+ movd [r6], m4
ADS_END 1
-cglobal pixel_ads1, 6,7
- pshufw mm7, [r0], 0
- pshufw mm6, r6m, 0
+cglobal pixel_ads1, 5,7
+ pshufw m7, [r0], 0
+ pshufw m6, r6m, 0
ADS_START
.loop:
- movq mm0, [r1]
- movq mm1, [r1+8]
- psubw mm0, mm7
- psubw mm1, mm7
- ABSW mm0, mm0, mm2
- ABSW mm1, mm1, mm3
- paddusw mm0, [r3]
- paddusw mm1, [r3+8]
- movq mm4, mm6
- movq mm5, mm6
- psubusw mm4, mm0
- psubusw mm5, mm1
- packsswb mm4, mm5
- movq [r6], mm4
+ movu m0, [r1]
+ movu m1, [r1+8]
+ psubw m0, m7
+ psubw m1, m7
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ paddusw m0, [r3]
+ paddusw m1, [r3+8]
+ mova m4, m6
+ mova m5, m6
+ psubusw m4, m0
+ psubusw m5, m1
+ packsswb m4, m5
+ mova [r6], m4
ADS_END 2
%macro ADS_XMM 0
-cglobal pixel_ads4, 6,7,12
- movdqa xmm4, [r0]
- pshuflw xmm7, xmm4, 0
- pshuflw xmm6, xmm4, q2222
- pshufhw xmm5, xmm4, 0
- pshufhw xmm4, xmm4, q2222
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpckhqdq xmm5, xmm5
- punpckhqdq xmm4, xmm4
-%ifdef ARCH_X86_64
- pshuflw xmm8, r6m, 0
- punpcklqdq xmm8, xmm8
+%if mmsize==32
+cglobal pixel_ads4, 5,7,8
+ vpbroadcastw m7, [r0+ 0]
+ vpbroadcastw m6, [r0+ 4]
+ vpbroadcastw m5, [r0+ 8]
+ vpbroadcastw m4, [r0+12]
+%else
+cglobal pixel_ads4, 5,7,12
+ mova m4, [r0]
+ pshuflw m7, m4, q0000
+ pshuflw m6, m4, q2222
+ pshufhw m5, m4, q0000
+ pshufhw m4, m4, q2222
+ punpcklqdq m7, m7
+ punpcklqdq m6, m6
+ punpckhqdq m5, m5
+ punpckhqdq m4, m4
+%endif
+%if ARCH_X86_64 && mmsize == 16
+ movd m8, r6m
+ SPLATW m8, m8
ADS_START
- movdqu xmm10, [r1]
- movdqu xmm11, [r1+r2]
+ movu m10, [r1]
+ movu m11, [r1+r2]
.loop:
- psubw xmm0, xmm10, xmm7
- movdqu xmm10, [r1+16]
- psubw xmm1, xmm10, xmm6
- ABSW xmm0, xmm0, xmm2
- ABSW xmm1, xmm1, xmm3
- psubw xmm2, xmm11, xmm5
- movdqu xmm11, [r1+r2+16]
- paddw xmm0, xmm1
- psubw xmm3, xmm11, xmm4
- movdqu xmm9, [r3]
- ABSW xmm2, xmm2, xmm1
- ABSW xmm3, xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- paddusw xmm0, xmm9
- psubusw xmm1, xmm8, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
+ psubw m0, m10, m7
+ movu m10, [r1+16]
+ psubw m1, m10, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ psubw m2, m11, m5
+ movu m11, [r1+r2+16]
+ paddw m0, m1
+ psubw m3, m11, m4
+ movu m9, [r3]
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddusw m0, m9
+ psubusw m1, m8, m0
%else
ADS_START
.loop:
- movdqu xmm0, [r1]
- movdqu xmm1, [r1+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- ABSW xmm0, xmm0, xmm2
- ABSW xmm1, xmm1, xmm3
- movdqu xmm2, [r1+r2]
- movdqu xmm3, [r1+r2+16]
- psubw xmm2, xmm5
- psubw xmm3, xmm4
- paddw xmm0, xmm1
- ABSW xmm2, xmm2, xmm1
- ABSW xmm3, xmm3, xmm1
- paddw xmm0, xmm2
- paddw xmm0, xmm3
- movd xmm1, r6m
- movdqu xmm2, [r3]
- pshuflw xmm1, xmm1, 0
- punpcklqdq xmm1, xmm1
- paddusw xmm0, xmm2
- psubusw xmm1, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
+ movu m0, [r1]
+ movu m1, [r1+16]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ movu m2, [r1+r2]
+ movu m3, [r1+r2+16]
+ psubw m2, m5
+ psubw m3, m4
+ paddw m0, m1
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ movu m2, [r3]
+%if mmsize==32
+ vpbroadcastw m1, r6m
+%else
+ movd m1, r6m
+ pshuflw m1, m1, 0
+ punpcklqdq m1, m1
+%endif
+ paddusw m0, m2
+ psubusw m1, m0
%endif ; ARCH
- ADS_END 2
+ packsswb m1, m1
+%if mmsize==32
+ vpermq m1, m1, q3120
+ mova [r6], xm1
+%else
+ movh [r6], m1
+%endif
+ ADS_END mmsize/8
-cglobal pixel_ads2, 6,7,8
- movq xmm6, [r0]
- movd xmm5, r6m
- pshuflw xmm7, xmm6, 0
- pshuflw xmm6, xmm6, q2222
- pshuflw xmm5, xmm5, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
- punpcklqdq xmm5, xmm5
+cglobal pixel_ads2, 5,7,8
+%if mmsize==32
+ vpbroadcastw m7, [r0+0]
+ vpbroadcastw m6, [r0+4]
+ vpbroadcastw m5, r6m
+%else
+ movq m6, [r0]
+ movd m5, r6m
+ pshuflw m7, m6, 0
+ pshuflw m6, m6, q2222
+ pshuflw m5, m5, 0
+ punpcklqdq m7, m7
+ punpcklqdq m6, m6
+ punpcklqdq m5, m5
+%endif
ADS_START
.loop:
- movdqu xmm0, [r1]
- movdqu xmm1, [r1+r2]
- psubw xmm0, xmm7
- psubw xmm1, xmm6
- movdqu xmm4, [r3]
- ABSW xmm0, xmm0, xmm2
- ABSW xmm1, xmm1, xmm3
- paddw xmm0, xmm1
- paddusw xmm0, xmm4
- psubusw xmm1, xmm5, xmm0
- packsswb xmm1, xmm1
- movq [r6], xmm1
- ADS_END 2
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ psubw m0, m7
+ psubw m1, m6
+ movu m4, [r3]
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ paddw m0, m1
+ paddusw m0, m4
+ psubusw m1, m5, m0
+ packsswb m1, m1
+%if mmsize==32
+ vpermq m1, m1, q3120
+ mova [r6], xm1
+%else
+ movh [r6], m1
+%endif
+ ADS_END mmsize/8
-cglobal pixel_ads1, 6,7,8
- movd xmm7, [r0]
- movd xmm6, r6m
- pshuflw xmm7, xmm7, 0
- pshuflw xmm6, xmm6, 0
- punpcklqdq xmm7, xmm7
- punpcklqdq xmm6, xmm6
+cglobal pixel_ads1, 5,7,8
+%if mmsize==32
+ vpbroadcastw m7, [r0]
+ vpbroadcastw m6, r6m
+%else
+ movd m7, [r0]
+ movd m6, r6m
+ pshuflw m7, m7, 0
+ pshuflw m6, m6, 0
+ punpcklqdq m7, m7
+ punpcklqdq m6, m6
+%endif
ADS_START
.loop:
- movdqu xmm0, [r1]
- movdqu xmm1, [r1+16]
- psubw xmm0, xmm7
- psubw xmm1, xmm7
- movdqu xmm2, [r3]
- movdqu xmm3, [r3+16]
- ABSW xmm0, xmm0, xmm4
- ABSW xmm1, xmm1, xmm5
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- psubusw xmm4, xmm6, xmm0
- psubusw xmm5, xmm6, xmm1
- packsswb xmm4, xmm5
- movdqa [r6], xmm4
- ADS_END 4
+ movu m0, [r1]
+ movu m1, [r1+mmsize]
+ psubw m0, m7
+ psubw m1, m7
+ movu m2, [r3]
+ movu m3, [r3+mmsize]
+ ABSW m0, m0, m4
+ ABSW m1, m1, m5
+ paddusw m0, m2
+ paddusw m1, m3
+ psubusw m4, m6, m0
+ psubusw m5, m6, m1
+ packsswb m4, m5
+%if mmsize==32
+ vpermq m4, m4, q3120
+%endif
+ mova [r6], m4
+ ADS_END mmsize/4
%endmacro
INIT_XMM sse2
ADS_XMM
INIT_XMM avx
ADS_XMM
+INIT_YMM avx2
+ADS_XMM
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
inc r1d
%endmacro
-INIT_MMX
+INIT_MMX mmx
cglobal pixel_ads_mvs, 0,7,0
-ads_mvs:
- lea r6, [r4+r5+15]
- and r6, ~15;
+ads_mvs_mmx:
; mvs = r4
; masks = r6
; width = r5
jge .end
.loopi:
mov r2, [r6+r1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
test r2, r2
%else
mov r3, r2
TEST 1
TEST 2
TEST 3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shr r2, 32
%else
mov r2d, [r6+r1]
.end:
movifnidn eax, r0d
RET
+
+INIT_XMM ssse3
+cglobal pixel_ads_mvs, 0,7,0
+ads_mvs_ssse3:
+ mova m3, [pw_8]
+ mova m4, [pw_76543210]
+ pxor m5, m5
+ add r5, r6
+ xor r0d, r0d ; nmv
+ mov [r5], r0d
+%ifdef PIC
+ lea r1, [$$]
+ %define GLOBAL +r1-$$
+%else
+ %define GLOBAL
+%endif
+.loop:
+ movh m0, [r6]
+ pcmpeqb m0, m5
+ pmovmskb r2d, m0
+ xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
+ movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
+ add r2d, r2d
+ ; shuffle counters based on mv mask
+ pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
+ movu [r4+r0*2], m2
+ add r0d, r3d
+ paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+ add r6, 8
+ cmp r6, r5
+ jl .loop
+ movifnidn eax, r0d
+ RET