;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
-%include "x86inc.asm"
-%include "x86util.asm"
+%include "libavutil/x86/x86util.asm"
SECTION_RODATA
%endif
%endmacro
-%macro DEBLOCK_LUMA 1
+%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
+cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
mov r3, 32/mmsize
mov r2, r0
sub r0, r1
ADD rsp, pad
RET
-cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
+cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
mov r3, r1
mova am, m4
add r3, r1
RET
%endmacro
-INIT_XMM
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
SWAP 3, 9
%endmacro
-%macro DEBLOCK_LUMA_64 1
-cglobal deblock_v_luma_10_%1, 5,5,15
+%macro DEBLOCK_LUMA_64 0
+cglobal deblock_v_luma_10, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
%define mask2 m11
shl r2d, 2
shl r3d, 2
- LOAD_AB m12, m13, r2, r3
+ LOAD_AB m12, m13, r2d, r3d
mov r2, r0
sub r0, r1
sub r0, r1
jg .loop
REP_RET
-cglobal deblock_h_luma_10_%1, 5,7,15
+cglobal deblock_h_luma_10, 5,7,15
shl r2d, 2
shl r3d, 2
- LOAD_AB m12, m13, r2, r3
+ LOAD_AB m12, m13, r2d, r3d
mov r2, r1
add r2, r1
add r2, r1
REP_RET
%endmacro
-INIT_XMM
-DEBLOCK_LUMA_64 sse2
-INIT_AVX
-DEBLOCK_LUMA_64 avx
+INIT_XMM sse2
+DEBLOCK_LUMA_64
+INIT_XMM avx
+DEBLOCK_LUMA_64
%endif
%macro SWAPMOVA 2
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
paddw t0, %3, %2
mova t2, %4
paddw t2, %3
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mova %2, t0 ; mask0
psrlw t3, %1, 2
%else
%endif
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA_INTRA_64 1
-cglobal deblock_v_luma_intra_10_%1, 4,7,16
+%macro DEBLOCK_LUMA_INTRA_64 0
+cglobal deblock_v_luma_intra_10, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
shl r2d, 2
shl r3d, 2
LOAD_AB aa, bb, r2d, r3d
-.loop
+.loop:
mova p2, [r4+r1]
mova p1, [r4+2*r1]
mova p0, [r4+r5]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10_%1, 4,7,16
+cglobal deblock_h_luma_intra_10, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
-.loop
+.loop:
movu q3, [r0-8]
movu q2, [r0+r1-8]
movu q1, [r0+r1*2-8]
RET
%endmacro
-INIT_XMM
-DEBLOCK_LUMA_INTRA_64 sse2
-INIT_AVX
-DEBLOCK_LUMA_INTRA_64 avx
+INIT_XMM sse2
+DEBLOCK_LUMA_INTRA_64
+INIT_XMM avx
+DEBLOCK_LUMA_INTRA_64
%endif
-%macro DEBLOCK_LUMA_INTRA 1
+%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
+cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
+cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
RET
%endmacro
-%ifndef ARCH_X86_64
-INIT_MMX
-DEBLOCK_LUMA mmxext
-DEBLOCK_LUMA_INTRA mmxext
-INIT_XMM
-DEBLOCK_LUMA sse2
-DEBLOCK_LUMA_INTRA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
-DEBLOCK_LUMA_INTRA avx
+%if ARCH_X86_64 == 0
+INIT_MMX mmxext
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM sse2
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM avx
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
mova [r0+2*r1], m2
%endmacro
-%macro DEBLOCK_CHROMA 1
+%macro CHROMA_V_LOAD_TC 2
+ movd %1, [%2]
+ punpcklbw %1, %1
+ punpcklwd %1, %1
+ psraw %1, 6
+%endmacro
+
+%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
+cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
mov r5, r0
sub r0, r1
sub r0, r1
.loop:
%endif
CHROMA_V_LOAD r5
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
- LOAD_TC m6, r4
+ CHROMA_V_LOAD_TC m6, r4
psubw m6, [pw_3]
pmaxsw m6, m4
pand m7, m6
%if mmsize < 16
add r0, mmsize
add r5, mmsize
- add r4, mmsize/8
+ add r4, mmsize/4
dec r6
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
+cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
mov r4, r0
sub r0, r1
sub r0, r1
.loop:
%endif
CHROMA_V_LOAD r4
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%endif
%endmacro
-%ifndef ARCH_X86_64
-INIT_MMX
-DEBLOCK_CHROMA mmxext
+%if ARCH_X86_64 == 0
+INIT_MMX mmxext
+DEBLOCK_CHROMA
%endif
-INIT_XMM
-DEBLOCK_CHROMA sse2
-INIT_AVX
-DEBLOCK_CHROMA avx
+INIT_XMM sse2
+DEBLOCK_CHROMA
+INIT_XMM avx
+DEBLOCK_CHROMA