%endif
%endmacro
; DEBLOCK_LUMA: expands to deblock_v_luma_10 and deblock_h_luma_10, sized by
; mmsize. The removed form took one argument (%1) and appended it to each
; cglobal name; the added form takes no arguments.
; NOTE(review): the per-CPU symbol suffix presumably now comes from the cpu
; name given to INIT_MMX/INIT_XMM where the macro is invoked - confirm.
-%macro DEBLOCK_LUMA 1
+%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
+cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
; pad = size of the stack scratch area reserved below. It depends on mmsize
; and on the low four bits of stack_offset (presumably so the vector-sized
; slots end up aligned - TODO confirm).
%assign pad 5*mmsize+12-(stack_offset&15)
; tcm and ms1 name the first two mmsize-wide slots of that scratch area.
%define tcm [rsp]
%define ms1 [rsp+mmsize]
SUB rsp, pad
; r2d and r3d are each multiplied by 4 before LOAD_AB consumes them. Going by
; the prototype these would be alpha and beta - confirm against the cglobal
; argument numbering. The removed LOAD_AB line passed r2/r3, the added line
; passes the r2d/r3d forms.
shl r2d, 2
shl r3d, 2
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
; r3 and r2 are reused from here on: r3 = 32/mmsize, r2 = copy of r0, and r0
; is moved back by r1.
mov r3, 32/mmsize
mov r2, r0
sub r0, r1
; Release the scratch area reserved by "SUB rsp, pad" above.
ADD rsp, pad
RET
; Horizontal variant: the second cglobal argument is 6 instead of 5 and the
; scratch area is 7*mmsize-based instead of 5*mmsize-based.
-cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
+cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
SUB rsp, pad
; Same x4 scaling of r2d/r3d ahead of LOAD_AB as in the vertical function.
shl r2d, 2
shl r3d, 2
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
; r3 = 2*r1 (r1 copied, then added once more); m4 is stored to `am` in
; between. `am` is not defined in the lines visible here.
mov r3, r1
mova am, m4
add r3, r1
RET
%endmacro
-INIT_XMM
%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
SWAP 3, 9
%endmacro
; DEBLOCK_LUMA_64: expands to deblock_v_luma_10 and deblock_h_luma_10, each
; declared with 15 as the third cglobal argument. The removed form took one
; argument (%1) used as a symbol-name suffix; the added form takes none.
-%macro DEBLOCK_LUMA_64 1
-cglobal deblock_v_luma_10_%1, 5,5,15
+%macro DEBLOCK_LUMA_64 0
+cglobal deblock_v_luma_10, 5,5,15
; Symbolic names for vector registers used by this function.
%define p2 m8
%define p1 m0
%define p0 m1
%define mask2 m11
; r2d and r3d are multiplied by 4, then LOAD_AB fills m12/m13 from them
; (the removed line passed r2/r3, the added line passes r2d/r3d).
shl r2d, 2
shl r3d, 2
- LOAD_AB m12, m13, r2, r3
+ LOAD_AB m12, m13, r2d, r3d
; r2 = copy of r0; r0 is then moved back by 2*r1.
mov r2, r0
sub r0, r1
sub r0, r1
; The .loop label and the instruction that sets the flags tested by jg are
; not among the lines visible here.
jg .loop
REP_RET
; Horizontal variant: the second cglobal argument is 7 instead of 5.
-cglobal deblock_h_luma_10_%1, 5,7,15
+cglobal deblock_h_luma_10, 5,7,15
shl r2d, 2
shl r3d, 2
- LOAD_AB m12, m13, r2, r3
+ LOAD_AB m12, m13, r2d, r3d
; r2 = 3*r1 (one copy plus two additions).
mov r2, r1
add r2, r1
add r2, r1
REP_RET
%endmacro
; Instantiate DEBLOCK_LUMA_64 twice.
; Removed lines: INIT_XMM / INIT_AVX, with the CPU name (sse2, avx) passed as
; the macro argument.
; Added lines: the CPU name is given to INIT_XMM and the macro is invoked
; without arguments.
-INIT_XMM
-DEBLOCK_LUMA_64 sse2
-INIT_AVX
-DEBLOCK_LUMA_64 avx
+INIT_XMM sse2
+DEBLOCK_LUMA_64
+INIT_XMM avx
+DEBLOCK_LUMA_64
%endif
%macro SWAPMOVA 2
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; DEBLOCK_LUMA_INTRA_64: expands to deblock_v_luma_intra_10 and
; deblock_h_luma_intra_10, each declared with 16 as the third cglobal
; argument. The removed form took one argument (%1) used as a symbol-name
; suffix; the added form takes none.
-%macro DEBLOCK_LUMA_INTRA_64 1
-cglobal deblock_v_luma_intra_10_%1, 4,7,16
+%macro DEBLOCK_LUMA_INTRA_64 0
+cglobal deblock_v_luma_intra_10, 4,7,16
; t0..t2 alias vector registers; the horizontal function below maps the same
; names to different registers.
%define t0 m1
%define t1 m2
%define t2 m4
; r2d and r3d are multiplied by 4 and handed to LOAD_AB, which fills aa/bb
; (aa and bb are defined outside the lines visible here).
shl r2d, 2
shl r3d, 2
LOAD_AB aa, bb, r2d, r3d
; The added line gives the loop label its trailing colon.
-.loop
+.loop:
; p2, p1 and p0 are loaded from r4 + r1, r4 + 2*r1 and r4 + r5.
mova p2, [r4+r1]
mova p1, [r4+2*r1]
mova p0, [r4+r5]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10_%1, 4,7,16
+cglobal deblock_h_luma_intra_10, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
; m0 is loaded from the pw_2 constant (defined outside the visible lines).
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
; The added line gives the loop label its trailing colon.
-.loop
+.loop:
; q3, q2 and q1 are read with movu starting 8 bytes before r0, r0 + r1 and
; r0 + 2*r1.
movu q3, [r0-8]
movu q2, [r0+r1-8]
movu q1, [r0+r1*2-8]
RET
%endmacro
; Instantiate DEBLOCK_LUMA_INTRA_64 twice.
; Removed lines: INIT_XMM / INIT_AVX, with the CPU name (sse2, avx) passed as
; the macro argument.
; Added lines: the CPU name is given to INIT_XMM and the macro is invoked
; without arguments.
-INIT_XMM
-DEBLOCK_LUMA_INTRA_64 sse2
-INIT_AVX
-DEBLOCK_LUMA_INTRA_64 avx
+INIT_XMM sse2
+DEBLOCK_LUMA_INTRA_64
+INIT_XMM avx
+DEBLOCK_LUMA_INTRA_64
%endif
; DEBLOCK_LUMA_INTRA: expands to deblock_v_luma_intra_10 and
; deblock_h_luma_intra_10, with the third cglobal argument scaled by
; mmsize/16. The removed form took one argument (%1) used as a symbol-name
; suffix; the added form takes none.
-%macro DEBLOCK_LUMA_INTRA 1
+%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
+cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
; LUMA_INTRA_INIT is defined outside the visible lines; the vertical function
; passes 3, the horizontal one below passes 8.
LUMA_INTRA_INIT 3
; r4 = 4*r1, r5 = 3*r1.
lea r4, [r1*4]
lea r5, [r1*3]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
+cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 8
; When mmsize is 8, r4 = 3*r1. The rest of this conditional, including its
; %endif, is not among the lines visible here.
%if mmsize == 8
lea r4, [r1*3]
%endmacro
; Assembled only when ARCH_X86_64 == 0: instantiate DEBLOCK_LUMA and
; DEBLOCK_LUMA_INTRA three times each.
; Removed lines: INIT_MMX / INIT_XMM / INIT_AVX, with the CPU name
; (mmxext, sse2, avx) passed as the macro argument.
; Added lines: the CPU name is given to INIT_MMX / INIT_XMM and the macros are
; invoked without arguments.
; NOTE(review): the first variant's CPU name changes from "mmxext" to "mmx2".
; If the exported symbol suffix is derived from that name, the symbols are
; renamed as well - confirm that the C-side references match.
%if ARCH_X86_64 == 0
-INIT_MMX
-DEBLOCK_LUMA mmxext
-DEBLOCK_LUMA_INTRA mmxext
-INIT_XMM
-DEBLOCK_LUMA sse2
-DEBLOCK_LUMA_INTRA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
-DEBLOCK_LUMA_INTRA avx
+INIT_MMX mmx2
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM sse2
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM avx
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
psraw %1, 6
%endmacro
; DEBLOCK_CHROMA: expands to deblock_v_chroma_10 and
; deblock_v_chroma_intra_10. The removed form took one argument (%1) used as
; a symbol-name suffix; the added form takes none.
-%macro DEBLOCK_CHROMA 1
+%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; The second cglobal argument is 7 minus mmsize/16, i.e. one lower when
; mmsize is 16 than when it is 8.
-cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
+cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
; r5 = copy of r0; r0 is then moved back by 2*r1.
mov r5, r0
sub r0, r1
sub r0, r1
; The %if that the %endif below closes is not among the lines visible here.
.loop:
%endif
CHROMA_V_LOAD r5
; The removed LOAD_AB line passed r2/r3, the added line passes r2d/r3d.
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
; m4 is passed to LOAD_MASK both as the fifth and as the last argument, and is
; cleared by the pxor right after.
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
CHROMA_V_LOAD_TC m6, r4
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Same pattern as above with one register fewer: 6 minus mmsize/16.
-cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
+cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
; r4 = copy of r0; r0 is then moved back by 2*r1.
mov r4, r0
sub r0, r1
sub r0, r1
; The %if that the %endif below closes is not among the lines visible here.
.loop:
%endif
CHROMA_V_LOAD r4
- LOAD_AB m4, m5, r2, r3
+ LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%endmacro
; Instantiate DEBLOCK_CHROMA: the MMX variant only when ARCH_X86_64 == 0, the
; SSE2 and AVX variants unconditionally.
; Removed lines: INIT_MMX / INIT_XMM / INIT_AVX, with the CPU name
; (mmxext, sse2, avx) passed as the macro argument.
; Added lines: the CPU name is given to INIT_MMX / INIT_XMM and the macro is
; invoked without arguments.
; NOTE(review): the MMX variant's CPU name changes from "mmxext" to "mmx2".
; If the exported symbol suffix is derived from that name, the symbol is
; renamed as well - confirm that the C-side references match.
%if ARCH_X86_64 == 0
-INIT_MMX
-DEBLOCK_CHROMA mmxext
+INIT_MMX mmx2
+DEBLOCK_CHROMA
%endif
-INIT_XMM
-DEBLOCK_CHROMA sse2
-INIT_AVX
-DEBLOCK_CHROMA avx
+INIT_XMM sse2
+DEBLOCK_CHROMA
+INIT_XMM avx
+DEBLOCK_CHROMA