cextern pb_01
cextern pd_1024
-%macro QUANT_DC_START_MMX 0
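+; QUANT_DC_START: broadcast the scalar mf/bias arguments across all lanes of m6/m7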
+%macro QUANT_DC_START 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%ifdef HIGH_BIT_DEPTH
SPLATD m6, m6
SPLATD m7, m7
-%else
- SPLATW m6, m6
- SPLATW m7, m7
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro QUANT_DC_START_SSSE3 0
+%elif cpuflag(sse4) ; pshufb is ssse3, but not faster on Conroe, so gate it on sse4
movdqa m5, [pb_01]
- movd m6, r1m ; mf
- movd m7, r2m ; bias
pshufb m6, m5
pshufb m7, m5
-%endmacro
-
-%macro PABSW_MMX 2
- pxor %1, %1
- pcmpgtw %1, %2
- pxor %2, %1
- psubw %2, %1
- SWAP %1, %2
-%endmacro
-
-%macro PSIGNW_MMX 2
- pxor %1, %2
- psubw %1, %2
-%endmacro
-
-%macro PABSW_SSSE3 2
- pabsw %1, %2
-%endmacro
-
-%macro PSIGNW_SSSE3 2
- psignw %1, %2
-%endmacro
-
-%macro PSIGND_MMX 2-3
-%if %0==3
- mova %1, %2
- pxor %1, %3
- psubd %1, %3
%else
- pxor %1, %2
- psubd %1, %2
+ SPLATW m6, m6
+ SPLATW m7, m7
%endif
%endmacro
-%macro PSIGND_SSSE3 2+
- psignd %1, %2
-%endmacro
-
-%macro PABSD_MMX 2
- pxor %1, %1
- pcmpgtd %1, %2
- pxor %2, %1
- psubd %2, %1
- SWAP %1, %2
-%endmacro
-
-%macro PABSD_SSSE3 2
- pabsd %1, %2
-%endmacro
-
-%macro QUANT_END_MMX 0
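+; QUANT_END: sets eax to 1 if any quantized coefficient was nonzero, else 0.
+; The QUANT_ONE/QUANT_TWO macros accumulate their nonzero results into m5 for this test.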
+%macro QUANT_END 0
+%if cpuflag(sse4)
+ xor eax, eax
+ ptest m5, m5
+ setne al
+%else ; !sse4
xor eax, eax
%ifdef ARCH_X86_64
%if mmsize == 16
%endif
%endif
setne al
-%endmacro
-
-%macro QUANT_END_SSE4 0
- xor eax, eax
- ptest m5, m5
- setne al
+%endif ; cpuflag
%endmacro
%ifdef HIGH_BIT_DEPTH
-%macro QUANT_ONE_DC_MMX 4
+%macro QUANT_ONE_DC 4
+%if cpuflag(sse4)
mova m0, [%1]
- PABSD m1, m0
+ ABSD m1, m0
paddd m1, %3
- mova m2, m1
- psrlq m2, 32
- pmuludq m1, %2
- pmuludq m2, %2
- psllq m2, 32
- paddd m1, m2
- psrld m1, 16
+ pmulld m1, %2
+ psrad m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
%else
SWAP 5, 1
%endif
-%endmacro
-
-%macro QUANT_TWO_DC_MMX 4
- QUANT_ONE_DC_MMX %1, %2, %3, %4
- QUANT_ONE_DC_MMX %1+mmsize, %2, %3, %4+mmsize
-%endmacro
-
-%macro QUANT_ONE_DC_SSE4 4
+%else ; !sse4
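+; no pmulld before sse4: emulate the 32-bit multiply with two pmuludq on the
+; even/odd dword lanes, then shift the odd products back and recombine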
mova m0, [%1]
- PABSD m1, m0
+ ABSD m1, m0
paddd m1, %3
- pmulld m1, %2
- psrad m1, 16
+ mova m2, m1
+ psrlq m2, 32
+ pmuludq m1, %2
+ pmuludq m2, %2
+ psllq m2, 32
+ paddd m1, m2
+ psrld m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
%else
SWAP 5, 1
%endif
+%endif ; cpuflag
%endmacro
-%macro QUANT_TWO_DC_SSE4 4
+%macro QUANT_TWO_DC 4
+%if cpuflag(sse4)
mova m0, [%1]
mova m1, [%1+mmsize]
- PABSD m2, m0
- PABSD m3, m1
+ ABSD m2, m0
+ ABSD m3, m1
paddd m2, %3
paddd m3, %3
pmulld m2, %2
SWAP 5, 2
%endif
por m5, m3
+%else ; !sse4
+ QUANT_ONE_DC %1, %2, %3, %4
+ QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
+%endif ; cpuflag
%endmacro
%macro QUANT_ONE_AC_MMX 4
mova m0, [%1]
mova m2, [%2]
- PABSD m1, m0
+ ABSD m1, m0
mova m4, m2
paddd m1, [%3]
mova m3, m1
%endif
%endmacro
-%macro QUANT_TWO_AC_MMX 4
- QUANT_ONE_AC_MMX %1, %2, %3, %4
- QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
-%endmacro
-
-%macro QUANT_TWO_AC_SSE4 4
+%macro QUANT_TWO_AC 4
+%if cpuflag(sse4)
mova m0, [%1]
mova m1, [%1+mmsize]
- PABSD m2, m0
- PABSD m3, m1
+ ABSD m2, m0
+ ABSD m3, m1
paddd m2, [%3]
paddd m3, [%3+mmsize]
pmulld m2, [%2]
SWAP 5, 2
%endif
por m5, m3
+%else ; !sse4
+ QUANT_ONE_AC_MMX %1, %2, %3, %4
+ QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
+%endif ; cpuflag
%endmacro
;-----------------------------------------------------------------------------
-; int quant_2x2( int32_t dct[M*N], int mf, int bias )
+; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
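+; quantizes as dct[i] = sign(dct[i]) * ((abs(dct[i]) + bias) * mf >> 16), with a
+; single mf/bias pair broadcast over all coefficients; returns 1 if any result
+; is nonzero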
-%macro QUANT_DC 3
-cglobal quant_%1x%2_dc_%3, 3,3,8*(mmsize/16)
- QUANT_DC_START_MMX
+%macro QUANT_DC 2
+cglobal quant_%1x%2_dc, 3,3,8
+ QUANT_DC_START
%if %1*%2 <= mmsize/4
QUANT_ONE_DC r0, m6, m7, 0
%else
;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
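+; same formula as the DC version above, but with per-coefficient mf[i] and
+; bias[i] tables instead of broadcast scalars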
-%macro QUANT_AC 3
-cglobal quant_%1x%2_%3, 3,3,8*(mmsize/16)
+%macro QUANT_AC 2
+cglobal quant_%1x%2, 3,3,8
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_AC r0+x, r1+x, r2+x, x
RET
%endmacro
-%define QUANT_TWO_AC QUANT_TWO_AC_MMX
-%define QUANT_ONE_DC QUANT_ONE_DC_MMX
-%define QUANT_TWO_DC QUANT_TWO_DC_MMX
-%define QUANT_END QUANT_END_MMX
-%define PABSD PABSD_MMX
-%define PSIGND PSIGND_MMX
-INIT_XMM
-QUANT_DC 2, 2, sse2
-QUANT_DC 4, 4, sse2
-QUANT_AC 4, 4, sse2
-QUANT_AC 8, 8, sse2
-
-%define PABSD PABSD_SSSE3
-%define PSIGND PSIGND_SSSE3
-QUANT_DC 2, 2, ssse3
-QUANT_DC 4, 4, ssse3
-QUANT_AC 4, 4, ssse3
-QUANT_AC 8, 8, ssse3
-
-%define QUANT_TWO_AC QUANT_TWO_AC_SSE4
-%define QUANT_ONE_DC QUANT_ONE_DC_SSE4
-%define QUANT_TWO_DC QUANT_TWO_DC_SSE4
-%define QUANT_END QUANT_END_SSE4
-QUANT_DC 2, 2, sse4
-QUANT_DC 4, 4, sse4
-QUANT_AC 4, 4, sse4
-QUANT_AC 8, 8, sse4
-
-%undef SIGND
-%undef PABSD
-%undef QUANT_END
-%undef QUANT_TWO_AC
-%undef QUANT_ONE_DC
-%undef QUANT_TWO_DC
+INIT_XMM sse2
+QUANT_DC 2, 2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
+INIT_XMM ssse3
+QUANT_DC 2, 2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
+INIT_XMM sse4
+QUANT_DC 2, 2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
mova m1, %1 ; load dct coeffs
- PABSW m0, m1
+ ABSW m0, m1, sign
paddusw m0, %3 ; round
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
%macro QUANT_TWO 7
mova m1, %1
mova m3, %2
- PABSW m0, m1
- PABSW m2, m3
+ ABSW m0, m1, sign
+ ABSW m2, m3, sign
paddusw m0, %5
paddusw m2, %6
pmulhuw m0, %3
RET
%endmacro
-INIT_MMX
-%define QUANT_END QUANT_END_MMX
-%define PABSW PABSW_MMX
-%define PSIGNW PSIGNW_MMX
-%define QUANT_DC_START QUANT_DC_START_MMX
-QUANT_DC quant_2x2_dc_mmx2, 1
+INIT_MMX mmx2
+QUANT_DC quant_2x2_dc, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
-QUANT_DC quant_4x4_dc_mmx2, 4
-QUANT_AC quant_4x4_mmx, 4
-QUANT_AC quant_8x8_mmx, 16
+QUANT_DC quant_4x4_dc, 4
+INIT_MMX mmx
+QUANT_AC quant_4x4, 4
+QUANT_AC quant_8x8, 16
%endif
-INIT_XMM
-QUANT_DC quant_4x4_dc_sse2, 2, 8
-QUANT_AC quant_4x4_sse2, 2
-QUANT_AC quant_8x8_sse2, 8
+INIT_XMM sse2
+QUANT_DC quant_4x4_dc, 2, 8
+QUANT_AC quant_4x4, 2
+QUANT_AC quant_8x8, 8
+
+INIT_XMM ssse3
+QUANT_DC quant_4x4_dc, 2, 8
+QUANT_AC quant_4x4, 2
+QUANT_AC quant_8x8, 8
-%define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
-QUANT_DC quant_4x4_dc_ssse3, 2, 8
-QUANT_AC quant_4x4_ssse3, 2
-QUANT_AC quant_8x8_ssse3, 8
+INIT_MMX ssse3
+QUANT_DC quant_2x2_dc, 1
-INIT_MMX
-QUANT_DC quant_2x2_dc_ssse3, 1
-%define QUANT_END QUANT_END_SSE4
+INIT_XMM sse4
; Not faster on Conroe, so only used in SSE4 versions
-%define QUANT_DC_START QUANT_DC_START_SSSE3
-INIT_XMM
-QUANT_DC quant_4x4_dc_sse4, 2, 8
-QUANT_AC quant_4x4_sse4, 2
-QUANT_AC quant_8x8_sse4, 8
+QUANT_DC quant_4x4_dc, 2, 8
+QUANT_AC quant_4x4, 2
+QUANT_AC quant_8x8, 8
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
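+; roughly: dct[i] = dct[i] * dequant_mf[i_qp%6][i] << (i_qp/6 - shift_bits),
+; where a negative shift amount becomes the rounded right shift in .rshift32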
-%macro DEQUANT 4
-cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
+%macro DEQUANT 3
+cglobal dequant_%1x%1, 0,3,6
.skip_prologue:
- DEQUANT_START %3+2, %3
+ DEQUANT_START %2+2, %2
.lshift:
movd m2, t0d
- DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
+ DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
.rshift32:
neg t0d
pxor m4, m4
pslld m3, m2
psrld m3, 1
- DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
+ DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
-%ifnidn %1, avx
-cglobal dequant_%2x%2_flat16_%1, 0,3
+%ifndef HIGH_BIT_DEPTH
+%if notcpuflag(avx)
+cglobal dequant_%1x%1_flat16, 0,3
movifnidn t2d, r2m
-%if %2 == 8
+%if %1 == 8
cmp t2d, 12
- jl dequant_%2x%2_%1.skip_prologue
+ jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
lea t1, [t0*3]
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %3
+ shl t2d, %2
%ifdef PIC
- lea r1, [dequant%2_scale]
+ lea r1, [dequant%1_scale]
add r1, t2
%else
- lea r1, [dequant%2_scale + t2]
+ lea r1, [dequant%1_scale + t2]
%endif
movifnidn r0, r0mp
movd m4, t0d
-%if %2 == 4
-%ifidn %1, mmx
+%if %1 == 4
+%if mmsize == 8
DEQUANT16_FLAT [r1], 0, 16
DEQUANT16_FLAT [r1+8], 8, 24
%else
DEQUANT16_FLAT [r1], 0, 16
%endif
-%elifidn %1, mmx
+%elif mmsize == 8
DEQUANT16_FLAT [r1], 0, 8, 64, 72
DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
%endif
RET
%endif ; !AVX
+%endif ; !HIGH_BIT_DEPTH
%endmacro ; DEQUANT
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-DEQUANT sse2, 4, 4, 1
-DEQUANT sse4, 4, 4, 1
-DEQUANT sse2, 8, 6, 1
-DEQUANT sse4, 8, 6, 1
+INIT_XMM sse2
+DEQUANT 4, 4, 1
+DEQUANT 8, 6, 1
%else
%ifndef ARCH_X86_64
-INIT_MMX
-DEQUANT mmx, 4, 4, 1
-DEQUANT mmx, 8, 6, 1
+INIT_MMX mmx
+DEQUANT 4, 4, 1
+DEQUANT 8, 6, 1
%endif
-INIT_XMM
-DEQUANT sse2, 4, 4, 2
-DEQUANT sse2, 8, 6, 2
-INIT_AVX
-DEQUANT avx, 4, 4, 2
-DEQUANT avx, 8, 6, 2
+INIT_XMM sse2
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
+INIT_XMM avx
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
%endif
%macro DEQUANT_DC 2
-cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
+cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
.lshift:
movd m3, [r1]
movd m2, t0d
pslld m3, m2
-%ifdef HIGH_BIT_DEPTH
- pshufd m3, m3, 0
-%assign x 0
-%rep SIZEOF_PIXEL*16/mmsize
- mova m0, [r0+mmsize*0+x]
- mova m1, [r0+mmsize*1+x]
- pmaddwd m0, m3
- pmaddwd m1, m3
- mova [r0+mmsize*0+x], m0
- mova [r0+mmsize*1+x], m1
-%assign x x+mmsize*2
-%endrep
-
-%else ; !HIGH_BIT_DEPTH
-%if mmsize==16
- pshuflw m3, m3, 0
- punpcklqdq m3, m3
-%else
- pshufw m3, m3, 0
-%endif
+ SPLAT%1 m3, m3, 0
%assign x 0
%rep SIZEOF_PIXEL*16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
- pmullw m0, m3
- pmullw m1, m3
+ %2 m0, m3
+ %2 m1, m3
mova [r0+mmsize*0+x], m0
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
-%endif ; HIGH_BIT_DEPTH
RET
.rshift32:
neg t0d
movd m3, t0d
- mova m4, [p%2_1]
+ mova m4, [p%1_1]
mova m5, m4
pslld m4, m3
psrld m4, 1
%assign x x+mmsize
%endrep
-%else
+%else ; !HIGH_BIT_DEPTH
%if mmsize==8
punpcklwd m2, m2
%else
mova [r0+x], m0
%assign x x+mmsize
%endrep
-%endif
+%endif ; !HIGH_BIT_DEPTH
RET
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-DEQUANT_DC sse2, d
-DEQUANT_DC sse4, d
-INIT_AVX
-DEQUANT_DC avx, d
+INIT_XMM sse2
+DEQUANT_DC d, pmaddwd
%else
-INIT_MMX
-DEQUANT_DC mmx2, w
-INIT_XMM
-DEQUANT_DC sse2, w
-INIT_AVX
-DEQUANT_DC avx, w
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+DEQUANT_DC w, pmullw
+%endif
+INIT_XMM sse2
+DEQUANT_DC w, pmullw
+INIT_XMM avx
+DEQUANT_DC w, pmullw
%endif
; t4 is eax for return value.
; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------
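+; tries to round the four chroma DC coefficients towards zero wherever doing so
+; leaves the dequantized result unchanged; returns 0 if they all round to zero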
-; %2 == 1 for sse2 or ssse3, 0 for sse4/avx
-%macro OPTIMIZE_CHROMA_DC 2
-%assign %%regs 4+%2
+%macro OPTIMIZE_CHROMA_DC 0
+%assign %%regs 5
+%if cpuflag(sse4)
+ %assign %%regs %%regs-1
+%endif
%ifndef ARCH_X86_64
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
-cglobal optimize_chroma_dc_%1, 0,%%regs,7
+cglobal optimize_chroma_dc, 0,%%regs,7
movifnidn t0, r0mp
movd m2, r1m
movq m1, [t0]
-%if %2
- pxor m4, m4
-%else ; sse4, avx
+%if cpuflag(sse4)
pcmpeqb m4, m4
pslld m4, 11
-%endif
-%ifidn %1, sse2
- mova m3, [chroma_dc_dct_mask_mmx]
- mova m5, [chroma_dc_dmf_mask_mmx]
%else
+ pxor m4, m4
+%endif
+%if cpuflag(ssse3)
mova m3, [chroma_dc_dct_mask]
mova m5, [chroma_dc_dmf_mask]
+%else
+ mova m3, [chroma_dc_dct_mask_mmx]
+ mova m5, [chroma_dc_dmf_mask_mmx]
%endif
pshuflw m2, m2, 0
pshufd m0, m1, 00010001b ; 1 0 3 2 1 0 3 2
mov t1d, 3
paddd m0, m6
xor t4d, t4d
-%ifidn %1, sse2
+%if notcpuflag(ssse3)
- psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
+ psrad m1, 31 ; has to be 0 or -1 for the non-ssse3 PSIGND emulation to work correctly
%endif
-%if %2
+%if cpuflag(sse4)
+ ptest m0, m4
+%else
mova m6, m0
SWAP 0, 6
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
-%else ; sse4, avx
- ptest m0, m4
%endif
jz .ret ; if the DC coefficients already round to zero, terminate early
mova m3, m0
.inner_loop:
psubd m3, m5 ; coeff -= sign
pxor m6, m0, m3
-%if %2
+%if cpuflag(sse4)
+ ptest m6, m4
+%else
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
-%else ; sse4, avx
- ptest m6, m4
%endif
jz .round_coeff
paddd m3, m5 ; coeff += sign
REP_RET
%endmacro
-INIT_XMM
-%define PSIGNW PSIGNW_MMX
-%define PSIGND PSIGND_MMX
-OPTIMIZE_CHROMA_DC sse2, 1
-%define PSIGNW PSIGNW_SSSE3
-%define PSIGND PSIGND_SSSE3
-OPTIMIZE_CHROMA_DC ssse3, 1
-OPTIMIZE_CHROMA_DC sse4, 0
-INIT_AVX
-OPTIMIZE_CHROMA_DC avx, 0
+%ifndef HIGH_BIT_DEPTH
+INIT_XMM sse2
+OPTIMIZE_CHROMA_DC
+INIT_XMM ssse3
+OPTIMIZE_CHROMA_DC
+INIT_XMM sse4
+OPTIMIZE_CHROMA_DC
+INIT_XMM avx
+OPTIMIZE_CHROMA_DC
+%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
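+; for each coefficient: sum[i] += abs(dct[i]);
+;                       dct[i] = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0)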
-%macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,4,%2
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,8
pxor m6, m6
.loop:
sub r3, mmsize/2
mova m2, [r0+r3*4+0*mmsize]
mova m3, [r0+r3*4+1*mmsize]
- PABSD m0, m2
- PABSD m1, m3
+ ABSD m0, m2
+ ABSD m1, m3
mova m4, m0
mova m5, m1
psubd m0, [r2+r3*4+0*mmsize]
REP_RET
%endmacro
-%define PABSD PABSD_MMX
-%define PSIGND PSIGND_MMX
%ifndef ARCH_X86_64
-INIT_MMX
-DENOISE_DCT mmx
+INIT_MMX mmx
+DENOISE_DCT
%endif
-INIT_XMM
-DENOISE_DCT sse2, 8
-%define PABSD PABSD_SSSE3
-%define PSIGND PSIGND_SSSE3
-DENOISE_DCT ssse3, 8
-INIT_AVX
-DENOISE_DCT avx , 8
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
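+; same as the high-bit-depth version above, but on 16-bit coefficients, so the
+; subtract-and-clamp collapses into a single saturating psubusw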
-%macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,4,%2
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,7
pxor m6, m6
.loop:
sub r3, mmsize
mova m2, [r0+r3*2+0*mmsize]
mova m3, [r0+r3*2+1*mmsize]
- PABSW m0, m2
- PABSW m1, m3
+ ABSW m0, m2, sign
+ ABSW m1, m3, sign
psubusw m4, m0, [r2+r3*2+0*mmsize]
psubusw m5, m1, [r2+r3*2+1*mmsize]
PSIGNW m4, m2
REP_RET
%endmacro
-%define PABSW PABSW_MMX
-%define PSIGNW PSIGNW_MMX
%ifndef ARCH_X86_64
-INIT_MMX
-DENOISE_DCT mmx
+INIT_MMX mmx
+DENOISE_DCT
%endif
-INIT_XMM
-DENOISE_DCT sse2, 7
-%define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
-DENOISE_DCT ssse3, 7
-INIT_AVX
-DENOISE_DCT avx, 7
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
%endif ; !HIGH_BIT_DEPTH
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
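+; DECIMATE_MASK returns a bitmask of the zero coefficients in %1 and a bitmask
+; of the coefficients with abs value > 1 in %2; the score then sums
+; decimate_table[] over the zero-run preceding each nonzero coefficient, and
+; any |coeff| > 1 bails out early with a score too high to decimate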
-%macro DECIMATE_MASK_SSE2 7
+%macro DECIMATE_MASK 5
+%if mmsize==16
%ifdef HIGH_BIT_DEPTH
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+32]
packssdw xmm0, [%3+16]
packssdw xmm1, [%3+48]
-%if %7
- pabsw xmm0, xmm0
- pabsw xmm1, xmm1
+ ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
%else
- ABS2_MMX xmm0, xmm1, xmm3, xmm4
-%endif
-%else
-%if %7
- pabsw xmm0, [%3+ 0]
- pabsw xmm1, [%3+16]
-%else
- movdqa xmm0, [%3+ 0]
- movdqa xmm1, [%3+16]
- ABS2_MMX xmm0, xmm1, xmm3, xmm4
-%endif
+ ABSW xmm0, [%3+ 0], xmm3
+ ABSW xmm1, [%3+16], xmm4
%endif
packsswb xmm0, xmm1
pxor xmm2, xmm2
pcmpgtb xmm0, %4
pmovmskb %1, xmm2
pmovmskb %2, xmm0
-%endmacro
-%macro DECIMATE_MASK_MMX 7
+%else ; mmsize==8
%ifdef HIGH_BIT_DEPTH
movq mm0, [%3+ 0]
movq mm1, [%3+16]
movq mm2, [%3+16]
movq mm3, [%3+24]
%endif
- ABS2_MMX mm0, mm1, mm6, mm7
- ABS2_MMX mm2, mm3, mm6, mm7
+ ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
+ ABSW2 mm2, mm3, mm2, mm3, mm6, mm7
packsswb mm0, mm1
packsswb mm2, mm3
pxor mm4, mm4
pcmpeqb mm6, mm2
pcmpgtb mm0, %4
pcmpgtb mm2, %4
- pmovmskb %6, mm4
+ pmovmskb %5, mm4
pmovmskb %1, mm6
shl %1, 8
- or %1, %6
- pmovmskb %6, mm0
+ or %1, %5
+ pmovmskb %5, mm0
pmovmskb %2, mm2
shl %2, 8
- or %2, %6
+ or %2, %5
+%endif
%endmacro
cextern decimate_table4
cextern decimate_table8
-%macro DECIMATE4x4 4
+%macro DECIMATE4x4 1
; A LUT is faster than bsf on AMD processors.
; This is not true for score64.
-cglobal decimate_score%1_%2, 1,3
+cglobal decimate_score%1, 1,3
%ifdef PIC
lea r10, [decimate_table4]
lea r11, [decimate_mask_table4]
%define table decimate_table4
%define mask_table decimate_mask_table4
%endif
- DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
+ DECIMATE_MASK edx, eax, r0, [pb_1], ecx
xor edx, 0xffff
je .ret
test eax, eax
%if %1==15
shr edx, 1
%endif
-%if %3==1
+%if cpuflag(slowctz)
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
%endmacro
%ifndef ARCH_X86_64
-INIT_MMX
-%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmx2, 0, 0
-DECIMATE4x4 16, mmx2, 0, 0
-DECIMATE4x4 15, mmx2_slowctz, 1, 0
-DECIMATE4x4 16, mmx2_slowctz, 1, 0
+INIT_MMX mmx2
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_MMX mmx2, slowctz
+DECIMATE4x4 15
+DECIMATE4x4 16
%endif
-INIT_XMM
-%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2, 0, 0
-DECIMATE4x4 16, sse2, 0, 0
-DECIMATE4x4 15, sse2_slowctz, 1, 0
-DECIMATE4x4 16, sse2_slowctz, 1, 0
-DECIMATE4x4 15, ssse3, 0, 1
-DECIMATE4x4 16, ssse3, 0, 1
-DECIMATE4x4 15, ssse3_slowctz, 1, 1
-DECIMATE4x4 16, ssse3_slowctz, 1, 1
-
-%macro DECIMATE8x8 2
+INIT_XMM sse2
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_XMM sse2, slowctz
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_XMM ssse3
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_XMM ssse3, slowctz
+DECIMATE4x4 15
+DECIMATE4x4 16
+
+%macro DECIMATE8x8 0
%ifdef ARCH_X86_64
-cglobal decimate_score64_%1, 1,4
+cglobal decimate_score64, 1,4
%ifdef PIC
lea r10, [decimate_table8]
%define table r10
%define table decimate_table8
%endif
mova m5, [pb_1]
- DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
+ DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
test eax, eax
jne .ret9
- DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
+ DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
shl r2d, 16
or r1d, r2d
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
shl r2, 32
or eax, r3d
or r1, r2
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
shl r2, 48
or r1, r2
xor r1, -1
RET
%else ; ARCH
-%ifidn %1, mmx2
-cglobal decimate_score64_%1, 1,6
+%if mmsize == 8
+cglobal decimate_score64, 1,6
%else
-cglobal decimate_score64_%1, 1,5
+cglobal decimate_score64, 1,5
%endif
mova m5, [pb_1]
- DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
+ DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
test r2, r2
jne .ret9
- DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
+ DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
shl r4, 16
or r3, r4
- DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
+ DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
or r2, r1
- DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
+ DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
shl r1, 16
or r4, r1
xor r3, -1
%endmacro
%ifndef ARCH_X86_64
-INIT_MMX
-%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE8x8 mmx2, 0
+INIT_MMX mmx2
+DECIMATE8x8
%endif
-INIT_XMM
-%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE8x8 sse2, 0
-DECIMATE8x8 ssse3, 1
+INIT_XMM sse2
+DECIMATE8x8
+INIT_XMM ssse3
+DECIMATE8x8
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
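+; returns the index of the last nonzero coefficient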
-%macro LAST_X86 3
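+; for a nonzero input, lzcnt x == (bits-1) - bsr x, so either instruction can
+; emulate the other via an xor with the register width minus 1 (%3)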
+%macro BSR 3
+%if cpuflag(lzcnt)
+ lzcnt %1, %2
+ xor %1, %3
+%else
bsr %1, %2
+%endif
%endmacro
-%macro LAST_SSE4A 3
+%macro LZCOUNT 3
+%if cpuflag(lzcnt)
lzcnt %1, %2
+%else
+ bsr %1, %2
xor %1, %3
+%endif
%endmacro
%ifdef HIGH_BIT_DEPTH
-%macro LAST_MASK4_MMX 2-3
- movq mm0, [%2]
- packssdw mm0, [%2+8]
+%macro LAST_MASK 3-4
+%if %1 == 4
+ movq mm0, [%3]
+ packssdw mm0, [%3+8]
packsswb mm0, mm0
pcmpeqb mm0, mm2
- pmovmskb %1, mm0
-%endmacro
-
-%macro LAST_MASK_SSE2 2-3
- movdqa xmm0, [%2+ 0]
- movdqa xmm1, [%2+32]
- packssdw xmm0, [%2+16]
- packssdw xmm1, [%2+48]
+ pmovmskb %2, mm0
+%elif mmsize == 16
+ movdqa xmm0, [%3+ 0]
+ movdqa xmm1, [%3+32]
+ packssdw xmm0, [%3+16]
+ packssdw xmm1, [%3+48]
packsswb xmm0, xmm1
pcmpeqb xmm0, xmm2
- pmovmskb %1, xmm0
-%endmacro
-
-%macro LAST_MASK_MMX 3
- movq mm0, [%2+ 0]
- movq mm1, [%2+16]
- packssdw mm0, [%2+ 8]
- packssdw mm1, [%2+24]
- movq mm3, [%2+32]
- movq mm4, [%2+48]
- packssdw mm3, [%2+40]
- packssdw mm4, [%2+56]
+ pmovmskb %2, xmm0
+%else
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ packssdw mm0, [%3+ 8]
+ packssdw mm1, [%3+24]
+ movq mm3, [%3+32]
+ movq mm4, [%3+48]
+ packssdw mm3, [%3+40]
+ packssdw mm4, [%3+56]
packsswb mm0, mm1
packsswb mm3, mm4
pcmpeqb mm0, mm2
pcmpeqb mm3, mm2
- pmovmskb %1, mm0
- pmovmskb %3, mm3
- shl %3, 8
- or %1, %3
+ pmovmskb %2, mm0
+ pmovmskb %4, mm3
+ shl %4, 8
+ or %2, %4
+%endif
%endmacro
-%macro COEFF_LAST4 1
-cglobal coeff_last4_%1, 1,3
+%macro COEFF_LAST4 0
+cglobal coeff_last4, 1,3
pxor mm2, mm2
- LAST_MASK4_MMX r1d, r0
+ LAST_MASK 4, r1d, r0
xor r1d, 0xff
shr r1d, 4
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
RET
%endmacro
-%define LAST LAST_X86
-COEFF_LAST4 mmx2
-%define LAST LAST_SSE4A
-COEFF_LAST4 mmx2_lzcnt
+INIT_MMX mmx2
+COEFF_LAST4
+INIT_MMX mmx2, lzcnt
+COEFF_LAST4
%else ; !HIGH_BIT_DEPTH
-%macro LAST_MASK4_MMX 2-3
- movq mm0, [%2]
+%macro LAST_MASK 3-4
+%if %1 == 4
+ movq mm0, [%3]
packsswb mm0, mm0
pcmpeqb mm0, mm2
- pmovmskb %1, mm0
-%endmacro
-
-%macro LAST_MASK_SSE2 2-3
- movdqa xmm0, [%2+ 0]
- packsswb xmm0, [%2+16]
+ pmovmskb %2, mm0
+%elif mmsize == 16
+ movdqa xmm0, [%3+ 0]
+ packsswb xmm0, [%3+16]
pcmpeqb xmm0, xmm2
- pmovmskb %1, xmm0
-%endmacro
-
-%macro LAST_MASK_MMX 3
- movq mm0, [%2+ 0]
- movq mm1, [%2+16]
- packsswb mm0, [%2+ 8]
- packsswb mm1, [%2+24]
+ pmovmskb %2, xmm0
+%else
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ packsswb mm0, [%3+ 8]
+ packsswb mm1, [%3+24]
pcmpeqb mm0, mm2
pcmpeqb mm1, mm2
- pmovmskb %1, mm0
- pmovmskb %3, mm1
- shl %3, 8
- or %1, %3
+ pmovmskb %2, mm0
+ pmovmskb %4, mm1
+ shl %4, 8
+ or %2, %4
+%endif
%endmacro
-%macro COEFF_LAST4 1
+%macro COEFF_LAST4 0
%ifdef ARCH_X86_64
-cglobal coeff_last4_%1, 1,1
- LAST rax, [r0], 0x3f
+cglobal coeff_last4, 1,1
+ BSR rax, [r0], 0x3f
shr eax, 4
RET
%else
-cglobal coeff_last4_%1, 0,3
+cglobal coeff_last4, 0,3
mov edx, r0mp
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
- LAST eax, eax, 0x1f
+ BSR eax, eax, 0x1f
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
%endmacro
-%define LAST LAST_X86
-COEFF_LAST4 mmx2
-%define LAST LAST_SSE4A
-COEFF_LAST4 mmx2_lzcnt
+INIT_MMX mmx2
+COEFF_LAST4
+INIT_MMX mmx2, lzcnt
+COEFF_LAST4
%endif ; HIGH_BIT_DEPTH
-%macro COEFF_LAST 1
-cglobal coeff_last15_%1, 1,3
+%macro COEFF_LAST 0
+cglobal coeff_last15, 1,3
pxor m2, m2
- LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
+ LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
xor r1d, 0xffff
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
dec eax
RET
-cglobal coeff_last16_%1, 1,3
+cglobal coeff_last16, 1,3
pxor m2, m2
- LAST_MASK r1d, r0, r2d
+ LAST_MASK 16, r1d, r0, r2d
xor r1d, 0xffff
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
RET
%ifndef ARCH_X86_64
-cglobal coeff_last64_%1, 1, 5-mmsize/16
+cglobal coeff_last64, 1, 5-mmsize/16
pxor m2, m2
- LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
- LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
+ LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
+ LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d
shl r3d, 16
or r2d, r3d
xor r2d, -1
jne .secondhalf
- LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
- LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
+ LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
+ LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
shl r3d, 16
or r1d, r3d
not r1d
- LAST eax, r1d, 0x1f
+ BSR eax, r1d, 0x1f
RET
.secondhalf:
- LAST eax, r2d, 0x1f
+ BSR eax, r2d, 0x1f
add eax, 32
RET
%else
-cglobal coeff_last64_%1, 1,4
+cglobal coeff_last64, 1,4
pxor m2, m2
- LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
- LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
- LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
- LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
+ LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
+ LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
+ LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32
+ LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
shl r2d, 16
shl r0d, 16
or r1d, r2d
shl r3, 32
or r1, r3
not r1
- LAST rax, r1, 0x3f
+ BSR rax, r1, 0x3f
RET
%endif
%endmacro
-%define LAST LAST_X86
%ifndef ARCH_X86_64
-INIT_MMX
-%define LAST_MASK LAST_MASK_MMX
-COEFF_LAST mmx2
+INIT_MMX mmx2
+COEFF_LAST
%endif
-INIT_XMM
-%define LAST_MASK LAST_MASK_SSE2
-COEFF_LAST sse2
-%define LAST LAST_SSE4A
-COEFF_LAST sse2_lzcnt
+INIT_XMM sse2
+COEFF_LAST
+INIT_XMM sse2, lzcnt
+COEFF_LAST
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
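+; finds the last nonzero coefficient, then scans backwards emitting
+; (level, zero-run) pairs into *runlevel; returns the count of nonzero
+; coefficients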
-%macro LZCOUNT_X86 3
- bsr %1, %2
- xor %1, %3
-%endmacro
-
-%macro LZCOUNT_SSE4A 3
- lzcnt %1, %2
-%endmacro
-
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%ifdef WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif
-%macro COEFF_LEVELRUN 2
-cglobal coeff_level_run%2_%1,0,7
+%macro COEFF_LEVELRUN 1
+cglobal coeff_level_run%1,0,7
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
- LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
+ LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
not t5d
- shl t5d, 32-((%2+1)&~1)
- mov t4d, %2-1
+ shl t5d, 32-((%1+1)&~1)
+ mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
add t5d, t5d
REP_RET
%endmacro
-INIT_MMX
-%define LZCOUNT LZCOUNT_X86
+INIT_MMX mmx2
%ifndef ARCH_X86_64
-%define LAST_MASK LAST_MASK_MMX
-COEFF_LEVELRUN mmx2, 15
-COEFF_LEVELRUN mmx2, 16
+COEFF_LEVELRUN 15
+COEFF_LEVELRUN 16
%endif
-%define LAST_MASK LAST_MASK4_MMX
-COEFF_LEVELRUN mmx2, 4
-INIT_XMM
-%define LAST_MASK LAST_MASK_SSE2
-COEFF_LEVELRUN sse2, 15
-COEFF_LEVELRUN sse2, 16
-%define LZCOUNT LZCOUNT_SSE4A
-COEFF_LEVELRUN sse2_lzcnt, 15
-COEFF_LEVELRUN sse2_lzcnt, 16
-INIT_MMX
-%define LAST_MASK LAST_MASK4_MMX
-COEFF_LEVELRUN mmx2_lzcnt, 4
+COEFF_LEVELRUN 4
+INIT_XMM sse2
+COEFF_LEVELRUN 15
+COEFF_LEVELRUN 16
+INIT_XMM sse2, lzcnt
+COEFF_LEVELRUN 15
+COEFF_LEVELRUN 16
+INIT_MMX mmx2, lzcnt
+COEFF_LEVELRUN 4