; Load the DC quant multiplier (mf) and rounding bias from the stack args and
; broadcast them across the vector register (SPLATD for 32-bit lanes in
; high bit depth, pshufb byte-splat on SSE4-class CPUs).
%macro QUANT_DC_START 0
    movd       m6, r1m                 ; m6 = mf (quant multiplier)
    movd       m7, r2m                 ; m7 = bias (rounding offset)
%if HIGH_BIT_DEPTH
    SPLATD     m6, m6
    SPLATD     m7, m7
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
    mova       m5, [pb_01]
    pshufb     m6, m5
    pshufb     m7, m5
%else
; NOTE(review): the %else branch body is elided in this fragment — restore from upstream before assembling.
%endmacro
; Set al (return value) to nonzero iff any coefficient survived quantization.
; The accumulated nonzero mask lives in m5 (SSE4 ptest path) or ecx otherwise.
; setne is hoisted below the %if so it is emitted exactly once for all paths.
%macro QUANT_END 0
    xor        eax, eax                ; clear return reg before setne al
%if cpuflag(sse4)
    ptest      m5, m5                  ; ZF=1 iff m5 == 0
%else ; !sse4
%if ARCH_X86_64
%if mmsize == 16
    packsswb   m5, m5
%endif
    test       ecx, ecx
%endif
; NOTE(review): context lines elided in this fragment between the hunks below.
%endif
%endif ; cpuflag
    setne      al                      ; al = 1 if any nonzero coefficient
%endmacro
%if HIGH_BIT_DEPTH
; Quantize one group of DC coefficients at [%1] using multiplier %2 and
; bias %3; accumulate the nonzero mask into m5 (ACCUM por, slot %4).
; The sign is stripped for the multiply/shift and restored with PSIGND.
%macro QUANT_ONE_DC 4
%if cpuflag(sse4)
    mova       m0, [%1]
    paddd      m1, %3                  ; |coef| + bias
    pmulld     m1, %2                  ; * mf
    psrad      m1, 16
%else ; !sse4
    mova       m0, [%1]
    ABSD       m1, m0
    psllq      m2, 32
    paddd      m1, m2
    psrld      m1, 16
%endif ; cpuflag
    PSIGND     m1, m0                  ; restore original sign
    mova       [%1], m1
    ACCUM      por, 5, 1, %4           ; fold into nonzero accumulator m5
%endmacro
; Quantize two vectors' worth of DC coefficients at [%1] / [%1+mmsize];
; nonzero results are OR-accumulated into m5 via ACCUM (slot %4).
%macro QUANT_TWO_DC 4
%if cpuflag(sse4)
    mova       m0, [%1       ]
    mova       m1, [%1+mmsize]
    ABSD       m2, m0
    ABSD       m3, m1
    psrad      m3, 16
    PSIGND     m2, m0
    PSIGND     m3, m1
    mova       [%1       ], m2
    mova       [%1+mmsize], m3
    ACCUM      por, 5, 2, %4
    por        m5, m3
%else ; !sse4
; NOTE(review): the !sse4 branch and %endmacro are elided in this fragment.
; Quantize two vectors of AC coefficients at [%1] / [%1+mmsize] using the
; per-coefficient multiplier table %2 and bias table %3; nonzero results are
; OR-accumulated into m5 (ACCUM slot %4).
%macro QUANT_TWO_AC 4
%if cpuflag(sse4)
    mova       m0, [%1       ]
    mova       m1, [%1+mmsize]
    ABSD       m2, m0
    ABSD       m3, m1
    paddd      m2, [%3       ]         ; |coef| + bias
    paddd      m3, [%3+mmsize]
    pmulld     m2, [%2       ]         ; * mf
    pmulld     m3, [%2+mmsize]
    psrad      m2, 16
    psrad      m3, 16
    PSIGND     m2, m0                  ; restore signs
    PSIGND     m3, m1
    mova       [%1       ], m2
    mova       [%1+mmsize], m3
    ACCUM      por, 5, 2, %4
    por        m5, m3
%else ; !sse4
; NOTE(review): the !sse4 branch and %endmacro are elided in this fragment.
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
; NOTE(review): macro body and %endmacro elided in this fragment.
INIT_MMX mmx2
QUANT_DC quant_2x2_dc, 1
%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
INIT_MMX mmx
QUANT_AC quant_4x4, 4
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
    mova       m0, %2
%if HIGH_BIT_DEPTH
    pmaddwd    m0, %1
    pslld      m0, m2                  ; << i_qbits
%else
; NOTE(review): low-bit-depth branch elided in this fragment.
;;; m3 f
;;; m4 0
    mova       m0, %1
%if HIGH_BIT_DEPTH
    pmadcswd   m0, m0, %2, m3          ; XOP multiply-add with rounding term
    psrad      m0, m2                  ; >> i_qbits
%else
    punpckhwd  m1, m0, m4
%endrep
%endmacro
; Temporary-register assignment differs per ABI: t0-t2 map onto the arg regs.
%if WIN64
DECLARE_REG_TMP 6,3,2
%elif ARCH_X86_64
DECLARE_REG_TMP 4,3,2
%else
DECLARE_REG_TMP 2,0,1
; NOTE(review): context lines elided in this fragment (qp/6 computation setup).
    sub        t2d, t1d
    sub        t2d, t1d                ; i_mf = i_qp % 6
    shl        t2d, %1
%if ARCH_X86_64
    add        r1, t2                  ; dequant_mf[i_mf]
%else
    add        r1, r1mp                ; dequant_mf[i_mf]
    psrld      m3, 1
    DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
cglobal dequant_%1x%1_flat16, 0,3
    movifnidn  t2d, r2m
%if %1 == 8
    DEQUANT16_FLAT [r1+32], 32, 96
%endif
    RET
%endif ; !HIGH_BIT_DEPTH && !AVX
%endmacro ; DEQUANT
; Instantiate dequant functions per bit depth / arch / SIMD flavor.
%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
; NOTE(review): lines elided in this fragment (DEQUANT_DC macro header).
    psrld      m4, 1
    movd       m2, [r1]
%assign x 0
%if HIGH_BIT_DEPTH
    pshufd     m2, m2, 0               ; broadcast dequant scale to all lanes
%rep SIZEOF_PIXEL*32/mmsize
    mova       m0, [r0+x]
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
INIT_XMM xop
DEQUANT_DC d, pmaddwd
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEQUANT_DC w, pmullw
%endif
%endif
; t4 is eax for return value.
%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
%else
DECLARE_REG_TMP 4,1,2,3,0,5
; NOTE(review): macro header elided in this fragment.
%if cpuflag(sse4)
%assign %%regs %%regs-1
%endif
%if ARCH_X86_64 == 0
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
    REP_RET
%endmacro
%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM ssse3
OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,8
    pxor       m6, m6
    movsxdifnidn r3, r3d               ; sign-extend size: used as a 64-bit index
.loop:
    mova       m2, [r0+r3*4-2*mmsize]  ; int32_t coefficients, scale 4
    mova       m3, [r0+r3*4-1*mmsize]
; NOTE(review): loop body elided in this fragment.
    REP_RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
; NOTE(review): 8-bit-depth variant below (int16_t coefficients, scale 2).
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,7
    pxor       m6, m6
    movsxdifnidn r3, r3d               ; sign-extend size: used as a 64-bit index
.loop:
    mova       m2, [r0+r3*2-2*mmsize]
    mova       m3, [r0+r3*2-1*mmsize]
    REP_RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
; Build a bitmask of nonzero coefficients into %2 from the block at %3.
%macro DECIMATE_MASK 5
%if mmsize==16
%if HIGH_BIT_DEPTH
    movdqa     xmm0, [%3+ 0]
    movdqa     xmm1, [%3+32]
    packssdw   xmm0, [%3+16]
    pmovmskb   %2, xmm0
%else ; mmsize==8
%if HIGH_BIT_DEPTH
    movq       mm0, [%3+ 0]
    movq       mm1, [%3+16]
    movq       mm2, [%3+32]
; NOTE(review): remainder of DECIMATE_MASK elided in this fragment.
%macro DECIMATE4x4 1
;A LUT is faster than bsf on older AMD processors.
;This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
    add        al, byte [mask_table + rdx]
%else
.loop:
    tzcnt      ecx, edx                ; tzcnt: defined for 0 input, faster than bsf on modern CPUs
    shr        edx, cl
    add        al, byte [table + rcx]
    shr        edx, 1
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
; Compute the decimate score for an 8x8 block of coefficients.
; 64-bit path scans the whole 64-bit run mask with tzcnt; the 32-bit path
; walks it in two halves with shrd.
%macro DECIMATE8x8 0
%if ARCH_X86_64
cglobal decimate_score64, 1,5
%ifdef PIC
    lea        r4, [decimate_table8]
; NOTE(review): setup lines elided in this fragment.
    add        eax, r3d
    jne        .ret9
.loop:
    tzcnt      rcx, r1                 ; index of next nonzero coefficient
    shr        r1, cl
    add        al, byte [table + rcx]
    shr        r1, 1
; NOTE(review): 32-bit variant below; intervening lines elided.
    add        r0, r2
    jne        .ret9 ;r0 is zero at this point, so we don't need to zero it
.loop:
    tzcnt      ecx, r3
    test       r3, r3
    je         .largerun
    shrd       r3, r4, cl              ; shift the run across the low/high halves
.largerun:
    mov        r3, r4
    xor        r4, r4
    tzcnt      ecx, r3
    shr        r3, cl
    shr        r3, 1
    jne        .loop
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE8x8
%endif
%endif
%endmacro
%if HIGH_BIT_DEPTH
; Produce a mask of nonzero coefficients for coeff_last (width %1).
%macro LAST_MASK 3-4
%if %1 == 4
    movq       mm0, [%3]
; NOTE(review): macro body elided in this fragment.
    RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST8
%endif
%endmacro
%macro COEFF_LAST48 0
%if ARCH_X86_64
cglobal coeff_last4, 1,1
    BSR        rax, [r0], 0x3f         ; 0x3f = result if input is zero
    shr        eax, 4
; NOTE(review): lines elided in this fragment.
    BSR        eax, r1d, 0x1f
    RET
%if ARCH_X86_64 == 0
cglobal coeff_last64, 1, 5-mmsize/16
    pxor       m2, m2
    LAST_MASK  16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
    shl        r0d, 16
    or         r1d, r2d
    or         r3d, r0d
    shl        r3, 32                  ; combine the two 32-bit masks
    or         r1, r3
    not        r1                      ; BSR on inverted mask finds last nonzero
    BSR        rax, r1, 0x3f
    RET
%endif
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST
%endif
;-----------------------------------------------------------------------------
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%if WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
DECLARE_REG_TMP 6,3,2,1,4,5,0
; NOTE(review): macro header elided in this fragment (COEFF_LEVELRUN %1).
    pxor       m2, m2
    LAST_MASK  %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
    shr        t5d, 1
%elif %1==8
    and        t5d, 0xff
%elif %1==4
    and        t5d, 0xf
%endif
    xor        t5d, (1<<%1)-1          ; invert: bits set where coeff == 0
    mov        [t1+4], t5d
    shl        t5d, 32-%1
    mov        t4d, %1-1
    LZCOUNT    t3d, t5d, 0x1f
    mov        [t1], t4d
.loop:
    LZCOUNT    t3d, t5d, 0x1f          ; t3 = run of zeros before next level
%if HIGH_BIT_DEPTH
    mov        t2d, [t0+t4*4]          ; int32_t coefficient
%else
    mov        t2w, [t0+t4*2]          ; int16_t coefficient
%endif
    inc        t3d
    shl        t5d, t3b
%if HIGH_BIT_DEPTH
    mov        [t1+t6*4+ 8], t2d       ; store level after the shift (t3b consumed)
%else
    mov        [t1+t6*2+ 8], t2w
%endif
    inc        t6d
    sub        t4d, t3d
    jge        .loop
%endmacro
INIT_MMX mmx2
%if ARCH_X86_64 == 0
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
INIT_XMM sse2
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15