+DENOISE_DCT ssse3, 7
+INIT_AVX
+DENOISE_DCT avx, 7
+
+%endif ; !HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; int decimate_score( dctcoef *dct )
+;-----------------------------------------------------------------------------
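+;
+; The score is the sum of decimate_table[run] over the runs of zeros below
+; each nonzero coefficient; any coefficient with magnitude > 1 forces a score
+; of 9 immediately. A scalar sketch (hypothetical signature; dctcoef is
+; int16_t, or int32_t with HIGH_BIT_DEPTH):
+;
+;     int decimate_score( dctcoef *dct, int i_max )
+;     {
+;         int score = 0, idx = i_max - 1;
+;         while( idx >= 0 && dct[idx] == 0 )
+;             idx--;                        // trailing zeros don't score
+;         while( idx >= 0 )
+;         {
+;             if( abs( dct[idx--] ) > 1 )
+;                 return 9;
+;             int run = 0;
+;             while( idx >= 0 && dct[idx] == 0 )
+;                 run++, idx--;
+;             score += decimate_table[run]; // decimate_table4 or decimate_table8
+;         }
+;         return score;
+;     }
+;
+; decimate_score15 additionally drops coefficient 0 (the "shr edx, 1" below).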
+
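+; DECIMATE_MASK zmask, gtmask, src, threshold, cpu, tmp, ssse3:
+; %1 = bit mask of zero coefficients, %2 = bit mask of coefficients whose
+; magnitude exceeds the packed-byte threshold %4 (pb_1, i.e. |coef| > 1),
+; %3 = coefficient pointer, %5 = cpu name (unused here), %6 = temporary GPR
+; (MMX version only), %7 = whether pabsw (SSSE3) is available.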
+%macro DECIMATE_MASK_SSE2 7
+%ifdef HIGH_BIT_DEPTH
+ movdqa xmm0, [%3+ 0]
+ movdqa xmm1, [%3+32]
+ packssdw xmm0, [%3+16]
+ packssdw xmm1, [%3+48]
+%if %7
+ pabsw xmm0, xmm0
+ pabsw xmm1, xmm1
+%else
+ ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
+%else
+%if %7
+ pabsw xmm0, [%3+ 0]
+ pabsw xmm1, [%3+16]
+%else
+ movdqa xmm0, [%3+ 0]
+ movdqa xmm1, [%3+16]
+ ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
+%endif
+ packsswb xmm0, xmm1
+ pxor xmm2, xmm2
+ pcmpeqb xmm2, xmm0
+ pcmpgtb xmm0, %4
+ pmovmskb %1, xmm2
+ pmovmskb %2, xmm0
+%endmacro
+
+%macro DECIMATE_MASK_MMX 7
+%ifdef HIGH_BIT_DEPTH
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ movq mm2, [%3+32]
+ movq mm3, [%3+48]
+ packssdw mm0, [%3+ 8]
+ packssdw mm1, [%3+24]
+ packssdw mm2, [%3+40]
+ packssdw mm3, [%3+56]
+%else
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+ 8]
+ movq mm2, [%3+16]
+ movq mm3, [%3+24]
+%endif
+ ABS2_MMX mm0, mm1, mm6, mm7
+ ABS2_MMX mm2, mm3, mm6, mm7
+ packsswb mm0, mm1
+ packsswb mm2, mm3
+ pxor mm4, mm4
+ pxor mm6, mm6
+ pcmpeqb mm4, mm0
+ pcmpeqb mm6, mm2
+ pcmpgtb mm0, %4
+ pcmpgtb mm2, %4
+ pmovmskb %6, mm4
+ pmovmskb %1, mm6
+ shl %1, 8
+ or %1, %6
+ pmovmskb %6, mm0
+ pmovmskb %2, mm2
+ shl %2, 8
+ or %2, %6
+%endmacro
+
+cextern decimate_table4
+cextern decimate_table8
+
+%macro DECIMATE4x4 4
+
+;A LUT is faster than bsf on AMD processors (hence the _slowctz variants below).
+;This does not hold for score64, which always uses the bsf loop.
+cglobal decimate_score%1_%2, 1,3
+%ifdef PIC
+ lea r10, [decimate_table4]
+ lea r11, [decimate_mask_table4]
+ %define table r10
+ %define mask_table r11
+%else
+ %define table decimate_table4
+ %define mask_table decimate_mask_table4
+%endif
+ DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
+ xor edx, 0xffff
+ je .ret
+ test eax, eax
+ jne .ret9
+%if %1==15
+ shr edx, 1
+%endif
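+; slowctz path: decimate_mask_table4 maps an 8-bit nonzero-coefficient mask
+; straight to its score, replacing several bsf iterations on CPUs where
+; bsf is slow.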
+%if %3==1
+ movzx ecx, dl
+ movzx eax, byte [mask_table + rcx]
+ cmp edx, ecx
+ je .ret
+ bsr ecx, ecx
+ shr edx, 1
+ shr edx, cl
+ bsf ecx, edx
+ shr edx, 1
+ shr edx, cl
+ add al, byte [table + rcx]
+ add al, byte [mask_table + rdx]
+%else
+.loop:
+ bsf ecx, edx
+ shr edx, cl
+ add al, byte [table + rcx]
+ shr edx, 1
+ jne .loop
+%endif
+.ret:
+ RET
+.ret9:
+ mov eax, 9
+ RET
+
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX
+%define DECIMATE_MASK DECIMATE_MASK_MMX
+DECIMATE4x4 15, mmxext, 0, 0
+DECIMATE4x4 16, mmxext, 0, 0
+DECIMATE4x4 15, mmxext_slowctz, 1, 0
+DECIMATE4x4 16, mmxext_slowctz, 1, 0
+%endif
+INIT_XMM
+%define DECIMATE_MASK DECIMATE_MASK_SSE2
+DECIMATE4x4 15, sse2, 0, 0
+DECIMATE4x4 16, sse2, 0, 0
+DECIMATE4x4 15, sse2_slowctz, 1, 0
+DECIMATE4x4 16, sse2_slowctz, 1, 0
+DECIMATE4x4 15, ssse3, 0, 1
+DECIMATE4x4 16, ssse3, 0, 1
+DECIMATE4x4 15, ssse3_slowctz, 1, 1
+DECIMATE4x4 16, ssse3_slowctz, 1, 1
+
+%macro DECIMATE8x8 2
+
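+; score64: build one zero-coefficient mask covering all 64 coefficients
+; (a single 64-bit register on x86_64, two 32-bit halves on x86_32), then
+; walk the runs of zeros with bsf against decimate_table8.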
+%ifdef ARCH_X86_64
+cglobal decimate_score64_%1, 1,4
+%ifdef PIC
+ lea r10, [decimate_table8]
+ %define table r10
+%else
+ %define table decimate_table8
+%endif
+ mova m5, [pb_1]
+ DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
+ test eax, eax
+ jne .ret9
+ DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
+ shl r2d, 16
+ or r1d, r2d
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
+ shl r2, 32
+ or eax, r3d
+ or r1, r2
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
+ shl r2, 48
+ or r1, r2
+ xor r1, -1
+ je .ret
+ or eax, r3d
+ jne .ret9
+.loop:
+ bsf rcx, r1
+ shr r1, cl
+ add al, byte [table + rcx]
+ shr r1, 1
+ jne .loop
+.ret:
+ REP_RET
+.ret9:
+ mov eax, 9
+ RET
+
+%else ; ARCH
+%ifidn %1, mmxext
+cglobal decimate_score64_%1, 1,6
+%else
+cglobal decimate_score64_%1, 1,5
+%endif
+ mova m5, [pb_1]
+ DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
+ test r2, r2
+ jne .ret9
+ DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
+ shl r4, 16
+ or r3, r4
+ DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
+ or r2, r1
+ DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
+ shl r1, 16
+ or r4, r1
+ xor r3, -1
+ je .tryret
+ xor r4, -1
+.cont:
+ or r0, r2
+ jne .ret9 ;r0 is already zero here, so the score accumulator needs no separate init
+.loop:
+ bsf ecx, r3
+ test r3, r3
+ je .largerun
+ shrd r3, r4, cl
+ shr r4, cl
+ add r0b, byte [decimate_table8 + ecx]
+ shrd r3, r4, 1
+ shr r4, 1
+ cmp r0, 6 ;score64's threshold is never higher than 6
+ jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
+ test r3, r3
+ jne .loop
+ test r4, r4
+ jne .loop
+.ret:
+ REP_RET
+.tryret:
+ xor r4, -1
+ jne .cont
+ REP_RET
+.ret9:
+ mov eax, 9
+ RET
+.largerun:
+ mov r3, r4
+ xor r4, r4
+ bsf ecx, r3
+ shr r3, cl
+ shr r3, 1
+ jne .loop
+ REP_RET
+%endif ; ARCH
+
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX
+%define DECIMATE_MASK DECIMATE_MASK_MMX
+DECIMATE8x8 mmxext, 0
+%endif
+INIT_XMM
+%define DECIMATE_MASK DECIMATE_MASK_SSE2
+DECIMATE8x8 sse2, 0
+DECIMATE8x8 ssse3, 1
+
+;-----------------------------------------------------------------------------
+; int coeff_last( dctcoef *dct )
+;-----------------------------------------------------------------------------
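+;
+; Index of the last nonzero coefficient. A scalar sketch (hypothetical
+; signature; each asm version hardcodes its i_count):
+;
+;     int coeff_last( dctcoef *dct, int i_count )
+;     {
+;         int i = i_count - 1;
+;         while( i >= 0 && dct[i] == 0 )
+;             i--;
+;         return i;            // -1 if all coefficients are zero; the
+;     }                        // bsr-based versions assume a nonzero block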
+
+%macro LAST_X86 3
+ bsr %1, %2
+%endmacro
+
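+; The third argument is width-1 (0x1f or 0x3f): for nonzero x,
+; lzcnt(x) == (width-1) ^ bsr(x), so lzcnt followed by this xor gives the
+; same result as bsr while avoiding bsr's slowness on AMD processors.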
+%macro LAST_SSE4A 3
+ lzcnt %1, %2
+ xor %1, %3
+%endmacro
+
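+; LAST_MASK* dst, src[, tmp]: pack the coefficients to bytes and produce a
+; bit mask with one set bit per zero coefficient. The caller must zero
+; mm2/xmm2 beforehand (it is the pcmpeqb reference).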
+%ifdef HIGH_BIT_DEPTH
+%macro LAST_MASK4_MMX 2-3
+ movq mm0, [%2]
+ packssdw mm0, [%2+8]
+ packsswb mm0, mm0
+ pcmpeqb mm0, mm2
+ pmovmskb %1, mm0
+%endmacro
+
+%macro LAST_MASK_SSE2 2-3
+ movdqa xmm0, [%2+ 0]
+ movdqa xmm1, [%2+32]
+ packssdw xmm0, [%2+16]
+ packssdw xmm1, [%2+48]
+ packsswb xmm0, xmm1
+ pcmpeqb xmm0, xmm2
+ pmovmskb %1, xmm0
+%endmacro
+
+%macro LAST_MASK_MMX 3
+ movq mm0, [%2+ 0]
+ movq mm1, [%2+16]
+ packssdw mm0, [%2+ 8]
+ packssdw mm1, [%2+24]
+ movq mm3, [%2+32]
+ movq mm4, [%2+48]
+ packssdw mm3, [%2+40]
+ packssdw mm4, [%2+56]
+ packsswb mm0, mm1
+ packsswb mm3, mm4
+ pcmpeqb mm0, mm2
+ pcmpeqb mm3, mm2
+ pmovmskb %1, mm0
+ pmovmskb %3, mm3
+ shl %3, 8
+ or %1, %3
+%endmacro
+
+%macro COEFF_LAST4 1
+cglobal coeff_last4_%1, 1,3
+ pxor mm2, mm2
+ LAST_MASK4_MMX r1d, r0
+ xor r1d, 0xff
+ shr r1d, 4
+ LAST eax, r1d, 0x1f
+ RET
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt
+
+%else ; !HIGH_BIT_DEPTH
+%macro LAST_MASK4_MMX 2-3
+ movq mm0, [%2]
+ packsswb mm0, mm0
+ pcmpeqb mm0, mm2
+ pmovmskb %1, mm0
+%endmacro
+
+%macro LAST_MASK_SSE2 2-3
+ movdqa xmm0, [%2+ 0]
+ packsswb xmm0, [%2+16]
+ pcmpeqb xmm0, xmm2
+ pmovmskb %1, xmm0
+%endmacro
+
+%macro LAST_MASK_MMX 3
+ movq mm0, [%2+ 0]
+ movq mm1, [%2+16]
+ packsswb mm0, [%2+ 8]
+ packsswb mm1, [%2+24]
+ pcmpeqb mm0, mm2
+ pcmpeqb mm1, mm2
+ pmovmskb %1, mm0
+ pmovmskb %3, mm1
+ shl %3, 8
+ or %1, %3
+%endmacro
+
+%macro COEFF_LAST4 1
+%ifdef ARCH_X86_64
+cglobal coeff_last4_%1, 1,1
+ LAST rax, [r0], 0x3f
+ shr eax, 4
+ RET
+%else
+cglobal coeff_last4_%1, 0,3
+ mov edx, r0mp
+ mov eax, [edx+4]
+ xor ecx, ecx
+ test eax, eax
+ cmovz eax, [edx]
+ setnz cl
+ LAST eax, eax, 0x1f
+ shr eax, 4
+ lea eax, [eax+ecx*2]
+ RET
+%endif
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt
+%endif ; HIGH_BIT_DEPTH
+
+%macro COEFF_LAST 1
+cglobal coeff_last15_%1, 1,3
+ pxor m2, m2
+ LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
+ xor r1d, 0xffff
+ LAST eax, r1d, 0x1f
+ dec eax
+ RET
+
+cglobal coeff_last16_%1, 1,3
+ pxor m2, m2
+ LAST_MASK r1d, r0, r2d
+ xor r1d, 0xffff
+ LAST eax, r1d, 0x1f
+ RET
+
+%ifndef ARCH_X86_64
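+; On x86_32 the mask doesn't fit in one register, so check coefficients 32-63
+; first and only compute the mask for 0-31 when the upper half is all zero.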
+cglobal coeff_last64_%1, 1, 5-mmsize/16
+ pxor m2, m2
+ LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
+ LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
+ shl r3d, 16
+ or r2d, r3d
+ xor r2d, -1
+ jne .secondhalf
+ LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
+ LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
+ shl r3d, 16
+ or r1d, r3d
+ not r1d
+ LAST eax, r1d, 0x1f
+ RET
+.secondhalf:
+ LAST eax, r2d, 0x1f
+ add eax, 32
+ RET
+%else
+cglobal coeff_last64_%1, 1,4
+ pxor m2, m2
+ LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
+ LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
+ LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
+ LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
+ shl r2d, 16
+ shl r0d, 16
+ or r1d, r2d
+ or r3d, r0d
+ shl r3, 32
+ or r1, r3
+ not r1
+ LAST rax, r1, 0x3f
+ RET
+%endif
+%endmacro
+
+%define LAST LAST_X86
+%ifndef ARCH_X86_64
+INIT_MMX
+%define LAST_MASK LAST_MASK_MMX
+COEFF_LAST mmxext
+%endif
+INIT_XMM
+%define LAST_MASK LAST_MASK_SSE2
+COEFF_LAST sse2
+%define LAST LAST_SSE4A
+COEFF_LAST sse2_lzcnt
+
+;-----------------------------------------------------------------------------
+; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
+;-----------------------------------------------------------------------------
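+;
+; Writes the nonzero levels and the zero runs between them, starting from the
+; last nonzero coefficient, and returns the number of levels written. A scalar
+; sketch; the run_level_t layout is inferred from the stores below
+; (assumed: int32_t last; dctcoef level[16]; uint8_t run[16]):
+;
+;     int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
+;     {
+;         int i = coeff_last( dct );   // index of the last nonzero coefficient
+;         int total = 0;
+;         runlevel->last = i;
+;         do
+;         {
+;             runlevel->level[total] = dct[i];
+;             int run = 0;
+;             while( --i >= 0 && dct[i] == 0 )
+;                 run++;
+;             runlevel->run[total++] = run;
+;         } while( i >= 0 );
+;         return total;
+;     }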
+
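+; LZCOUNT leaves a leading-zero count in %1: the SSE4A version uses lzcnt
+; directly, the x86 fallback converts bsr via the same xor identity as above.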
+%macro LZCOUNT_X86 3
+ bsr %1, %2
+ xor %1, %3
+%endmacro
+
+%macro LZCOUNT_SSE4A 3
+ lzcnt %1, %2
+%endmacro
+
+; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
+%ifdef WIN64
+ DECLARE_REG_TMP 3,1,2,0,4,5,6
+%elifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,4,5,6
+%else
+ DECLARE_REG_TMP 6,3,2,1,4,5,0
+%endif
+
+%macro COEFF_LEVELRUN 2
+cglobal coeff_level_run%2_%1,0,7
+ movifnidn t0, r0mp
+ movifnidn t1, r1mp
+ pxor m2, m2
+ LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
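+; invert to a nonzero-coefficient mask and align coefficient %2-1 with the MSB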
+ not t5d
+ shl t5d, 32-((%2+1)&~1)
+ mov t4d, %2-1
+ LZCOUNT t3d, t5d, 0x1f
+ xor t6d, t6d
+ add t5d, t5d
+ sub t4d, t3d
+ shl t5d, t3b
+ mov [t1], t4d
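+; each iteration: lzcount the run of zeros, store one (level, run) pair,
+; then shift both out of the mask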
+.loop:
+ LZCOUNT t3d, t5d, 0x1f
+%ifdef HIGH_BIT_DEPTH
+ mov t2d, [t0+t4*4]
+ mov [t1+t6 +4+16*4], t3b
+ mov [t1+t6*4+ 4], t2d
+%else
+ mov t2w, [t0+t4*2]
+ mov [t1+t6 +4+16*2], t3b
+ mov [t1+t6*2+ 4], t2w
+%endif
+ inc t3d
+ shl t5d, t3b
+ inc t6d
+ sub t4d, t3d
+ jge .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define LZCOUNT LZCOUNT_X86
+%ifndef ARCH_X86_64
+%define LAST_MASK LAST_MASK_MMX
+COEFF_LEVELRUN mmxext, 15
+COEFF_LEVELRUN mmxext, 16
+%endif
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext, 4
+INIT_XMM
+%define LAST_MASK LAST_MASK_SSE2
+COEFF_LEVELRUN sse2, 15
+COEFF_LEVELRUN sse2, 16
+%define LZCOUNT LZCOUNT_SSE4A
+COEFF_LEVELRUN sse2_lzcnt, 15
+COEFF_LEVELRUN sse2_lzcnt, 16
+INIT_MMX
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext_lzcnt, 4