;*****************************************************************************
;* quant-a.asm: x86 quantization and level-run
;*****************************************************************************
-;* Copyright (C) 2005-2010 x264 project
+;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Christian Heine <sennindemokrit@gmx.net>
+;* Oskar Arvidsson <oskar@irock.se>
+;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
+chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
+chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
+chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
+chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
+
SECTION .text
cextern pb_1
cextern pw_1
cextern pd_1
cextern pb_01
+cextern pd_1024
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
movd m7, r2m ; bias
-%ifidn m0, mm0
- pshufw m6, m6, 0
- pshufw m7, m7, 0
+%ifdef HIGH_BIT_DEPTH
+ SPLATD m6, m6
+ SPLATD m7, m7
%else
- pshuflw m6, m6, 0
- pshuflw m7, m7, 0
- punpcklqdq m6, m6
- punpcklqdq m7, m7
-%endif
+ SPLATW m6, m6
+ SPLATW m7, m7
+%endif ; HIGH_BIT_DEPTH
%endmacro
%macro QUANT_DC_START_SSSE3 0
psignw %1, %2
%endmacro
+; PSIGND emulation for pre-SSSE3 targets.
+; 2-arg form: conditionally negate %1 by mask %2; 3-arg form: %1 = %2
+; conditionally negated by mask %3. The mask operand must be 0 or -1 per
+; dword (xor+sub only negates correctly for those values); PABSD_MMX
+; deliberately leaves exactly such a mask in its source register.
+%macro PSIGND_MMX 2-3
+%if %0==3
+    mova       %1, %2
+    pxor       %1, %3
+    psubd      %1, %3
+%else
+    pxor       %1, %2
+    psubd      %1, %2
+%endif
+%endmacro
+
+; Native SSSE3 psignd; accepts the same 2- and 3-operand forms as PSIGND_MMX.
+%macro PSIGND_SSSE3 2+
+    psignd     %1, %2
+%endmacro
+
+; PABSD emulation: %1 = abs(%2) per dword.
+; Side effect relied upon by callers: after the final SWAP, the register
+; originally named %2 holds the sign mask (0/-1 per dword), ready to be
+; fed to PSIGND_MMX.
+%macro PABSD_MMX 2
+    pxor       %1, %1
+    pcmpgtd    %1, %2            ; %1 = (0 > %2) ? -1 : 0  (sign mask)
+    pxor       %2, %1
+    psubd      %2, %1            ; %2 = abs(%2)
+    SWAP       %1, %2
+%endmacro
+
+; Native SSSE3 pabsd; unlike PABSD_MMX, %2 is left untouched
+; (still holds the original signed values).
+%macro PABSD_SSSE3 2
+    pabsd      %1, %2
+%endmacro
+
+; Set eax = 1 if any quantized coefficient is nonzero, else 0.
+; Expects the OR-accumulator of all quantized outputs in m5;
+; clobbers ecx/rcx (and m4/m5 on the 32-bit SSE path).
+%macro QUANT_END_MMX 0
+    xor        eax, eax
+%ifdef ARCH_X86_64
+%if mmsize == 16
+    packsswb   m5, m5            ; narrow 16B->8B so the reg fits in a GPR
+%endif
+    movq       rcx, m5
+    test       rcx, rcx
+%else
+%if mmsize == 16
+    pxor       m4, m4
+    pcmpeqb    m5, m4
+    pmovmskb   ecx, m5
+    cmp        ecx, (1<<mmsize)-1 ; mask == all-ones <=> every byte was zero
+%else
+    packsswb   m5, m5
+    movd       ecx, m5
+    test       ecx, ecx
+%endif
+%endif
+    setne      al
+%endmacro
+
+; SSE4.1 version of QUANT_END: ptest sets ZF iff m5 is all zero.
+%macro QUANT_END_SSE4 0
+    xor        eax, eax
+    ptest      m5, m5
+    setne      al
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+; Quantize one vector of 32-bit DC coefficients: (abs(dct)+bias)*mf >> 16,
+; sign restored afterwards.
+; %1 = dct pointer, %2 = broadcast mf, %3 = broadcast bias,
+; %4 = 0 for the first group (seeds the m5 nonzero-accumulator).
+; The 32x32-bit multiply is emulated with pmuludq on even/odd dword lanes.
+%macro QUANT_ONE_DC_MMX 4
+    mova       m0, [%1]
+    PABSD      m1, m0            ; m1 = abs(dct); m0 becomes the sign mask (MMX path)
+    paddd      m1, %3
+    mova       m2, m1
+    psrlq      m2, 32            ; odd dword lanes
+    pmuludq    m1, %2
+    pmuludq    m2, %2
+    psllq      m2, 32
+    paddd      m1, m2            ; merge even/odd products
+    psrld      m1, 16
+    PSIGND     m1, m0            ; reapply the original signs
+    mova     [%1], m1
+%if %4
+    por        m5, m1
+%else
+    SWAP       5, 1
+%endif
+%endmacro
+
+; Quantize two consecutive vectors of DC coefficients.
+%macro QUANT_TWO_DC_MMX 4
+    QUANT_ONE_DC_MMX %1, %2, %3, %4
+    QUANT_ONE_DC_MMX %1+mmsize, %2, %3, %4+mmsize
+%endmacro
+
+; SSE4.1 DC quant: pmulld performs the 32-bit multiply directly, so no
+; pmuludq even/odd splitting is needed.
+; %1 = dct pointer, %2 = broadcast mf, %3 = broadcast bias,
+; %4 = 0 for the first group (seeds the m5 nonzero-accumulator).
+%macro QUANT_ONE_DC_SSE4 4
+    mova       m0, [%1]
+    PABSD      m1, m0
+    paddd      m1, %3
+    pmulld     m1, %2
+    psrad      m1, 16            ; (abs(dct)+bias)*mf >> 16
+    PSIGND     m1, m0
+    mova     [%1], m1
+%if %4
+    por        m5, m1
+%else
+    SWAP       5, 1
+%endif
+%endmacro
+
+; SSE4.1 DC quant, two vectors interleaved for better ILP.
+; %4 = 0 on the first call (m5 is seeded rather than accumulated);
+; the second result (m3) is always OR'd in.
+%macro QUANT_TWO_DC_SSE4 4
+    mova       m0, [%1]
+    mova       m1, [%1+mmsize]
+    PABSD      m2, m0
+    PABSD      m3, m1
+    paddd      m2, %3
+    paddd      m3, %3
+    pmulld     m2, %2
+    pmulld     m3, %2
+    psrad      m2, 16
+    psrad      m3, 16
+    PSIGND     m2, m0
+    PSIGND     m3, m1
+    mova     [%1], m2
+    mova [%1+mmsize], m3
+%if %4
+    por        m5, m2
+%else
+    SWAP       5, 2
+%endif
+    por        m5, m3
+%endmacro
+
+; AC quant for 32-bit coefficients with per-coefficient mf/bias tables.
+; %1 = dct ptr, %2 = mf ptr, %3 = bias ptr, %4 = 0 for the first group.
+; The 32x32-bit multiply is emulated with pmuludq on even/odd dword lanes.
+%macro QUANT_ONE_AC_MMX 4
+    mova       m0, [%1]
+    mova       m2, [%2]
+    PABSD      m1, m0            ; m1 = abs(dct); m0 becomes the sign mask (MMX path)
+    mova       m4, m2
+    paddd      m1, [%3]
+    mova       m3, m1
+    psrlq      m4, 32            ; odd lanes of mf
+    psrlq      m3, 32            ; odd lanes of abs(dct)+bias
+    pmuludq    m1, m2
+    pmuludq    m3, m4
+    psllq      m3, 32
+    paddd      m1, m3            ; merge even/odd products
+    psrad      m1, 16
+    PSIGND     m1, m0
+    mova     [%1], m1
+%if %4
+    por        m5, m1
+%else
+    SWAP       5, 1
+%endif
+%endmacro
+
+; Quantize two consecutive vectors of AC coefficients.
+%macro QUANT_TWO_AC_MMX 4
+    QUANT_ONE_AC_MMX %1, %2, %3, %4
+    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
+%endmacro
+
+; SSE4.1 AC quant, two vectors interleaved; pmulld replaces the pmuludq
+; emulation. %4 = 0 on the first call (m5 seeded); m3 is always OR'd in.
+%macro QUANT_TWO_AC_SSE4 4
+    mova       m0, [%1]
+    mova       m1, [%1+mmsize]
+    PABSD      m2, m0
+    PABSD      m3, m1
+    paddd      m2, [%3]
+    paddd      m3, [%3+mmsize]
+    pmulld     m2, [%2]
+    pmulld     m3, [%2+mmsize]
+    psrad      m2, 16
+    psrad      m3, 16
+    PSIGND     m2, m0
+    PSIGND     m3, m1
+    mova     [%1], m2
+    mova [%1+mmsize], m3
+%if %4
+    por        m5, m2
+%else
+    SWAP       5, 2
+%endif
+    por        m5, m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
+;-----------------------------------------------------------------------------
+; Returns 1 if any quantized coefficient is nonzero (via QUANT_END).
+; %1 x %2 = block size, %3 = cpu name suffix.
+%macro QUANT_DC 3
+cglobal quant_%1x%2_dc_%3, 3,3,8*(mmsize/16)
+    QUANT_DC_START_MMX
+%if %1*%2 <= mmsize/4
+    QUANT_ONE_DC r0, m6, m7, 0   ; whole block fits in one vector
+%else
+%assign x 0
+%rep %1*%2/(mmsize/2)
+    QUANT_TWO_DC r0+x, m6, m7, x
+%assign x x+mmsize*2
+%endrep
+%endif
+    QUANT_END
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
+;-----------------------------------------------------------------------------
+; Returns 1 if any quantized coefficient is nonzero (via QUANT_END).
+; %1 x %2 = block size, %3 = cpu name suffix.
+%macro QUANT_AC 3
+cglobal quant_%1x%2_%3, 3,3,8*(mmsize/16)
+%assign x 0
+%rep %1*%2/(mmsize/2)
+    QUANT_TWO_AC r0+x, r1+x, r2+x, x
+%assign x x+mmsize*2
+%endrep
+    QUANT_END
+    RET
+%endmacro
+
+; Instantiate the high-bit-depth quant functions for each SIMD level by
+; binding the helper aliases to the appropriate implementations.
+%define QUANT_TWO_AC QUANT_TWO_AC_MMX
+%define QUANT_ONE_DC QUANT_ONE_DC_MMX
+%define QUANT_TWO_DC QUANT_TWO_DC_MMX
+%define QUANT_END QUANT_END_MMX
+%define PABSD PABSD_MMX
+%define PSIGND PSIGND_MMX
+INIT_XMM
+QUANT_DC 2, 2, sse2
+QUANT_DC 4, 4, sse2
+QUANT_AC 4, 4, sse2
+QUANT_AC 8, 8, sse2
+
+%define PABSD PABSD_SSSE3
+%define PSIGND PSIGND_SSSE3
+QUANT_DC 2, 2, ssse3
+QUANT_DC 4, 4, ssse3
+QUANT_AC 4, 4, ssse3
+QUANT_AC 8, 8, ssse3
+
+%define QUANT_TWO_AC QUANT_TWO_AC_SSE4
+%define QUANT_ONE_DC QUANT_ONE_DC_SSE4
+%define QUANT_TWO_DC QUANT_TWO_DC_SSE4
+%define QUANT_END QUANT_END_SSE4
+QUANT_DC 2, 2, sse4
+QUANT_DC 4, 4, sse4
+QUANT_AC 4, 4, sse4
+QUANT_AC 8, 8, sse4
+
+; Undefine the aliases so the 8-bit-depth section below starts clean.
+; (fixed: was "%undef SIGND", a typo that left PSIGND defined)
+%undef PSIGND
+%undef PABSD
+%undef QUANT_END
+%undef QUANT_TWO_AC
+%undef QUANT_ONE_DC
+%undef QUANT_TWO_DC
+%endif ; HIGH_BIT_DEPTH
+
+%ifndef HIGH_BIT_DEPTH
%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
%if %4
por m5, m0
%else
- SWAP m5, m0
+ SWAP 5, 0
%endif
%endmacro
por m5, m0
por m5, m2
%else
- SWAP m5, m0
+ SWAP 5, 0
por m5, m2
%endif
%endmacro
-%macro QUANT_END_MMX 0
- xor eax, eax
-%ifndef ARCH_X86_64
-%if mmsize==8
- packsswb m5, m5
- movd ecx, m5
- test ecx, ecx
-%else
- pxor m4, m4
- pcmpeqb m5, m4
- pmovmskb ecx, m5
- cmp ecx, (1<<mmsize)-1
-%endif
-%else
-%if mmsize==16
- packsswb m5, m5
-%endif
- movq rcx, m5
- test rcx, rcx
-%endif
- setne al
-%endmacro
-
-%macro QUANT_END_SSE4 0
- xor eax, eax
- ptest m5, m5
- setne al
-%endmacro
-
;-----------------------------------------------------------------------------
; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
QUANT_DC quant_4x4_dc_sse4, 2, 8
QUANT_AC quant_4x4_sse4, 2
QUANT_AC quant_8x8_sse4, 8
+%endif ; !HIGH_BIT_DEPTH
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
-
mova m0, %2
+%ifdef HIGH_BIT_DEPTH
+ pmaddwd m0, %1
+ pslld m0, m2
+%else
packssdw m0, %3
pmullw m0, %1
psllw m0, m2
+%endif
mova %1, m0
%endmacro
;;; m2 -i_qbits
;;; m3 f
;;; m4 0
-
mova m0, %1
- mova m1, m0
+%ifdef HIGH_BIT_DEPTH
+ pmaddwd m0, %2
+ paddd m0, m3
+ psrad m0, m2
+%else
+ punpckhwd m1, m0, m4
punpcklwd m0, m4
- punpckhwd m1, m4
pmaddwd m0, %2
pmaddwd m1, %3
paddd m0, m3
psrad m0, m2
psrad m1, m2
packssdw m0, m1
+%endif
mova %1, m0
%endmacro
%if 8*(%2-2*%3)
mov t0d, 8*(%2-2*%3)
%%loop:
- %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
- %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
sub t0d, 16*%3
jge %%loop
REP_RET
%else
- %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
- %1 [r0 ], [r1 ], [r1+ 8*%3]
+ %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
+ %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
RET
%endif
%endmacro
%endmacro
;-----------------------------------------------------------------------------
-; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
-cglobal dequant_%2x%2_%1, 0,3
+cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
.skip_prologue:
DEQUANT_START %3+2, %3
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
+%ifnidn %1, avx
cglobal dequant_%2x%2_flat16_%1, 0,3
movifnidn t2d, r2m
%if %2 == 8
DEQUANT16_FLAT [r1+32], 32, 96
%endif
RET
+%endif ; !AVX
%endmacro ; DEQUANT
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT sse2, 4, 4, 1
+DEQUANT sse4, 4, 4, 1
+DEQUANT sse2, 8, 6, 1
+DEQUANT sse4, 8, 6, 1
+%else
%ifndef ARCH_X86_64
INIT_MMX
DEQUANT mmx, 4, 4, 1
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
+INIT_AVX
+DEQUANT avx, 4, 4, 2
+DEQUANT avx, 8, 6, 2
+%endif
-%macro DEQUANT_DC 1
-cglobal dequant_4x4dc_%1, 0,3
+%macro DEQUANT_DC 2
+cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
DEQUANT_START 6, 6
.lshift:
- movd m3, [r1]
- movd m2, t0d
- pslld m3, m2
+ movd m3, [r1]
+ movd m2, t0d
+ pslld m3, m2
+%ifdef HIGH_BIT_DEPTH
+ pshufd m3, m3, 0
+%assign x 0
+%rep SIZEOF_PIXEL*16/mmsize
+ mova m0, [r0+mmsize*0+x]
+ mova m1, [r0+mmsize*1+x]
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ mova [r0+mmsize*0+x], m0
+ mova [r0+mmsize*1+x], m1
+%assign x x+mmsize*2
+%endrep
+
+%else ; !HIGH_BIT_DEPTH
%if mmsize==16
pshuflw m3, m3, 0
punpcklqdq m3, m3
pshufw m3, m3, 0
%endif
%assign x 0
-%rep 16/mmsize
+%rep SIZEOF_PIXEL*16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
pmullw m0, m3
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
+%endif ; HIGH_BIT_DEPTH
RET
.rshift32:
neg t0d
movd m3, t0d
- mova m4, [pw_1]
+ mova m4, [p%2_1]
mova m5, m4
pslld m4, m3
psrld m4, 1
movd m2, [r1]
+%assign x 0
+%ifdef HIGH_BIT_DEPTH
+ pshufd m2, m2, 0
+%rep SIZEOF_PIXEL*32/mmsize
+ mova m0, [r0+x]
+ pmaddwd m0, m2
+ paddd m0, m4
+ psrad m0, m3
+ mova [r0+x], m0
+%assign x x+mmsize
+%endrep
+
+%else
%if mmsize==8
punpcklwd m2, m2
%else
pshuflw m2, m2, 0
%endif
punpcklwd m2, m4
-%assign x 0
-%rep 32/mmsize
+%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
- mova m1, m0
+ punpckhwd m1, m0, m5
punpcklwd m0, m5
- punpckhwd m1, m5
pmaddwd m0, m2
pmaddwd m1, m2
psrad m0, m3
mova [r0+x], m0
%assign x x+mmsize
%endrep
+%endif
RET
%endmacro
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT_DC sse2 , d
+DEQUANT_DC sse4 , d
+INIT_AVX
+DEQUANT_DC avx , d
+%else
+INIT_MMX
+DEQUANT_DC mmxext, w
+INIT_XMM
+DEQUANT_DC sse2 , w
+INIT_AVX
+DEQUANT_DC avx , w
+%endif
+
+; t4 is eax for return value.
+%ifdef ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
+%else
+    DECLARE_REG_TMP 4,1,2,3,0,5 ; x86-32 mapping; t4 -> r0 (eax) here as well
+%endif
+
+;-----------------------------------------------------------------------------
+; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+;-----------------------------------------------------------------------------
+; Tries to round each of the 4 chroma DC coefficients toward zero while the
+; dequantized +/- sum combinations of the four coefficients (see the pmaddwd
+; comment below) still round to the same values (the >>11 / *2^11 checks).
+; Register roles: t0 = dct ptr, t1 = coeff index (3..0), t3 = current coeff,
+; t4 = return flag, m0 = reference sums (+1024 bias), m3 = working sums,
+; m5 = signed dmf step, m2 = per-pass sign pattern.
+; NOTE(review): t4 (eax) is set to 1 when a coefficient cannot be rounded
+; away - presumably "nonzero coefficients remain"; confirm against the C
+; caller before relying on the exact return contract.
+
+; %2 == 1 for sse2 or ssse3, 0 for sse4/avx
+%macro OPTIMIZE_CHROMA_DC 2
+%assign %%regs 4+%2
+%ifndef ARCH_X86_64
+    %assign %%regs %%regs+1      ; t0-t4 are volatile on x86-64
+%endif
+cglobal optimize_chroma_dc_%1, 0,%%regs,7
+    movifnidn t0, r0mp
+    movd      m2, r1m
+    movq      m1, [t0]
+%if %2
+    pxor      m4, m4             ; compare-against-zero constant
+%else ; sse4, avx
+    pcmpeqb   m4, m4
+    pslld     m4, 11             ; mask of bits that must be zero for ptest
+%endif
+%ifidn %1, sse2
+    mova      m3, [chroma_dc_dct_mask_mmx]
+    mova      m5, [chroma_dc_dmf_mask_mmx]
+%else
+    mova      m3, [chroma_dc_dct_mask]
+    mova      m5, [chroma_dc_dmf_mask]
+%endif
+    pshuflw   m2, m2, 0
+    pshufd    m0, m1, 00010001b  ;  1  0  3  2  1  0  3  2
+    punpcklqdq m2, m2
+    punpcklqdq m1, m1            ;  3  2  1  0  3  2  1  0
+    mova      m6, [pd_1024]      ; 32<<5, elements are shifted 5 bits to the left
+    PSIGNW    m0, m3             ; -1 -0  3  2 -1 -0  3  2
+    PSIGNW    m2, m5             ;  +  -  -  +  -  -  +  +
+    paddw     m0, m1             ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
+    pmaddwd   m0, m2             ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
+    punpcklwd m1, m1
+    psrad     m2, 16             ;  +  -  -  +
+    mov       t1d, 3
+    paddd     m0, m6
+    xor       t4d, t4d
+%ifidn %1, sse2
+    psrad     m1, 31             ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
+%endif
+%if %2
+    mova      m6, m0
+    SWAP      0, 6               ; NOTE(review): both regs hold the same value
+                                 ; after the mova, so this SWAP looks redundant
+                                 ; - confirm against upstream before removing
+    psrad     m6, 11
+    pcmpeqd   m6, m4
+    pmovmskb  t5d, m6
+    cmp       t5d, 0xffff
+%else ; sse4, avx
+    ptest     m0, m4
+%endif
+    jz .ret                      ; if the DC coefficients already round to zero, terminate early
+    mova      m3, m0
+.outer_loop:
+    movsx     t3d, word [t0+2*t1] ; dct[coeff]
+    pshufd    m6, m1, 11111111b
+    pshufd    m1, m1, 10010000b  ; move the next element to high dword
+    PSIGND    m5, m2, m6
+    test      t3d, t3d
+    jz .loop_end
+.outer_loop_0:
+    mov       t2d, t3d
+    sar       t3d, 31
+    or        t3d, 1             ; t3d = sign(dct[coeff]) as +/-1
+.inner_loop:
+    psubd     m3, m5             ; coeff -= sign
+    pxor      m6, m0, m3
+%if %2
+    psrad     m6, 11
+    pcmpeqd   m6, m4
+    pmovmskb  t5d, m6
+    cmp       t5d, 0xffff
+%else ; sse4, avx
+    ptest     m6, m4
+%endif
+    jz .round_coeff
+    paddd     m3, m5             ; coeff += sign
+    mov       t4d, 1             ; rounding failed: remember a coeff survives
+.loop_end:
+    dec       t1d
+    jz .last_coeff
+    pshufd    m2, m2, 01111000b  ; - + - + / - - + +
+    jg .outer_loop
+.ret:
+    REP_RET
+.round_coeff:
+    sub       t2d, t3d           ; step the stored coeff toward zero
+    mov [t0+2*t1], t2w
+    jnz .inner_loop              ; keep shrinking until it hits zero or fails
+    jmp .loop_end
+.last_coeff:
+    movsx     t3d, word [t0]
+    punpcklqdq m2, m2            ; + + + +
+    PSIGND    m5, m2, m1
+    test      t3d, t3d
+    jnz .outer_loop_0
+    REP_RET
+%endmacro
+
+; Instantiate optimize_chroma_dc for each SIMD level, binding PSIGNW/PSIGND
+; to the emulated (MMX) or native (SSSE3+) forms as appropriate.
+INIT_XMM
+%define PSIGNW PSIGNW_MMX
+%define PSIGND PSIGND_MMX
+OPTIMIZE_CHROMA_DC sse2, 1
+%define PSIGNW PSIGNW_SSSE3
+%define PSIGND PSIGND_SSSE3
+OPTIMIZE_CHROMA_DC ssse3, 1
+OPTIMIZE_CHROMA_DC sse4, 0
+INIT_AVX
+OPTIMIZE_CHROMA_DC avx, 0
+
+%ifdef HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
+;-----------------------------------------------------------------------------
+; For each coefficient: sum[i] += abs(dct[i]); dct[i] is shrunk toward zero
+; by offset[i] (clamped at zero), with its original sign restored.
+; %1 = cpu name suffix, %2 = xmm register count for cglobal (default 0).
+%macro DENOISE_DCT 1-2 0
+cglobal denoise_dct_%1, 4,4,%2
+    pxor      m6, m6             ; constant zero for the clamp compare
+.loop:
+    sub       r3, mmsize/2       ; r3 counts coefficients; 2 vectors per iter
+    mova      m2, [r0+r3*4+0*mmsize]
+    mova      m3, [r0+r3*4+1*mmsize]
+    PABSD     m0, m2             ; m0/m1 = abs; m2/m3 become sign masks (MMX path)
+    PABSD     m1, m3
+    mova      m4, m0             ; keep abs values for the sum update
+    mova      m5, m1
+    psubd     m0, [r2+r3*4+0*mmsize]
+    psubd     m1, [r2+r3*4+1*mmsize]
+    pcmpgtd   m7, m0, m6         ; 3-operand form emulated by x86inc on non-AVX
+    pand      m0, m7             ; clamp negative results to zero
+    pcmpgtd   m7, m1, m6
+    pand      m1, m7
+    PSIGND    m0, m2             ; restore original signs
+    PSIGND    m1, m3
+    mova      [r0+r3*4+0*mmsize], m0
+    mova      [r0+r3*4+1*mmsize], m1
+    paddd     m4, [r1+r3*4+0*mmsize]
+    paddd     m5, [r1+r3*4+1*mmsize]
+    mova      [r1+r3*4+0*mmsize], m4
+    mova      [r1+r3*4+1*mmsize], m5
+    jg .loop
+    REP_RET
+%endmacro
+
+%define PABSD PABSD_MMX
+%define PSIGND PSIGND_MMX
+%ifndef ARCH_X86_64
INIT_MMX
-DEQUANT_DC mmxext
+DENOISE_DCT mmx
+%endif
INIT_XMM
-DEQUANT_DC sse2
+DENOISE_DCT sse2, 8
+%define PABSD PABSD_SSSE3
+%define PSIGND PSIGND_SSSE3
+DENOISE_DCT ssse3, 8
+INIT_AVX
+DENOISE_DCT avx , 8
+
+%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,5,%2
- movzx r4d, word [r0] ; backup DC coefficient
+cglobal denoise_dct_%1, 4,4,%2
pxor m6, m6
.loop:
sub r3, mmsize
mova m3, [r0+r3*2+1*mmsize]
PABSW m0, m2
PABSW m1, m3
- mova m4, m0
- mova m5, m1
- psubusw m0, [r2+r3*2+0*mmsize]
- psubusw m1, [r2+r3*2+1*mmsize]
- PSIGNW m0, m2
- PSIGNW m1, m3
- mova [r0+r3*2+0*mmsize], m0
- mova [r0+r3*2+1*mmsize], m1
- mova m2, m4
- mova m3, m5
- punpcklwd m4, m6
- punpckhwd m2, m6
- punpcklwd m5, m6
- punpckhwd m3, m6
- paddd m4, [r1+r3*4+0*mmsize]
- paddd m2, [r1+r3*4+1*mmsize]
- paddd m5, [r1+r3*4+2*mmsize]
- paddd m3, [r1+r3*4+3*mmsize]
- mova [r1+r3*4+0*mmsize], m4
- mova [r1+r3*4+1*mmsize], m2
- mova [r1+r3*4+2*mmsize], m5
- mova [r1+r3*4+3*mmsize], m3
+ psubusw m4, m0, [r2+r3*2+0*mmsize]
+ psubusw m5, m1, [r2+r3*2+1*mmsize]
+ PSIGNW m4, m2
+ PSIGNW m5, m3
+ mova [r0+r3*2+0*mmsize], m4
+ mova [r0+r3*2+1*mmsize], m5
+ punpcklwd m2, m0, m6
+ punpcklwd m3, m1, m6
+ punpckhwd m0, m6
+ punpckhwd m1, m6
+ paddd m2, [r1+r3*4+0*mmsize]
+ paddd m0, [r1+r3*4+1*mmsize]
+ paddd m3, [r1+r3*4+2*mmsize]
+ paddd m1, [r1+r3*4+3*mmsize]
+ mova [r1+r3*4+0*mmsize], m2
+ mova [r1+r3*4+1*mmsize], m0
+ mova [r1+r3*4+2*mmsize], m3
+ mova [r1+r3*4+3*mmsize], m1
jg .loop
- mov [r0], r4w ; restore DC coefficient
- RET
+ REP_RET
%endmacro
%define PABSW PABSW_MMX
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3, 7
+INIT_AVX
+DENOISE_DCT avx, 7
-
+%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; int decimate_score( int16_t *dct )
+; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
-%macro DECIMATE_MASK_SSE2 6
-%ifidn %5, ssse3
+%macro DECIMATE_MASK_SSE2 7
+%ifdef HIGH_BIT_DEPTH
+ movdqa xmm0, [%3+ 0]
+ movdqa xmm1, [%3+32]
+ packssdw xmm0, [%3+16]
+ packssdw xmm1, [%3+48]
+%if %7
+ pabsw xmm0, xmm0
+ pabsw xmm1, xmm1
+%else
+ ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
+%else
+%if %7
pabsw xmm0, [%3+ 0]
pabsw xmm1, [%3+16]
%else
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+16]
ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
%endif
packsswb xmm0, xmm1
pxor xmm2, xmm2
pmovmskb %2, xmm0
%endmacro
-%macro DECIMATE_MASK_MMX 6
+%macro DECIMATE_MASK_MMX 7
+%ifdef HIGH_BIT_DEPTH
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ movq mm2, [%3+32]
+ movq mm3, [%3+48]
+ packssdw mm0, [%3+ 8]
+ packssdw mm1, [%3+24]
+ packssdw mm2, [%3+40]
+ packssdw mm3, [%3+56]
+%else
movq mm0, [%3+ 0]
movq mm1, [%3+ 8]
movq mm2, [%3+16]
movq mm3, [%3+24]
- ABS2_MMX mm0, mm1, mm4, mm5
- ABS2_MMX mm2, mm3, mm4, mm5
+%endif
+ ABS2_MMX mm0, mm1, mm6, mm7
+ ABS2_MMX mm2, mm3, mm6, mm7
packsswb mm0, mm1
packsswb mm2, mm3
pxor mm4, mm4
- pxor mm5, mm5
+ pxor mm6, mm6
pcmpeqb mm4, mm0
- pcmpeqb mm5, mm2
+ pcmpeqb mm6, mm2
pcmpgtb mm0, %4
pcmpgtb mm2, %4
pmovmskb %6, mm4
- pmovmskb %1, mm5
+ pmovmskb %1, mm6
shl %1, 8
or %1, %6
pmovmskb %6, mm0
cextern decimate_table4
cextern decimate_table8
-%macro DECIMATE4x4 3
+%macro DECIMATE4x4 4
;A LUT is faster than bsf on AMD processors.
;This is not true for score64.
%define table decimate_table4
%define mask_table decimate_mask_table4
%endif
- DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
+ DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
xor edx, 0xffff
je .ret
test eax, eax
%endmacro
%ifndef ARCH_X86_64
+INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext, 0
-DECIMATE4x4 16, mmxext, 0
-DECIMATE4x4 15, mmxext_slowctz, 1
-DECIMATE4x4 16, mmxext_slowctz, 1
+DECIMATE4x4 15, mmxext, 0, 0
+DECIMATE4x4 16, mmxext, 0, 0
+DECIMATE4x4 15, mmxext_slowctz, 1, 0
+DECIMATE4x4 16, mmxext_slowctz, 1, 0
%endif
+INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2, 0
-DECIMATE4x4 16, sse2, 0
-DECIMATE4x4 15, sse2_slowctz, 1
-DECIMATE4x4 16, sse2_slowctz, 1
-DECIMATE4x4 15, ssse3, 0
-DECIMATE4x4 16, ssse3, 0
-DECIMATE4x4 15, ssse3_slowctz, 1
-DECIMATE4x4 16, ssse3_slowctz, 1
+DECIMATE4x4 15, sse2, 0, 0
+DECIMATE4x4 16, sse2, 0, 0
+DECIMATE4x4 15, sse2_slowctz, 1, 0
+DECIMATE4x4 16, sse2_slowctz, 1, 0
+DECIMATE4x4 15, ssse3, 0, 1
+DECIMATE4x4 16, ssse3, 0, 1
+DECIMATE4x4 15, ssse3_slowctz, 1, 1
+DECIMATE4x4 16, ssse3_slowctz, 1, 1
-%macro DECIMATE8x8 1
+%macro DECIMATE8x8 2
%ifdef ARCH_X86_64
cglobal decimate_score64_%1, 1,4
%define table decimate_table8
%endif
mova m5, [pb_1]
- DECIMATE_MASK r1d, eax, r0, m5, %1, null
+ DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
test eax, eax
jne .ret9
- DECIMATE_MASK r2d, eax, r0+32, m5, %1, null
+ DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
shl r2d, 16
or r1d, r2d
- DECIMATE_MASK r2d, r3d, r0+64, m5, %1, null
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
shl r2, 32
or eax, r3d
or r1, r2
- DECIMATE_MASK r2d, r3d, r0+96, m5, %1, null
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
shl r2, 48
or r1, r2
xor r1, -1
%else
cglobal decimate_score64_%1, 1,5
%endif
- mova m7, [pb_1]
- DECIMATE_MASK r3, r2, r0, m7, %1, r5
+ mova m5, [pb_1]
+ DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
test r2, r2
jne .ret9
- DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
+ DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
shl r4, 16
or r3, r4
- DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
+ DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
or r2, r1
- DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
+ DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
shl r1, 16
or r4, r1
xor r3, -1
%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE8x8 mmxext
+DECIMATE8x8 mmxext, 0
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE8x8 sse2
-DECIMATE8x8 ssse3
+DECIMATE8x8 sse2, 0
+DECIMATE8x8 ssse3, 1
;-----------------------------------------------------------------------------
-; int coeff_last( int16_t *dct )
+; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
+; %1 = index of the highest set bit of %2 (undefined result for %2 == 0).
+; %3 is unused here; it exists so LAST_X86/LAST_SSE4A are interchangeable.
+%macro LAST_X86 3
+    bsr %1, %2
+%endmacro
+
+; lzcnt-based highest-set-bit: lzcnt(x) xor %3 (= operand bits - 1) equals
+; bsr(x) for nonzero x, and unlike bsr gives a well-defined result for x == 0.
+%macro LAST_SSE4A 3
+    lzcnt %1, %2
+    xor %1, %3
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+; %1 = byte mask of "coefficient == 0" for 4 dword coeffs at [%2]
+; (each coeff appears twice in the mask since packsswb duplicates the half).
+; Expects mm2 == 0.
+%macro LAST_MASK4_MMX 2-3
+    movq     mm0, [%2]
+    packssdw mm0, [%2+8]
+    packsswb mm0, mm0
+    pcmpeqb  mm0, mm2
+    pmovmskb %1, mm0
+%endmacro
+
+; %1 = 16-bit mask of "coefficient == 0" for 16 dword coeffs at [%2],
+; narrowed dword->word->byte with saturation. Expects xmm2 == 0.
+%macro LAST_MASK_SSE2 2-3
+    movdqa   xmm0, [%2+ 0]
+    movdqa   xmm1, [%2+32]
+    packssdw xmm0, [%2+16]
+    packssdw xmm1, [%2+48]
+    packsswb xmm0, xmm1
+    pcmpeqb  xmm0, xmm2
+    pmovmskb %1, xmm0
+%endmacro
+
+; MMX version of the 16-coefficient zero mask, built from two 8-coeff
+; halves; %3 is a scratch GPR. Expects mm2 == 0.
+%macro LAST_MASK_MMX 3
+    movq     mm0, [%2+ 0]
+    movq     mm1, [%2+16]
+    packssdw mm0, [%2+ 8]
+    packssdw mm1, [%2+24]
+    movq     mm3, [%2+32]
+    movq     mm4, [%2+48]
+    packssdw mm3, [%2+40]
+    packssdw mm4, [%2+56]
+    packsswb mm0, mm1
+    packsswb mm3, mm4
+    pcmpeqb  mm0, mm2
+    pcmpeqb  mm3, mm2
+    pmovmskb %1, mm0
+    pmovmskb %3, mm3
+    shl      %3, 8
+    or       %1, %3              ; low byte = coeffs 0-7, high byte = coeffs 8-15
+%endmacro
+
+; int coeff_last4( int32_t *dct ): index of the last nonzero of 4 coeffs.
+%macro COEFF_LAST4 1
+cglobal coeff_last4_%1, 1,3
+    pxor mm2, mm2
+    LAST_MASK4_MMX r1d, r0
+    xor r1d, 0xff                ; invert: set bits now mark nonzero coeffs
+    shr r1d, 4                   ; keep one copy of the duplicated 4-bit mask
+    LAST eax, r1d, 0x1f
+    RET
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt
+
+%else ; !HIGH_BIT_DEPTH
+; %1 = byte mask of "coefficient == 0" for 4 word coeffs at [%2]
+; (duplicated by packsswb, as in the high-bit-depth variant).
+; Expects mm2 == 0.
+%macro LAST_MASK4_MMX 2-3
+    movq     mm0, [%2]
+    packsswb mm0, mm0
+    pcmpeqb  mm0, mm2
+    pmovmskb %1, mm0
+%endmacro
+
%macro LAST_MASK_SSE2 2-3
movdqa xmm0, [%2+ 0]
packsswb xmm0, [%2+16]
or %1, %3
%endmacro
-%macro LAST_X86 3
- bsr %1, %2
-%endmacro
-
-%macro LAST_SSE4A 3
- lzcnt %1, %2
- xor %1, %3
-%endmacro
-
%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
cglobal coeff_last4_%1, 1,1
LAST rax, [r0], 0x3f
- shr eax, 4
+ shr eax, 4
RET
%else
cglobal coeff_last4_%1, 0,3
COEFF_LAST4 mmxext
%define LAST LAST_SSE4A
COEFF_LAST4 mmxext_lzcnt
+%endif ; HIGH_BIT_DEPTH
%macro COEFF_LAST 1
cglobal coeff_last15_%1, 1,3
pxor m2, m2
- LAST_MASK r1d, r0-2, r2d
+ LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
xor r1d, 0xffff
LAST eax, r1d, 0x1f
dec eax
%ifndef ARCH_X86_64
cglobal coeff_last64_%1, 1, 5-mmsize/16
pxor m2, m2
- LAST_MASK r2d, r0+64, r4d
- LAST_MASK r3d, r0+96, r4d
+ LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
+ LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
shl r3d, 16
or r2d, r3d
xor r2d, -1
jne .secondhalf
- LAST_MASK r1d, r0, r4d
- LAST_MASK r3d, r0+32, r4d
+ LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
+ LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
shl r3d, 16
or r1d, r3d
not r1d
%else
cglobal coeff_last64_%1, 1,4
pxor m2, m2
- LAST_MASK_SSE2 r1d, r0
- LAST_MASK_SSE2 r2d, r0+32
- LAST_MASK_SSE2 r3d, r0+64
- LAST_MASK_SSE2 r0d, r0+96
+ LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
+ LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
+ LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
+ LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
shl r2d, 16
shl r0d, 16
or r1d, r2d
COEFF_LAST sse2_lzcnt
;-----------------------------------------------------------------------------
-; int coeff_level_run( int16_t *dct, run_level_t *runlevel )
+; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
-%macro LAST_MASK4_MMX 2-3
- movq mm0, [%2]
- packsswb mm0, mm0
- pcmpeqb mm0, mm2
- pmovmskb %1, mm0
-%endmacro
-
%macro LZCOUNT_X86 3
bsr %1, %2
xor %1, %3
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
- LAST_MASK t5d, t0-(%2&1)*2, t4d
+ LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
not t5d
shl t5d, 32-((%2+1)&~1)
mov t4d, %2-1
mov [t1], t4d
.loop:
LZCOUNT t3d, t5d, 0x1f
+%ifdef HIGH_BIT_DEPTH
+ mov t2d, [t0+t4*4]
+ mov [t1+t6 +4+16*4], t3b
+ mov [t1+t6*4+ 4], t2d
+%else
mov t2w, [t0+t4*2]
- mov [t1+t6 +36], t3b
+ mov [t1+t6 +4+16*2], t3b
mov [t1+t6*2+ 4], t2w
+%endif
inc t3d
shl t5d, t3b
inc t6d