;*****************************************************************************
;* trellis-64.asm: x86_64 trellis quantization
;*****************************************************************************
-;* Copyright (C) 2012 x264 project
+;* Copyright (C) 2012-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
%if cpuflag(ssse3)
pabsd m%1, m%1
pmuludq m%1, m%1
-%elifdef HIGH_BIT_DEPTH
+%elif HIGH_BIT_DEPTH
ABSD m%2, m%1
SWAP %1, %2
pmuludq m%1, m%1
%endif
%endmacro
+%macro LOAD_DUP 2 ; dst, src: load src and replicate it into both 64-bit halves of dst
+%if cpuflag(ssse3) ; movddup is usable on this cpu target: do the load and the duplication in one instruction
+ movddup %1, %2 ; loads 64 bits from src and copies them into the low and the high qword of dst
+%else ; NOTE(review): this path zeroes bits 32-63 of each qword, the movddup path fills them with the dword that follows src - confirm all users only consume the low dword of each lane
+ movd %1, %2 ; 32-bit load into the low dword of dst, the remaining bits of dst are cleared
+ punpcklqdq %1, %1 ; copy the low qword of dst into its high qword
+%endif
+%endmacro
+
;-----------------------------------------------------------------------------
; int trellis_cabac_4x4_psy(
; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
%assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
SUB rsp, pad
DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
-%ifdef WIN64
+%if WIN64
%define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
%else
%define level_statem rsp+stack_offset+32
%define zigzagm [stack+8]
mov last_nnzm, iid
mov zigzagm, zigzagq
-%ifndef WIN64
+%if WIN64 == 0
%define orig_coefsm [stack+16]
%define quant_coefsm [stack+24]
mov orig_coefsm, orig_coefsq
mov dword levelgt1_ctxm, 9
%endif
%if psy
- movd m6, psy_trellism
+ LOAD_DUP m6, psy_trellism
%define psy_trellis m6
%elif dc
- movd m6, [unquant_mfq]
+ LOAD_DUP m6, [unquant_mfq]
paddd m6, m6
- punpcklqdq m6, m6
%define unquant_mf m6
%endif
%ifdef PIC
movzx r0, word [level_tree + r0*4]
psrld m0, 16
movd m1, [dctq + r2*SIZEOF_DCTCOEF]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
psignd m0, m1
movd [dctq + r2*SIZEOF_DCTCOEF], m0
%else
%endif
%else
mov r5d, [level_tree + r0*4]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
%else
movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
shr r5d, 16
xor r5d, r4d
sub r5d, r4d
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov [dctq + r2*SIZEOF_DCTCOEF], r5d
%else
mov [dctq + r2*SIZEOF_DCTCOEF], r5w
inc iiq
jle .writeback_loop
-%if dc
mov eax, 1
-%endif
.return:
ADD rsp, pad
RET
pxor m0, m0
mova [r10+ 0], m0
mova [r10+16], m0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova [r10+32], m0
mova [r10+48], m0
%endif
.i_loop%1:
; if( !quant_coefs[i] )
mov r6, quant_coefsm
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
%else
movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
movzx zigzagid, byte [zigzagq+iiq]
movd m0, abs_leveld
mov r6, orig_coefsm
-%ifdef HIGH_BIT_DEPTH
- movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
+%if HIGH_BIT_DEPTH
+ LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
- movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
- psrad m1, 16
+ LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
+ psrad m1, 16 ; sign_coef
%endif
punpcklqdq m0, m0 ; quant_coef
- punpcklqdq m1, m1 ; sign_coef
%if cpuflag(ssse3)
pabsd m0, m0
pabsd m2, m1 ; abs_coef
%else
%ifdef PIC
mov r10, unquant_mfm
- movd m3, [r10 + zigzagiq*4]
+ LOAD_DUP m3, [r10 + zigzagiq*4]
%else
- movd m3, [unquant_mfq + zigzagiq*4]
+ LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
%endif
- punpcklqdq m3, m3
pmuludq m0, m3
%endif
paddd m0, [pq_128]
%if dc
psllq m0, 8
%else
- movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
- punpcklqdq m5, m5
+ LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
pmuludq m0, m5
%endif
; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
; ssd1[k] -= psy_weight * psy_value;
mov r6, fenc_dctm
-%ifdef HIGH_BIT_DEPTH
- movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
+%if HIGH_BIT_DEPTH
+ LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
- movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
+ LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
psrad m3, 16 ; orig_coef
%endif
- punpcklqdq m3, m3
%if cpuflag(ssse3)
psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
%else
ABSD m3, m4
SWAP 4, 3
%endif
- movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
+ LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
pmuludq m1, psy_trellis
- punpcklqdq m1, m1
pmuludq m4, m1
psubq m0, m4
%if %1
mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
%endif
%if %1 && node_ctx == 7
+ mov r6d, levelgt1_ctxm
mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
%endif
mov r6d, [nodes_prevq + node_level_idx(j)]
add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
; n.cabac_state[levelgt1_ctx]
%if j == 7 ; && compiling support for 4:2:2
- mov r5d, levelgt1_ctxm
- %define coeff_abs_levelgt1_offs r5
+ mov r6d, levelgt1_ctxm
+ %define coeff_abs_levelgt1_offs r6
%endif
%if j == 7
movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9