x86: Fix integral_init4/8h_avx2

[x264] / common / x86 / cabac-a.asm
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm

index e0c52359ee789cb71ffefedffbef7c224c8820d6..d54480f16b7061e0e463a58fb6578dcde055eecb 100644 (file)
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -1,9 +1,11 @@
  ;*****************************************************************************
-;* cabac-a.asm: h264 encoder library
+;* cabac-a.asm: x86 cabac
  ;*****************************************************************************
-;* Copyright (C) 2008 x264 project
+;* Copyright (C) 2008-2015 x264 project
  ;*
-;* Author: Loren Merritt <lorenm@u.washington.edu>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Fiona Glaser <fiona@x264.com>
+;*          Holger Lubitz <holger@lubitz.org>
  ;*
  ;* This program is free software; you can redistribute it and/or modify
  ;* it under the terms of the GNU General Public License as published by
@@ -17,34 +19,76 @@
  ;*
  ;* You should have received a copy of the GNU General Public License
  ;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
  ;*****************************************************************************
  
  %include "x86inc.asm"
+%include "x86util.asm"
  
  SECTION_RODATA
  
-SECTION .text
-
-cextern x264_cabac_range_lps
-cextern x264_cabac_transition
-cextern x264_cabac_renorm_shift
+coeff_abs_level1_ctx:       db 1, 2, 3, 4, 0, 0, 0, 0
+coeff_abs_levelgt1_ctx:     db 5, 5, 5, 5, 6, 7, 8, 9
+coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
+                            db 4, 4, 4, 4, 5, 6, 7, 7
  
-%macro DEF_TMP 16
-    %rep 8
-        %define t%1d r%9d
-        %define t%1b r%9b
-        %define t%1  r%9
+%if ARCH_X86_64
+%macro COEFF_LAST_TABLE 17 ; args 1-3: cpu suffixes for the 4-coeff / 64-coeff / other variants; args 4-17: coefficient count of each of the 14 entries
+    %define funccpu1 %1
+    %define funccpu2 %2
+    %define funccpu3 %3
+    %rep 14
+        %ifidn %4, 4
+            dq mangle(x264_coeff_last%4_ %+ funccpu1) ; coeff_last4 entries use the first suffix
+        %elifidn %4, 64
+            dq mangle(x264_coeff_last%4_ %+ funccpu2) ; coeff_last64 entries use the second suffix
+        %else
+            dq mangle(x264_coeff_last%4_ %+ funccpu3) ; every other size uses the third suffix
+        %endif
          %rotate 1
      %endrep
  %endmacro
  
-; t3 must be ecx, since it's used for shift.
-%ifdef ARCH_X86_64
-    DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
+cextern coeff_last4_mmx2
+cextern coeff_last4_mmx2_lzcnt
+cextern coeff_last15_sse2
+cextern coeff_last15_sse2_lzcnt
+cextern coeff_last16_sse2
+cextern coeff_last16_sse2_lzcnt
+cextern coeff_last64_sse2
+cextern coeff_last64_sse2_lzcnt
+cextern coeff_last64_avx2_lzcnt
+
+%ifdef PIC
+SECTION .data ; NOTE(review): presumably because the dq entries below are absolute addresses that need relocation under PIC - confirm
+%endif
+coeff_last_sse2:       COEFF_LAST_TABLE       mmx2,       sse2,       sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%endif
+
+SECTION .text
+
+cextern cabac_range_lps
+cextern cabac_transition
+cextern cabac_renorm_shift
+cextern cabac_entropy
+cextern cabac_size_unary
+cextern cabac_transition_unary
+cextern significant_coeff_flag_offset
+cextern significant_coeff_flag_offset_8x8
+cextern last_coeff_flag_offset
+cextern last_coeff_flag_offset_8x8
+cextern coeff_abs_level_m1_offset
+cextern count_cat_m1
+cextern cabac_encode_ue_bypass
+
+%if ARCH_X86_64
      %define pointer resq
  %else
-    DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
      %define pointer resd
  %endif
  
@@ -58,111 +102,655 @@ struc cb
      .end: pointer 1
      align 16, resb 1
      .bits_encoded: resd 1
-    .state: resb 460
+    .state: resb 1024
  endstruc
  
-%macro LOAD_GLOBAL 4
-%ifdef PIC64
-    ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
-    lea   r11, [%2 GLOBAL]
-    %ifnidn %3, 0
-    add   r11, %3
-    %endif
-    movzx %1, byte [r11+%4]
-%elifdef PIC32
-    %ifnidn %3, 0
-    lea   %1, [%3+%4]
-    movzx %1, byte [%2+%1 GLOBAL]
+%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2 (defaults to 0), tmp (scratch, only used under PIC when off2 is given); zero-extends one byte from base+off1+off2 into dst
+%ifdef PIC
+    %ifidn %4, 0
+        movzx %1, byte [%2+%3+r7-$$] ; r7 must already hold the address of $$, so base-$$+r7 is the runtime address of base
      %else
-    movzx %1, byte [%2+%3+%4 GLOBAL]
+        lea   %5, [r7+%4] ; fold off2 into the r7 base first
+        movzx %1, byte [%2+%3+%5-$$]
      %endif
  %else
      movzx %1, byte [%2+%3+%4]
  %endif
  %endmacro
  
-cglobal x264_cabac_encode_decision_asm, 0,7
-    movifnidn t0d, r0m
+%macro CABAC 1 ; arg: name suffix of the generated functions (instantiated below as asm and bmi2)
+; t3 must be ecx, since it's used for shift.
+%if WIN64
+    DECLARE_REG_TMP 3,1,2,0,5,6,4,4
+%elif ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,6
+%else
+    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
+%endif
+
+cglobal cabac_encode_decision_%1, 1,7
      movifnidn t1d, r1m
-    picgetgot t2
      mov   t5d, [r0+cb.range]
-    movzx t3d, byte [r0+cb.state+t1]
-    mov   t4d, t5d
+    movzx t6d, byte [r0+cb.state+t1] ; t6 = state byte of context t1
+    movifnidn t0,  r0 ; WIN64
+    mov   t4d, ~1
+    mov   t3d, t5d ; t3 = range
+    and   t4d, t6d ; t4 = state with its low bit cleared
      shr   t5d, 6
-    and   t5d, 3
-    LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
-    sub   t4d, t5d
-    mov   t6d, t3d
-    shr   t6d, 6
-%ifdef PIC32
-    cmp   t6d, r2m
-%else
      movifnidn t2d, r2m
-    cmp   t6d, t2d
+%if WIN64
+    PUSH r7
  %endif
-    mov   t6d, [r0+cb.low]
-    lea   t7,  [t6+t4]
-    cmovne t4d, t5d
-    cmovne t6d, t7d
-%ifdef PIC32
-    mov   t1,  r2m
-    LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
-%else
-    LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
+%ifdef PIC
+    lea    r7, [$$] ; base register expected by LOAD_GLOBAL
  %endif
-    movifnidn t1d, r1m
-    mov   [r0+cb.state+t1], t3b
-.renorm:
-    mov   t3d, t4d
+    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 ; t5 = range_lps byte at [(state&~1)*2 + (range>>6) - 4]; NOTE(review): the -4 bias presumably relies on range>>6 being 4..7 - confirm
+    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 ; t4 = cabac_transition[state*2 + bit], the next state
+    and   t6d, 1 ; t6 = low bit of the state (presumably the MPS - confirm)
+    sub   t3d, t5d ; t3 = range - range_lps
+    cmp   t6d, t2d ; does the coded bit equal the state's low bit?
+    mov   t6d, [t0+cb.low]
+    lea    t2, [t6+t3] ; t2 = low + (range - range_lps)
+    cmovne t3d, t5d ; bit differs: range = range_lps
+    cmovne t6d, t2d ; bit differs: low += range - range_lps
+    mov   [t0+cb.state+t1], t4b ; store the next state
+;cabac_encode_renorm
+    mov   t4d, t3d
+%ifidn %1, bmi2
+    lzcnt t3d, t3d
+    sub   t3d, 23 ; shift = lzcnt(range) - 23
+    shlx  t4d, t4d, t3d
+    shlx  t6d, t6d, t3d
+%else
      shr   t3d, 3
-    LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+    LOAD_GLOBAL t3d, cabac_renorm_shift, t3 ; shift = cabac_renorm_shift[range>>3]
      shl   t4d, t3b
      shl   t6d, t3b
-    add   t3d, [r0+cb.queue]
-    mov   [r0+cb.range], t4d
-    mov   [r0+cb.low], t6d
-    mov   [r0+cb.queue], t3d
-    cmp   t3d, 8
+%endif
+%if WIN64
+    POP r7
+%endif
+    mov   [t0+cb.range], t4d
+    add   t3d, [t0+cb.queue] ; queue += shift; the flags of this add drive the jge below
+    jge cabac_putbyte_%1
+.update_queue_low:
+    mov   [t0+cb.low], t6d
+    mov   [t0+cb.queue], t3d
+    RET
+
+cglobal cabac_encode_bypass_%1, 2,3
+    mov       t7d, [r0+cb.low]
+    and       r1d, [r0+cb.range] ; r1 = range masked by the caller's second argument
+    lea       t7d, [t7*2+r1] ; low = low*2 + masked range
+    movifnidn  t0, r0 ; WIN64
+    mov       t3d, [r0+cb.queue]
+    inc       t3d
+%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
+    jge cabac_putbyte_%1
+%else
      jge .putbyte
-.ret:
-    REP_RET
+%endif
+    mov   [t0+cb.low], t7d
+    mov   [t0+cb.queue], t3d
+    RET
+%if ARCH_X86_64 == 0
  .putbyte:
+    PROLOGUE 0,7
+    movifnidn t6d, t7d ; putbyte expects low in t6
+    jmp cabac_putbyte_%1
+%endif
+
+%ifnidn %1,bmi2
+cglobal cabac_encode_terminal_%1, 1,3
+    sub  dword [r0+cb.range], 2
+; shortcut: the renormalization shift in terminal
+; can only be 0 or 1 and is zero over 99% of the time.
+    test dword [r0+cb.range], 0x100
+    je .renorm
+    RET
+.renorm:
+    shl  dword [r0+cb.low], 1
+    shl  dword [r0+cb.range], 1
+    inc  dword [r0+cb.queue]
+    jge .putbyte
+    RET
+.putbyte:
+    PROLOGUE 0,7
+    movifnidn t0, r0 ; WIN64
+    mov t3d, [r0+cb.queue] ; load the registers putbyte expects: t3 = queue
+    mov t6d, [t0+cb.low] ; ... and t6 = low; execution falls through into putbyte
+%endif
+
+cabac_putbyte_%1:
      ; alive: t0=cb t3=queue t6=low
-    add   t3d, 2
-    mov   t1d, 1
+%if WIN64
+    DECLARE_REG_TMP 3,6,1,0,2,5,4
+%endif
+%ifidn %1, bmi2
+    add   t3d, 10
+    shrx  t2d, t6d, t3d ; out = low >> (queue+10)
+    bzhi  t6d, t6d, t3d ; keep only the low queue+10 bits of low
+    sub   t3d, 18 ; net effect: queue -= 8
+%else
+    mov   t1d, -1
+    add   t3d, 10
      mov   t2d, t6d
      shl   t1d, t3b
      shr   t2d, t3b ; out
-    dec   t1d
-    sub   t3d, 10
+    not   t1d ; t1 = mask of the low queue+10 bits
+    sub   t3d, 18 ; net effect: queue -= 8
      and   t6d, t1d
+%endif
+    mov   t5d, [t0+cb.bytes_outstanding]
      cmp   t2b, 0xff ; FIXME is a 32bit op faster?
-    mov   [r0+cb.queue], t3d
-    mov   [r0+cb.low], t6d
-    mov   t1d, t2d
-    mov   t4,  [r0+cb.p]
-    je .postpone
-    mov   t5d, [r0+cb.bytes_outstanding]
-    shr   t1d, 8 ; carry
-    lea   t6, [t4+t5+1]
-    cmp   t6, [r0+cb.end]
-    jge .ret
-    add   [t4-1], t1b
-    test  t5d, t5d
-    jz .no_outstanding
-    dec   t1d
+    jz    .postpone
+    mov    t1, [t0+cb.p]
+    add   [t1-1], t2h ; add bits 8-15 of out (the carry) to the previously written byte
+    dec   t2h ; carry-1 is the fill value for outstanding bytes (presumably 0xff without carry, 0x00 with - confirm t2h is 0 or 1)
  .loop_outstanding:
-    mov   [t4], t1b
-    inc   t4
+    mov   [t1], t2h
+    inc   t1
      dec   t5d
-    jg .loop_outstanding
-.no_outstanding:
-    mov   [t4], t2b
-    inc   t4
-    mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
-    mov   [r0+cb.p], t4
-    RET
+    jge .loop_outstanding
+    mov   [t1-1], t2b ; the loop ran bytes_outstanding+1 times; its last byte is replaced by the low byte of out
+    mov   [t0+cb.p], t1
  .postpone:
-    inc   dword [r0+cb.bytes_outstanding]
+    inc   t5d ; fall-through path: -1 becomes 0; postponed path: bytes_outstanding+1
+    mov   [t0+cb.bytes_outstanding], t5d
+    jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
+%endmacro
+
+CABAC asm
+CABAC bmi2
+
+; %1 = label name (infix of the local labels .%1_gt1, .%1_escape, .%1_gt1_end, .%1_end)
+; %2 = node_ctx init? (nonzero: node_ctx is 0, so the table values for index 0 -- ctx 1/5, next node_ctx 1/4 -- are hard-coded)
+%macro COEFF_ABS_LEVEL_GT1 2
+%if %2
+    %define ctx 1
+%else
+    movzx  r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
+    %define ctx r11
+%endif
+    movzx   r9d, byte [r8+ctx] ; r9 = cb->state[ctx_level+ctx]
+; if( coeff_abs > 1 )
+    cmp     r1d, 1
+    jg .%1_gt1
+; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
+    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL] ; r10 = next state after coding a 0
+    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL] ; r9 = size of coding a 0 in this state
+    lea     r0d, [r0+r9+256] ; bits += size + 256; NOTE(review): the extra 256 is presumably one sign bit in the <<8 scale used below - confirm
+    mov [r8+ctx], r10b
+%if %2
+    mov     r2d, 1
+%else
+    movzx   r2d, byte [coeff_abs_level_transition+r2 GLOBAL] ; node_ctx = first row of coeff_abs_level_transition
+%endif
+    jmp .%1_end
+
+.%1_gt1:
+; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
+    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL] ; +1 selects the next state after coding a 1
+    xor     r9d, 1 ; cabac_entropy is indexed by state^bit
+    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
+    mov [r8+ctx], r10b
+    add     r0d, r9d
+%if %2
+    %define ctx 5
+%else
+    movzx  r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
+    %define ctx r11
+%endif
+; if( coeff_abs < 15 )
+    cmp     r1d, 15
+    jge .%1_escape
+    shl     r1d, 7 ; coeff_abs*128: rows are 128 entries wide, the -128/-256 displacements below select row coeff_abs-1
+; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
+    movzx   r9d, byte [r8+ctx]
+    add     r9d, r1d
+    movzx  r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
+; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
+    movzx   r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
+    mov [r8+ctx], r10b
+    add     r0d, r9d
+    jmp .%1_gt1_end
+
+.%1_escape:
+; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
+    movzx   r9d, byte [r8+ctx]
+    movzx  r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
+; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
+    movzx   r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
+    add     r0d, r9d
+    mov [r8+ctx], r10b
+    sub     r1d, 14 ; r1 = coeff_abs-14, at least 1 on this path
+%if cpuflag(lzcnt)
+    lzcnt   r9d, r1d
+    xor     r9d, 0x1f ; 31-lzcnt = index of the highest set bit, same result as the bsr branch
+%else
+    bsr     r9d, r1d
+%endif
+; bs_size_ue_big(coeff_abs-15)<<8
+    shl     r9d, 9 ; 2*ilog2(coeff_abs-14) in the <<8 scale
+; (ilog2(coeff_abs-14)+1) << 8
+    lea     r0d, [r0+r9+256]
+.%1_gt1_end:
+%if %2
+    mov     r2d, 4
+%else
+    movzx   r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL] ; node_ctx = second row of coeff_abs_level_transition
+%endif
+.%1_end:
+%endmacro
+
+%macro LOAD_DCTCOEF 1 ; arg: destination register; loads coefficient number r6 from the array at dct
+%if HIGH_BIT_DEPTH
+    mov     %1, [dct+r6*4] ; 4-byte coefficients
+%else
+    movzx   %1, word [dct+r6*2] ; 2-byte coefficients, zero-extended (the RD code points dct at its abs()'d stack copy)
+%endif
+%endmacro
+
+%macro ABS_DCTCOEFS 2 ; args: source pointer, coefficient count (handled 16 at a time); writes the absolute values to the stack at rsp
+%assign i 0
+%rep %2/16
+%if HIGH_BIT_DEPTH
+    ABSD   m0, [%1+ 0+i*64], m4 ; 16 coefficients of 4 bytes = 64 bytes per iteration
+    ABSD   m1, [%1+16+i*64], m5
+    ABSD   m2, [%1+32+i*64], m4
+    ABSD   m3, [%1+48+i*64], m5
+    mova [rsp+ 0+i*64], m0
+    mova [rsp+16+i*64], m1
+    mova [rsp+32+i*64], m2
+    mova [rsp+48+i*64], m3
+%else
+    ABSW   m0, [%1+ 0+i*32], m2 ; 16 coefficients of 2 bytes = 32 bytes per iteration
+    ABSW   m1, [%1+16+i*32], m3
+    mova [rsp+ 0+i*32], m0
+    mova [rsp+16+i*32], m1
+%endif
+%assign i i+1
+%endrep
+%endmacro
+
+%macro SIG_OFFSET 1 ; arg: 8x8 mode flag; emits nothing when it is 0
+%if %1
+    movzx  r11d, byte [r4+r6] ; r11 = byte number r6 of the table r4 points at (the 8x8 sig offset table in the RD code)
+%endif
+%endmacro
+
+%macro LAST_OFFSET 1 ; arg: 8x8 mode flag; emits nothing when it is 0
+%if %1
+    movzx  r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL] ; r11 = last_coeff_flag_offset_8x8[r6]
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
+;                                                   int ctx_block_cat, x264_cabac_t *cb );
+;-----------------------------------------------------------------------------
+
+;%1 = 8x8 mode, %2 = coeff_last function table (indexed by ctx_block_cat)
+%macro CABAC_RESIDUAL_RD 2
+%if %1
+    %define func cabac_block_residual_8x8_rd_internal
+    %define maxcoeffs 64
+    %define dct rsp
+%else
+    %define func cabac_block_residual_rd_internal
+    %define maxcoeffs 16
+    %define dct r4
+%endif
+
+%ifdef PIC
+    cglobal func, 4,13
+    lea     r12, [$$]
+    %define GLOBAL +r12-$$
+%else
+    cglobal func, 4,12
+    %define GLOBAL
+%endif
+
+%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
+    SUB     rsp, pad                                          ; room for the abs()'d coefficient copy
+    shl     r1d, 4                                            ; MB_INTERLACED*16
+%if %1
+    lea      r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]     ; r4 = sig offset 8x8
+%endif
+    add     r1d, r2d
+    movzx   r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL]    ; r5 = ctx_sig
+    movzx   r7d, word [last_coeff_flag_offset+r1*2 GLOBAL]           ; r7 = ctx_last
+    movzx   r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]        ; r8 = ctx_level
+
+; abs() all the coefficients; copy them to the stack to avoid
+; changing the originals.
+; overreading is okay; it's all valid aligned data anyways.
+%if %1
+    ABS_DCTCOEFS r0, 64
+%else
+    mov      r4, r0                                           ; r4 = dct
+    mov      r6, ~SIZEOF_DCTCOEF
+    and      r6, r4                                           ; handle AC coefficient case
+    ABS_DCTCOEFS r6, 16
+    sub      r4, r6                                           ; calculate our new dct pointer
+    add      r4, rsp                                          ; restore AC coefficient offset
+%endif
+    mov      r1, [%2+gprsize*r2 GLOBAL]                       ; r1 = coeff_last function for ctx_block_cat
+; for improved OOE performance, run coeff_last on the original coefficients.
+    call     r1                                               ; coeff_last[ctx_block_cat]( dct )
+; we know on 64-bit that the SSE2 versions of this function only
+; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
+; don't need r2 in 8x8 mode.
+    mov     r0d, [r3+cb.bits_encoded]                         ; r0 = cabac.f8_bits_encoded
+; pre-add some values to simplify addressing
+    add      r3, cb.state
+    add      r5, r3
+    add      r7, r3
+    add      r8, r3                                           ; precalculate cabac state pointers
+
+; if( last != count_cat_m1[ctx_block_cat] )
+%if %1
+    cmp     r6b, 63                                           ; 8x8 mode compares against the constant 63 instead of reading count_cat_m1
+%else
+    cmp     r6b, [count_cat_m1+r2 GLOBAL]
+%endif
+    je .skip_last_sigmap
+
+; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
+; so we'll use r11 for this.
+%if %1
+    %define siglast_ctx r11
+%else
+    %define siglast_ctx r6
+%endif
+
+; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
+; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
+    SIG_OFFSET %1
+    movzx   r1d, byte [r5+siglast_ctx]
+    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
+    xor     r1d, 1
+    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
+    mov [r5+siglast_ctx], r9b
+    add     r0d, r1d
+
+    LAST_OFFSET %1
+    movzx   r1d, byte [r7+siglast_ctx]
+    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
+    xor     r1d, 1
+    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
+    mov [r7+siglast_ctx], r9b
+    add     r0d, r1d
+.skip_last_sigmap:
+    LOAD_DCTCOEF r1d
+    COEFF_ABS_LEVEL_GT1 last, 1
+; for( int i = last-1 ; i >= 0; i-- )
+    dec     r6d
+    jl .end
+.coeff_loop:
+    LOAD_DCTCOEF r1d
+; if( l[i] )
+    SIG_OFFSET %1
+    movzx   r9d, byte [r5+siglast_ctx]
+    test    r1d, r1d
+    jnz .coeff_nonzero
+; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
+    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
+    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
+    mov [r5+siglast_ctx], r10b
+    add     r0d, r9d
+    dec     r6d
+    jge .coeff_loop
+    jmp .end
+.coeff_nonzero:
+; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
+    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
+    xor     r9d, 1
+    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
+    mov [r5+siglast_ctx], r10b
+    add     r0d, r9d
+; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
+    LAST_OFFSET %1
+    movzx   r9d, byte [r7+siglast_ctx]
+    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
+    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
+    mov [r7+siglast_ctx], r10b
+    add     r0d, r9d
+    COEFF_ABS_LEVEL_GT1 coeff, 0
+    dec     r6d
+    jge .coeff_loop
+.end:
+    mov [r3+cb.bits_encoded-cb.state], r0d                    ; r3 was advanced by cb.state above, hence the -cb.state
+    ADD     rsp, pad
      RET
+%endmacro
  
+%if ARCH_X86_64
+INIT_XMM sse2
+CABAC_RESIDUAL_RD 0, coeff_last_sse2 ; cabac_block_residual_rd_internal (16 coefficients)
+CABAC_RESIDUAL_RD 1, coeff_last_sse2 ; cabac_block_residual_8x8_rd_internal (64 coefficients)
+INIT_XMM sse2,lzcnt
+CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
+CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
+INIT_XMM ssse3
+CABAC_RESIDUAL_RD 0, coeff_last_sse2 ; the ssse3 builds reuse the sse2 coeff_last tables
+CABAC_RESIDUAL_RD 1, coeff_last_sse2
+INIT_XMM ssse3,lzcnt
+CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
+CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
+;                                                int ctx_block_cat, x264_cabac_t *cb );
+;-----------------------------------------------------------------------------
+
+%macro CALL_CABAC 0 ; calls cabac_encode_decision for the active cpu flags; callers set r0 = cb, r1d = ctx, r2d = bit
+%if cpuflag(bmi2)
+    call cabac_encode_decision_bmi2
+%else
+    call cabac_encode_decision_asm
+%endif
+%if WIN64 ; move cabac back (on WIN64 the decision code uses r0 as its shift-count temp; r3 still holds cb)
+    mov r0, r3
+%endif
+%endmacro
+
+; %1 = 8x8 mode
+; %2 = dct register (scratch register that receives the current coefficient l[i])
+; %3 = countcat (the index at which the loop stops, count_cat_m1 for the block)
+; %4 = name (optional infix for the .sigmap_ labels, e.g. _8x8)
+%macro SIGMAP_LOOP 3-4
+.sigmap_%4loop:
+%if HIGH_BIT_DEPTH
+    mov      %2, [dct+r10*4]
+%else
+    movsx    %2, word [dct+r10*2]
+%endif
+%if %1
+    movzx   r1d, byte [sigoff_8x8 + r10]
+    add     r1d, sigoffd
+%else
+    lea     r1d, [sigoffd + r10d]
+%endif
+    test     %2, %2
+    jz .sigmap_%4zero               ; if( l[i] )
+    inc coeffidxd
+    mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i];
+    mov     r2d, 1
+    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
+%if %1
+    movzx   r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
+    add     r1d, lastoffd
+%else
+    lea     r1d, [lastoffd + r10d]
+%endif
+    cmp    r10d, lastm              ; if( i == last )
+    je .sigmap_%4last
+    xor     r2d, r2d
+    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
+    jmp .sigmap_%4loop_endcheck
+.sigmap_%4zero:
+    xor     r2d, r2d
+    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
+.sigmap_%4loop_endcheck:
+    inc    r10d                     ; r10 = i
+    cmp    r10d, %3
+    jne .sigmap_%4loop              ; if( ++i == count_m1 )
+%if HIGH_BIT_DEPTH
+    mov      %2, [dct+r10*4]
+%else
+    movsx    %2, word [dct+r10*2]
+%endif
+    inc coeffidxd
+    mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i]
+    jmp .sigmap_%4end
+.sigmap_%4last:                     ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
+    mov     r2d, 1
+    CALL_CABAC
+.sigmap_%4end:
+%if %1==0
+    jmp .level_loop_start           ; the non-8x8 expansion must skip the 8x8 expansion that follows it
+%endif
+%endmacro
+
+%macro CABAC_RESIDUAL 1 ; arg: coeff_last function table (indexed by ctx_block_cat)
+cglobal cabac_block_residual_internal, 4,15
+%ifdef PIC
+; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
+    lea     r7, [$$]
+    %define lastm [rsp+4*1]
+    %define GLOBAL +r7-$$
+%else
+    %define lastm r7d
+    %define GLOBAL
+%endif
+%assign pad gprsize+4*2+4*64-(stack_offset&15)
+    SUB     rsp, pad ; two dword slots (leveloffm, lastm) followed by the 64-entry dword coeffs array
+    shl     r1d, 4 ; b_interlaced*16
+
+    %define sigoffq r8
+    %define sigoffd r8d
+    %define lastoffq r9
+    %define lastoffd r9d
+    %define leveloffq r10
+    %define leveloffd r10d
+    %define leveloffm [rsp+4*0]
+    %define countcatd r11d
+    %define sigoff_8x8 r12
+    %define coeffidxq r13
+    %define coeffidxd r13d
+    %define dct r14
+    %define coeffs rsp+4*2
+
+    lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
+    add     r1d, r2d
+    movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
+    movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
+    movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
+    movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
+    mov coeffidxd, -1 ; coeff_idx = -1; it is incremented before each store
+    mov     dct, r0
+    mov leveloffm, leveloffd ; spill: r10 is reused as the index i by the sigmap loops
+
+    mov      r1, [%1+gprsize*r2 GLOBAL] ; r1 = coeff_last function for ctx_block_cat
+    call     r1
+    mov   lastm, eax ; last = value returned by coeff_last
+; put cabac in r0; needed for cabac_encode_decision
+    mov      r0, r3
+
+    xor    r10d, r10d ; i = 0
+    cmp countcatd, 63
+    je .sigmap_8x8
+    SIGMAP_LOOP 0, r12d, countcatd,
+.sigmap_8x8:
+    SIGMAP_LOOP 1, r11d, 63, _8x8 ; countcatd (r11d) is replaced by the constant 63 here, so r11d serves as the coefficient register
+.level_loop_start:
+; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
+    %define nodectxq r8
+    %define nodectxd r8d
+    mov leveloffd, leveloffm ; reload the value spilled before the sigmap loops
+    xor nodectxd, nodectxd ; node_ctx = 0
+.level_loop:
+    mov     r9d, [coeffs+coeffidxq*4] ; r9 = coeffs[coeff_idx]
+    mov    r11d, r9d
+    sar    r11d, 31 ; r11 = sign mask: 0 or -1
+    add     r9d, r11d
+    movzx   r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
+    xor     r9d, r11d ; r9 = abs(coeff): (x + mask) ^ mask
+    add     r1d, leveloffd ; r1 = ctx_level + coeff_abs_level1_ctx[node_ctx]
+    cmp     r9d, 1
+    jg .level_gt1
+    xor     r2d, r2d
+    CALL_CABAC
+    movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
+    jmp .level_sign
+.level_gt1:
+    mov     r2d, 1
+    CALL_CABAC
+    movzx  r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
+    add    r14d, leveloffd ; r14 = ctx_level + coeff_abs_levelgt1_ctx[node_ctx]
+    cmp     r9d, 15
+    mov    r12d, 15
+    cmovl  r12d, r9d
+    sub    r12d, 2 ; r12 = min(abs, 15) - 2 = number of 1 bits to code with ctx r14
+    jz .level_eq2
+.level_gt1_loop:
+    mov     r1d, r14d
+    mov     r2d, 1
+    CALL_CABAC
+    dec    r12d
+    jg .level_gt1_loop
+    cmp     r9d, 15
+    jge .level_bypass ; abs >= 15: no terminating 0, the remainder is coded by cabac_encode_ue_bypass
+.level_eq2:
+    mov     r1d, r14d
+    xor     r2d, r2d
+    CALL_CABAC
+    jmp .level_gt1_end
+.level_bypass:
+    lea     r2d, [r9d-15] ; third argument: abs - 15
+    xor     r1d, r1d ; second argument: 0
+    push     r0 ; keep cb across the call
+; we could avoid this if we implemented it in asm, but I don't feel like that
+; right now.
+%if UNIX64
+    push     r7 ; NOTE(review): r7/r8 are presumably not preserved by the callee under the UNIX64 register mapping - confirm
+    push     r8
+%else
+    sub      rsp, 32 ; shadow space
+%endif
+    call cabac_encode_ue_bypass
+%if UNIX64
+    pop      r8
+    pop      r7
+%else
+    add      rsp, 32
+%endif
+    pop      r0
+.level_gt1_end:
+    movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
+.level_sign:
+    mov     r1d, r11d ; pass the sign mask; cabac_encode_bypass ANDs it with cb.range
+%if cpuflag(bmi2)
+    call cabac_encode_bypass_bmi2
+%else
+    call cabac_encode_bypass_asm
+%endif
+%if WIN64
+    mov      r0, r3
+%endif
+    dec coeffidxd
+    jge .level_loop ; walk the collected coefficients from the last stored one down to index 0
+    ADD     rsp, pad
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse2
+CABAC_RESIDUAL coeff_last_sse2
+INIT_XMM sse2,lzcnt
+CABAC_RESIDUAL coeff_last_sse2_lzcnt
+INIT_XMM avx2,bmi2
+CABAC_RESIDUAL coeff_last_avx2_lzcnt ; with cpuflag(bmi2) set, this instance calls the _bmi2 cabac functions
+%endif