git.sesse.net Git - x264/blob - common/x86/cabac-a.asm

   1 ;*****************************************************************************
   2 ;* cabac-a.asm: x86 cabac
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2008-2015 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*          Fiona Glaser <fiona@x264.com>
   8 ;*          Holger Lubitz <holger@lubitz.org>
   9 ;*
  10 ;* This program is free software; you can redistribute it and/or modify
  11 ;* it under the terms of the GNU General Public License as published by
  12 ;* the Free Software Foundation; either version 2 of the License, or
  13 ;* (at your option) any later version.
  14 ;*
  15 ;* This program is distributed in the hope that it will be useful,
  16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;* GNU General Public License for more details.
  19 ;*
  20 ;* You should have received a copy of the GNU General Public License
  21 ;* along with this program; if not, write to the Free Software
  22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23 ;*
  24 ;* This program is also available under a commercial proprietary license.
  25 ;* For more information, contact us at licensing@x264.com.
  26 ;*****************************************************************************
  27
  28 %include "x86inc.asm"
  29 %include "x86util.asm"
  30
  31 SECTION_RODATA
  32
  33 coeff_abs_level1_ctx:       db 1, 2, 3, 4, 0, 0, 0, 0 ; [node_ctx] -> context offset (added to ctx_level) of the "coeff_abs > 1" decision
  34 coeff_abs_levelgt1_ctx:     db 5, 5, 5, 5, 6, 7, 8, 9 ; [node_ctx] -> context offset (added to ctx_level) of the remaining magnitude bins
  35 coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 ; [node_ctx]   -> next node_ctx after a coefficient that took the "not > 1" path
  36                             db 4, 4, 4, 4, 5, 6, 7, 7 ; [8+node_ctx] -> next node_ctx after a coefficient that took the "> 1" path
  37
  38 %if ARCH_X86_64
  39 %macro COEFF_LAST_TABLE 17
     ; Emit a table of 14 qword pointers to x264_coeff_last<N>_<cpu> functions.
     ;   %1       = cpu suffix used for entries whose N is 4
     ;   %2       = cpu suffix used for entries whose N is 64
     ;   %3       = cpu suffix used for every other N
     ;   %4..%17  = the 14 values of N, one per table entry, in table order
     ; The suffixes are captured in defines first because %rotate shifts the
     ; parameter list, so %4 names the "current" N on every iteration.
  40     %define funccpu1 %1
  41     %define funccpu2 %2
  42     %define funccpu3 %3
  43     %rep 14
  44         %ifidn %4, 4
  45             dq mangle(x264_coeff_last%4_ %+ funccpu1)
  46         %elifidn %4, 64
  47             dq mangle(x264_coeff_last%4_ %+ funccpu2)
  48         %else
  49             dq mangle(x264_coeff_last%4_ %+ funccpu3)
  50         %endif
  51         %rotate 1
  52     %endrep
  53 %endmacro
  54
  55 cextern coeff_last4_mmx2
     ; coeff_last implementations referenced by the pointer tables below.
  56 cextern coeff_last4_mmx2_lzcnt
  57 cextern coeff_last15_sse2
  58 cextern coeff_last15_sse2_lzcnt
  59 cextern coeff_last16_sse2
  60 cextern coeff_last16_sse2_lzcnt
  61 cextern coeff_last64_sse2
  62 cextern coeff_last64_sse2_lzcnt
  63 cextern coeff_last64_avx2_lzcnt
  64
  65 %ifdef PIC
     ; NOTE(review): in PIC builds the tables are placed in .data rather than the
     ; read-only section, presumably because they hold absolute function addresses
     ; that need load-time relocation -- confirm.
  66 SECTION .data
  67 %endif
     ; One 14-entry function-pointer table per instruction-set variant. The residual
     ; functions in this file index them with ctx_block_cat ([table+gprsize*r2]).
     ; Arguments: suffix for N==4, suffix for N==64, suffix for other N, then the 14 sizes.
  68 coeff_last_sse2:       COEFF_LAST_TABLE       mmx2,       sse2,       sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
  69 coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
  70 coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
  71 %endif
  72
  73 SECTION .text
  74
  75 cextern cabac_range_lps
  76 cextern cabac_transition
  77 cextern cabac_renorm_shift
  78 cextern cabac_entropy
  79 cextern cabac_size_unary
  80 cextern cabac_transition_unary
  81 cextern significant_coeff_flag_offset
  82 cextern significant_coeff_flag_offset_8x8
  83 cextern last_coeff_flag_offset
  84 cextern last_coeff_flag_offset_8x8
  85 cextern coeff_abs_level_m1_offset
  86 cextern count_cat_m1
  87 cextern cabac_encode_ue_bypass
  88
  89 %if ARCH_X86_64
     ; "pointer" reserves one native pointer-sized slot; used for the pointer fields of cb.
  90     %define pointer resq
  91 %else
  92     %define pointer resd
  93 %endif
  94
  95 struc cb
     ; Field offsets of the CABAC encoder context that every function in this file
     ; receives as "cb".
     ; NOTE(review): this presumably has to mirror the C-side x264_cabac_t layout
     ; exactly -- verify against the C header whenever either side changes.
  96     .low: resd 1                ; arithmetic coder "low" value
  97     .range: resd 1              ; arithmetic coder range
  98     .queue: resd 1              ; shift counter; cabac_putbyte runs when it becomes >= 0
  99     .bytes_outstanding: resd 1  ; count of postponed 0xff output bytes (see cabac_putbyte)
 100     .start: pointer 1           ; not referenced in this file
 101     .p: pointer 1               ; output write pointer, advanced by cabac_putbyte
 102     .end: pointer 1             ; not referenced in this file
 103     align 16, resb 1            ; pad so the next field starts at a 16-byte-aligned offset
 104     .bits_encoded: resd 1       ; bit-cost accumulator read/written by the *_rd_internal functions
 105     .state: resb 1024           ; one context-state byte per context index
 106 endstruc
 107
 108 %macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
     ; dst = zero-extended byte at [base + off1 + off2].
     ; off2 defaults to 0. tmp is only used (and clobbered) in PIC builds with a
     ; nonzero off2; callers in this file pass a register that is dead at that point.
     ; PIC builds address the table relative to r7, which the caller must already
     ; have loaded with the address of $$ (lea r7, [$$]).
 109 %ifdef PIC
 110     %ifidn %4, 0
 111         movzx %1, byte [%2+%3+r7-$$]
 112     %else
     ; fold off2 into the PIC base first so the final address still has only two registers
 113         lea   %5, [r7+%4]
 114         movzx %1, byte [%2+%3+%5-$$]
 115     %endif
 116 %else
 117     movzx %1, byte [%2+%3+%4]
 118 %endif
 119 %endmacro
 120
 121 %macro CABAC 1
     ; Instantiates the CABAC bit-level encoders for one variant (%1 is "asm" or "bmi2"):
     ;   cabac_encode_decision_%1, cabac_encode_bypass_%1,
     ;   cabac_encode_terminal_%1 (not emitted for bmi2),
     ; plus the shared tail cabac_putbyte_%1 that each of them jumps to when cb.queue
     ; becomes >= 0. cabac_putbyte_%1 finishes by jumping to
     ; cabac_encode_decision_%1.update_queue_low, which stores low/queue and returns.
 122 ; t3 must be ecx, since it's used for shift.
 123 %if WIN64
 124     DECLARE_REG_TMP 3,1,2,0,5,6,4,4
 125 %elif ARCH_X86_64
 126     DECLARE_REG_TMP 0,1,2,3,4,5,6,6
 127 %else
 128     DECLARE_REG_TMP 0,4,2,1,3,5,6,2
 129 %endif
 130
 131 cglobal cabac_encode_decision_%1, 1,7
     ; cabac_encode_decision( cb, ctx, b )
     ; Register roles: t0 = cb, t1 = ctx, t2 = b, t3 = range / shift count,
     ;                 t4 = scratch / new state, t5 = range_lps, t6 = state, then low.
 132     movifnidn t1d, r1m
 133     mov   t5d, [r0+cb.range]
 134     movzx t6d, byte [r0+cb.state+t1]         ; t6 = state = cb->state[ctx]
 135     movifnidn t0,  r0 ; WIN64
 136     mov   t4d, ~1
 137     mov   t3d, t5d                           ; t3 = range
 138     and   t4d, t6d                           ; t4 = state & ~1
 139     shr   t5d, 6                             ; t5 = range >> 6
 140     movifnidn t2d, r2m
 141 %if WIN64
 142     PUSH r7
 143 %endif
 144 %ifdef PIC
 145     lea    r7, [$$]
 146 %endif
     ; t5 = byte [cabac_range_lps - 4 + (range>>6) + (state&~1)*2],
     ;      i.e. 4-byte row (state>>1), column (range>>6)-4
     ; t4 = byte [cabac_transition + b + state*2], the next state for (state, b)
 147     LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
 148     LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
 149     and   t6d, 1                             ; t6 = low bit of state
 150     sub   t3d, t5d                           ; t3 = range - range_lps
 151     cmp   t6d, t2d                           ; flags: does b differ from (state & 1)?
 152     mov   t6d, [t0+cb.low]                   ; mov/lea leave the flags untouched
 153     lea    t2, [t6+t3]                       ; t2 = low + (range - range_lps)
 154     cmovne t3d, t5d                          ; if b != (state&1): range = range_lps
 155     cmovne t6d, t2d                          ; if b != (state&1): low += range - range_lps
 156     mov   [t0+cb.state+t1], t4b              ; cb->state[ctx] = next state
 157 ;cabac_encode_renorm: shift range and low left by the renormalization shift
 158     mov   t4d, t3d
 159 %ifidn %1, bmi2
     ; shift = lzcnt(range) - 23
 160     lzcnt t3d, t3d
 161     sub   t3d, 23
 162     shlx  t4d, t4d, t3d
 163     shlx  t6d, t6d, t3d
 164 %else
     ; shift = cabac_renorm_shift[range >> 3]
 165     shr   t3d, 3
 166     LOAD_GLOBAL t3d, cabac_renorm_shift, t3
 167     shl   t4d, t3b
 168     shl   t6d, t3b
 169 %endif
 170 %if WIN64
 171     POP r7
 172 %endif
 173     mov   [t0+cb.range], t4d
 174     add   t3d, [t0+cb.queue]                 ; t3 = queue + shift
 175     jge cabac_putbyte_%1                     ; queue >= 0: a byte must be emitted
 176 .update_queue_low:
     ; Shared exit, also reached from cabac_putbyte_%1: expects t0 = cb, t3 = queue, t6 = low.
 177     mov   [t0+cb.low], t6d
 178     mov   [t0+cb.queue], t3d
 179     RET
 180
 181 cglobal cabac_encode_bypass_%1, 2,3
     ; cabac_encode_bypass( cb, b ): low = low*2 + (b & range); queue++.
     ; b is ANDed with range, so it acts as a mask; the caller in this file passes 0 or -1.
 182     mov       t7d, [r0+cb.low]
 183     and       r1d, [r0+cb.range]
 184     lea       t7d, [t7*2+r1]
 185     movifnidn  t0, r0 ; WIN64
 186     mov       t3d, [r0+cb.queue]
 187     inc       t3d
 188 %if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
 189     jge cabac_putbyte_%1
 190 %else
 191     jge .putbyte
 192 %endif
 193     mov   [t0+cb.low], t7d
 194     mov   [t0+cb.queue], t3d
 195     RET
 196 %if ARCH_X86_64 == 0
 197 .putbyte:
     ; x86-32 only: t7 and t6 are different registers here, so move low into t6 as
     ; cabac_putbyte expects.
     ; NOTE(review): PROLOGUE 0,7 presumably makes the additional registers used by
     ; cabac_putbyte available (saving them as needed) -- see x86inc.
 198     PROLOGUE 0,7
 199     movifnidn t6d, t7d
 200     jmp cabac_putbyte_%1
 201 %endif
 202
 203 %ifnidn %1,bmi2
 204 cglobal cabac_encode_terminal_%1, 1,3
     ; cabac_encode_terminal( cb ): range -= 2, followed by a 0- or 1-bit renormalization.
 205     sub  dword [r0+cb.range], 2
 206 ; shortcut: the renormalization shift in terminal
 207 ; can only be 0 or 1 and is zero over 99% of the time.
 208     test dword [r0+cb.range], 0x100
 209     je .renorm                               ; bit 8 clear -> shift by one
 210     RET
 211 .renorm:
 212     shl  dword [r0+cb.low], 1
 213     shl  dword [r0+cb.range], 1
 214     inc  dword [r0+cb.queue]
 215     jge .putbyte
 216     RET
 217 .putbyte:
     ; Load the registers cabac_putbyte expects, then fall through into it.
 218     PROLOGUE 0,7
 219     movifnidn t0, r0 ; WIN64
 220     mov t3d, [r0+cb.queue]
 221     mov t6d, [t0+cb.low]
 222 %endif
 223
 224 cabac_putbyte_%1:
 225     ; alive: t0=cb t3=queue t6=low
     ; Computes out = low >> (queue+10), keeps only the low (queue+10) bits of low,
     ; and sets queue -= 8. Then either postpones the byte (low 8 bits of out == 0xff)
     ; or flushes: adds the carry (bits 8-15 of out) to the previously written byte,
     ; writes carry-1 for each outstanding byte, and writes the new byte.
 226 %if WIN64
     ; NOTE(review): the temporaries are remapped here presumably so that t2 lands in a
     ; register with an addressable high byte (t2h is used below) -- confirm against x86inc.
 227     DECLARE_REG_TMP 3,6,1,0,2,5,4
 228 %endif
 229 %ifidn %1, bmi2
 230     add   t3d, 10
 231     shrx  t2d, t6d, t3d                      ; out = low >> (queue+10)
 232     bzhi  t6d, t6d, t3d                      ; low = low with bits >= (queue+10) cleared
 233     sub   t3d, 18                            ; t3 = queue - 8
 234 %else
 235     mov   t1d, -1
 236     add   t3d, 10
 237     mov   t2d, t6d
 238     shl   t1d, t3b                           ; t1 = -1 << (queue+10)
 239     shr   t2d, t3b ; out
 240     not   t1d                                ; t1 = mask of the low (queue+10) bits
 241     sub   t3d, 18                            ; t3 = queue - 8
 242     and   t6d, t1d                           ; low &= mask
 243 %endif
 244     mov   t5d, [t0+cb.bytes_outstanding]
 245     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
 246     jz    .postpone
 247     mov    t1, [t0+cb.p]
 248     add   [t1-1], t2h                        ; propagate the carry into the last written byte
 249     dec   t2h                                ; t2h = carry - 1, the value for outstanding bytes
 250 .loop_outstanding:
     ; Runs bytes_outstanding+1 times; the extra final store is overwritten with the
     ; new output byte right after the loop. t5d ends at -1.
 251     mov   [t1], t2h
 252     inc   t1
 253     dec   t5d
 254     jge .loop_outstanding
 255     mov   [t1-1], t2b                        ; the new output byte (low 8 bits of out)
 256     mov   [t0+cb.p], t1
 257 .postpone:
     ; Postponed path: bytes_outstanding + 1. Flushed path: t5d is -1 here, so it becomes 0.
 258     inc   t5d
 259     mov   [t0+cb.bytes_outstanding], t5d
 260     jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
 261 %endmacro
 262
 263 CABAC asm  ; baseline versions: cabac_encode_{decision,bypass,terminal}_asm
 264 CABAC bmi2 ; BMI2 versions of decision/bypass (the macro emits no terminal function for bmi2)
 265
 266 ; %1 = label name (prefix that keeps the local labels of each expansion unique)
 267 ; %2 = node_ctx init? (nonzero: node_ctx is known to be 0, so its table lookups are folded into constants)
     ;
     ; Adds the bit cost of one coefficient magnitude to the running total and updates
     ; the context states, without producing any output bytes.
     ; In:    r0d = accumulated bit cost, r1d = abs(coefficient), r2 = node_ctx (ignored if %2),
     ;        r8  = address of the ctx_level state bytes (cb.state + ctx_level)
     ; Out:   r0d = updated cost, r2d = next node_ctx
     ; Clobb: r1, r9, r10, r11 (r11 only if %2 == 0), flags
 268 %macro COEFF_ABS_LEVEL_GT1 2
 269 %if %2
 270     %define ctx 1
 271 %else
 272     movzx  r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
 273     %define ctx r11
 274 %endif
 275     movzx   r9d, byte [r8+ctx]
 276 ; if( coeff_abs > 1 )
 277     cmp     r1d, 1
 278     jg .%1_gt1
 279 ; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
 280     movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
 281     movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
     ; NOTE(review): the extra +256 is presumably the cost of the sign bit (1 bit in
     ; 8-bit fixed point) -- confirm against the C reference.
 282     lea     r0d, [r0+r9+256]
 283     mov [r8+ctx], r10b
 284 %if %2
 285     mov     r2d, 1
 286 %else
 287     movzx   r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
 288 %endif
 289     jmp .%1_end
 290
 291 .%1_gt1:
 292 ; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
 293     movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
 294     xor     r9d, 1
 295     movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
 296     mov [r8+ctx], r10b
 297     add     r0d, r9d
 298 %if %2
 299     %define ctx 5
 300 %else
 301     movzx  r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
 302     %define ctx r11
 303 %endif
 304 ; if( coeff_abs < 15 )
 305     cmp     r1d, 15
 306     jge .%1_escape
     ; r1 = coeff_abs*128: rows are 128 entries wide, and the -128 / -256 displacements
     ; below turn that into row index coeff_abs-1.
 307     shl     r1d, 7
 308 ; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
 309     movzx   r9d, byte [r8+ctx]
 310     add     r9d, r1d
 311     movzx  r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
 312 ; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
 313     movzx   r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
 314     mov [r8+ctx], r10b
 315     add     r0d, r9d
 316     jmp .%1_gt1_end
 317
 318 .%1_escape:
 319 ; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
 320     movzx   r9d, byte [r8+ctx]
 321     movzx  r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
 322 ; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
 323     movzx   r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
 324     add     r0d, r9d
 325     mov [r8+ctx], r10b
 326     sub     r1d, 14
     ; r9 = index of the highest set bit of (coeff_abs-14); lzcnt ^ 0x1f gives the same
     ; value as bsr for a nonzero input.
 327 %if cpuflag(lzcnt)
 328     lzcnt   r9d, r1d
 329     xor     r9d, 0x1f
 330 %else
 331     bsr     r9d, r1d
 332 %endif
 333 ; bs_size_ue_big(coeff_abs-15)<<8
 334     shl     r9d, 9
 335 ; (ilog2(coeff_abs-14)+1) << 8
 336     lea     r0d, [r0+r9+256]
 337 .%1_gt1_end:
 338 %if %2
 339     mov     r2d, 4
 340 %else
 341     movzx   r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
 342 %endif
 343 .%1_end:
 344 %endmacro
 345
 346 %macro LOAD_DCTCOEF 1
     ; %1 = coefficient number r6 of the array named by the "dct" define (set by the
     ; enclosing macro). 16-bit coefficients are zero-extended, which is only correct
     ; for non-negative values; CABAC_RESIDUAL_RD points dct at the abs() copy.
 347 %if HIGH_BIT_DEPTH
 348     mov     %1, [dct+r6*4]
 349 %else
 350     movzx   %1, word [dct+r6*2]
 351 %endif
 352 %endmacro
 353
 354 %macro ABS_DCTCOEFS 2
     ; Store the absolute values of %2 coefficients read from [%1] at [rsp], leaving
     ; the source untouched.
     ;   %1 = source pointer, %2 = coefficient count (processed 16 at a time, %2/16 passes)
     ; Each pass handles 64 bytes (HIGH_BIT_DEPTH, dword coefficients) or 32 bytes
     ; (word coefficients). Clobbers m0-m5 (HIGH_BIT_DEPTH) or m0-m3.
     ; NOTE(review): the stores use mova, so rsp is presumably required to be aligned
     ; to the vector size here -- confirm.
 355 %assign i 0
 356 %rep %2/16
 357 %if HIGH_BIT_DEPTH
 358     ABSD   m0, [%1+ 0+i*64], m4
 359     ABSD   m1, [%1+16+i*64], m5
 360     ABSD   m2, [%1+32+i*64], m4
 361     ABSD   m3, [%1+48+i*64], m5
 362     mova [rsp+ 0+i*64], m0
 363     mova [rsp+16+i*64], m1
 364     mova [rsp+32+i*64], m2
 365     mova [rsp+48+i*64], m3
 366 %else
 367     ABSW   m0, [%1+ 0+i*32], m2
 368     ABSW   m1, [%1+16+i*32], m3
 369     mova [rsp+ 0+i*32], m0
 370     mova [rsp+16+i*32], m1
 371 %endif
 372 %assign i i+1
 373 %endrep
 374 %endmacro
 375
 376 %macro SIG_OFFSET 1
     ; %1 = 8x8 mode. In 8x8 mode, r11d = byte [r4+r6]: the significance-context offset
     ; of coefficient r6, read from the table CABAC_RESIDUAL_RD put in r4. Otherwise
     ; nothing is emitted, because the coefficient index itself is the offset.
 377 %if %1
 378     movzx  r11d, byte [r4+r6]
 379 %endif
 380 %endmacro
 381
 382 %macro LAST_OFFSET 1
     ; %1 = 8x8 mode. In 8x8 mode, r11d = last_coeff_flag_offset_8x8[r6]: the
     ; last-context offset of coefficient r6. Otherwise nothing is emitted.
     ; Relies on the GLOBAL define of the enclosing macro for PIC addressing.
 383 %if %1
 384     movzx  r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
 385 %endif
 386 %endmacro
 387
 388 ;-----------------------------------------------------------------------------
 389 ; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
 390 ;                                                   int ctx_block_cat, x264_cabac_t *cb );
 391 ;-----------------------------------------------------------------------------
 392
 393 ;%1 = 8x8 mode
     ;%2 = coeff_last function-pointer table to use (indexed by ctx_block_cat)
     ;
     ; Rate-only residual coding: walks the coefficients from the last nonzero one
     ; down to index 0 and adds the cost of each decision (cabac_entropy /
     ; cabac_size_unary) to cb.bits_encoded while updating the context states.
     ; No bitstream bytes are written.
     ; Args: r0 = dctcoef *l, r1 = b_interlaced, r2 = ctx_block_cat, r3 = cb
 394 %macro CABAC_RESIDUAL_RD 2
 395 %if %1
 396     %define func cabac_block_residual_8x8_rd_internal
 397     %define maxcoeffs 64
 398     %define dct rsp
 399 %else
 400     %define func cabac_block_residual_rd_internal
 401     %define maxcoeffs 16
 402     %define dct r4
 403 %endif
 404
 405 %ifdef PIC
     ; PIC: r12 holds the address of $$ and every table access goes through GLOBAL.
 406     cglobal func, 4,13
 407     lea     r12, [$$]
 408     %define GLOBAL +r12-$$
 409 %else
 410     cglobal func, 4,12
 411     %define GLOBAL
 412 %endif
 413
     ; Stack scratch: SIZEOF_DCTCOEF*maxcoeffs bytes for the abs() copy of the
     ; coefficients, adjusted by gprsize-(stack_offset&15).
     ; NOTE(review): the adjustment presumably keeps rsp 16-byte aligned for the mova
     ; stores in ABS_DCTCOEFS -- confirm.
 414 %assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
 415     SUB     rsp, pad
 416     shl     r1d, 4                                            ; MB_INTERLACED*16
 417 %if %1
 418     lea      r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]     ; r4 = sig offset 8x8 (read by SIG_OFFSET)
 419 %endif
 420     add     r1d, r2d
 421     movzx   r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL]    ; r5 = ctx_sig
 422     movzx   r7d, word [last_coeff_flag_offset+r1*2 GLOBAL]           ; r7 = ctx_last
 423     movzx   r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]        ; r8 = ctx_level
 424
 425 ; abs() all the coefficients; copy them to the stack to avoid
 426 ; changing the originals.
 427 ; overreading is okay; it's all valid aligned data anyways.
 428 %if %1
 429     ABS_DCTCOEFS r0, 64
 430 %else
 431     mov      r4, r0                                           ; r4 = dct
 432     mov      r6, ~SIZEOF_DCTCOEF
 433     and      r6, r4                                           ; handle AC coefficient case
 434     ABS_DCTCOEFS r6, 16
 435     sub      r4, r6                                           ; calculate our new dct pointer
 436     add      r4, rsp                                          ; restore AC coefficient offset
 437 %endif
 438     mov      r1, [%2+gprsize*r2 GLOBAL]
 439 ; for improved OOE performance, run coeff_last on the original coefficients.
 440     call     r1                                               ; coeff_last[ctx_block_cat]( dct )
 441 ; we know on 64-bit that the SSE2 versions of this function only
 442 ; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
 443 ; don't need r2 in 8x8 mode.
     ; From here on r6 = index of the last nonzero coefficient (coeff_last's return value).
 444     mov     r0d, [r3+cb.bits_encoded]                         ; r0 = cabac.f8_bits_encoded
 445 ; pre-add some values to simplify addressing
 446     add      r3, cb.state
 447     add      r5, r3
 448     add      r7, r3
 449     add      r8, r3                                           ; precalculate cabac state pointers
 450
 451 ; if( last != count_cat_m1[ctx_block_cat] )
 452 %if %1
 453     cmp     r6b, 63
 454 %else
 455     cmp     r6b, [count_cat_m1+r2 GLOBAL]
 456 %endif
 457     je .skip_last_sigmap
 458
 459 ; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
 460 ; so we'll use r11 for this.
 461 %if %1
 462     %define siglast_ctx r11
 463 %else
 464     %define siglast_ctx r6
 465 %endif
 466
 467 ; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
 468 ; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
     ; Cost-only form of encode_decision(ctx, 1): new state = cabac_transition[state*2+1],
     ; cost = cabac_entropy[state^1].
 469     SIG_OFFSET %1
 470     movzx   r1d, byte [r5+siglast_ctx]
 471     movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
 472     xor     r1d, 1
 473     movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
 474     mov [r5+siglast_ctx], r9b
 475     add     r0d, r1d
 476
 477     LAST_OFFSET %1
 478     movzx   r1d, byte [r7+siglast_ctx]
 479     movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
 480     xor     r1d, 1
 481     movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
 482     mov [r7+siglast_ctx], r9b
 483     add     r0d, r1d
 484 .skip_last_sigmap:
     ; Cost of the last coefficient's magnitude; node_ctx starts at 0 (second argument 1).
 485     LOAD_DCTCOEF r1d
 486     COEFF_ABS_LEVEL_GT1 last, 1
 487 ; for( int i = last-1 ; i >= 0; i-- )
 488     dec     r6d
 489     jl .end
 490 .coeff_loop:
 491     LOAD_DCTCOEF r1d
 492 ; if( l[i] )
 493     SIG_OFFSET %1
 494     movzx   r9d, byte [r5+siglast_ctx]
 495     test    r1d, r1d
 496     jnz .coeff_nonzero
 497 ; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
 498     movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
 499     movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
 500     mov [r5+siglast_ctx], r10b
 501     add     r0d, r9d
 502     dec     r6d
 503     jge .coeff_loop
 504     jmp .end
 505 .coeff_nonzero:
 506 ; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
 507     movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
 508     xor     r9d, 1
 509     movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
 510     mov [r5+siglast_ctx], r10b
 511     add     r0d, r9d
 512 ; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
 513     LAST_OFFSET %1
 514     movzx   r9d, byte [r7+siglast_ctx]
 515     movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
 516     movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
 517     mov [r7+siglast_ctx], r10b
 518     add     r0d, r9d
 519     COEFF_ABS_LEVEL_GT1 coeff, 0
 520     dec     r6d
 521     jge .coeff_loop
 522 .end:
     ; r3 was advanced by cb.state above, hence the -cb.state in the displacement.
 523     mov [r3+cb.bits_encoded-cb.state], r0d
 524     ADD     rsp, pad
 525     RET
 526 %endmacro
 527
 528 %if ARCH_X86_64
     ; Emit the 4x4 (first argument 0) and 8x8 (first argument 1) rate-only residual
     ; functions for each cpu flavour. The ssse3 builds reuse the sse2 coeff_last tables.
     ; NOTE(review): the ssse3 builds presumably differ only in how ABSW/ABSD expand --
     ; confirm in x86util.
 529 INIT_XMM sse2
 530 CABAC_RESIDUAL_RD 0, coeff_last_sse2
 531 CABAC_RESIDUAL_RD 1, coeff_last_sse2
 532 INIT_XMM sse2,lzcnt
 533 CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
 534 CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
 535 INIT_XMM ssse3
 536 CABAC_RESIDUAL_RD 0, coeff_last_sse2
 537 CABAC_RESIDUAL_RD 1, coeff_last_sse2
 538 INIT_XMM ssse3,lzcnt
 539 CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
 540 CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
 541 %endif
 542
 543 ;-----------------------------------------------------------------------------
 544 ; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
 545 ;                                                int ctx_block_cat, x264_cabac_t *cb );
 546 ;-----------------------------------------------------------------------------
 547
 548 %macro CALL_CABAC 0
     ; Call cabac_encode_decision for the current cpu flavour.
     ; Expects r0 = cb, r1d = ctx, r2d = bin value (set up by the caller).
     ; On WIN64 the callee keeps cb in r3 and uses r0 as its shift counter (see
     ; DECLARE_REG_TMP in CABAC), so r0 is reloaded from r3 afterwards.
 549 %if cpuflag(bmi2)
 550     call cabac_encode_decision_bmi2
 551 %else
 552     call cabac_encode_decision_asm
 553 %endif
 554 %if WIN64 ; move cabac back
 555     mov r0, r3
 556 %endif
 557 %endmacro
 558
 559 ; %1 = 8x8 mode
 560 ; %2 = dct register (scratch register that receives the current coefficient l[i])
 561 ; %3 = countcat (index at which the loop stops: count_cat_m1, or 63 in 8x8 mode)
 562 ; %4 = name (optional label infix so the two expansions do not clash)
     ;
     ; Encodes the significance map: for i = r10 upward, codes the significant-coefficient
     ; flag of l[i] and, for nonzero coefficients, the last-coefficient flag, pushing every
     ; nonzero coefficient onto the coeffs[] stack array (coeffidx is pre-incremented).
     ; Uses the sigoff*/lastoff*/lastm/coeffs/coeffidx*/dct defines of CABAC_RESIDUAL.
     ; The non-8x8 expansion ends by jumping to .level_loop_start; the 8x8 one falls through.
 563 %macro SIGMAP_LOOP 3-4
 564 .sigmap_%4loop:
 565 %if HIGH_BIT_DEPTH
 566     mov      %2, [dct+r10*4]
 567 %else
 568     movsx    %2, word [dct+r10*2]
 569 %endif
     ; r1d = context index of the significance flag for coefficient i
 570 %if %1
 571     movzx   r1d, byte [sigoff_8x8 + r10]
 572     add     r1d, sigoffd
 573 %else
 574     lea     r1d, [sigoffd + r10d]
 575 %endif
 576     test     %2, %2
 577     jz .sigmap_%4zero               ; if( l[i] )
 578     inc coeffidxd
 579     mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i];
 580     mov     r2d, 1
 581     CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
     ; r1d = context index of the last flag for coefficient i
 582 %if %1
 583     movzx   r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
 584     add     r1d, lastoffd
 585 %else
 586     lea     r1d, [lastoffd + r10d]
 587 %endif
 588     cmp    r10d, lastm              ; if( i == last )
 589     je .sigmap_%4last
 590     xor     r2d, r2d
 591     CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
 592     jmp .sigmap_%4loop_endcheck
 593 .sigmap_%4zero:
 594     xor     r2d, r2d
 595     CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
 596 .sigmap_%4loop_endcheck:
 597     inc    r10d
 598     cmp    r10d, %3
 599     jne .sigmap_%4loop              ; if( ++i == count_m1 )
     ; Reached i == %3 without hitting "last": the coefficient at that index is pushed
     ; without coding any sig/last flag for it.
 600 %if HIGH_BIT_DEPTH
 601     mov      %2, [dct+r10*4]
 602 %else
 603     movsx    %2, word [dct+r10*2]
 604 %endif
 605     inc coeffidxd
 606     mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i]
 607     jmp .sigmap_%4end
 608 .sigmap_%4last:                     ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
 609     mov     r2d, 1
 610     CALL_CABAC
 611 .sigmap_%4end:
 612 %if %1==0
 613     jmp .level_loop_start
 614 %endif
 615 %endmacro
 616
 617 %macro CABAC_RESIDUAL 1
     ; %1 = coeff_last function-pointer table to use (indexed by ctx_block_cat)
     ;
     ; Real (bitstream-writing) residual coding, in two phases:
     ;   1. SIGMAP_LOOP codes the significance map and collects the nonzero
     ;      coefficients in coeffs[] on the stack;
     ;   2. .level_loop pops them in reverse order and codes magnitude and sign.
     ; Args: r0 = dctcoef *l, r1 = b_interlaced, r2 = ctx_block_cat, r3 = cb
 618 cglobal cabac_block_residual_internal, 4,15
 619 %ifdef PIC
 620 ; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
     ; (with r7 taken by the PIC base, "last" lives on the stack instead)
 621     lea     r7, [$$]
 622     %define lastm [rsp+4*1]
 623     %define GLOBAL +r7-$$
 624 %else
 625     %define lastm r7d
 626     %define GLOBAL
 627 %endif
     ; Stack layout: [rsp+0] = leveloffm, [rsp+4] = lastm (PIC only),
     ;               [rsp+8...] = coeffs[64] dwords.
 628 %assign pad gprsize+4*2+4*64-(stack_offset&15)
 629     SUB     rsp, pad
 630     shl     r1d, 4
 631
 632     %define sigoffq r8
 633     %define sigoffd r8d
 634     %define lastoffq r9
 635     %define lastoffd r9d
 636     %define leveloffq r10
 637     %define leveloffd r10d
 638     %define leveloffm [rsp+4*0]
 639     %define countcatd r11d
 640     %define sigoff_8x8 r12
 641     %define coeffidxq r13
 642     %define coeffidxd r13d
 643     %define dct r14
 644     %define coeffs rsp+4*2
 645
 646     lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
 647     add     r1d, r2d
 648     movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
 649     movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
 650     movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
 651     movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
 652     mov coeffidxd, -1
 653     mov     dct, r0
     ; leveloff is parked on the stack because r10 doubles as the loop index i
     ; during the significance-map phase.
 654     mov leveloffm, leveloffd
 655
     ; last = coeff_last[ctx_block_cat]( l ); r0 still points at the coefficients here.
 656     mov      r1, [%1+gprsize*r2 GLOBAL]
 657     call     r1
 658     mov   lastm, eax
 659 ; put cabac in r0; needed for cabac_encode_decision
 660     mov      r0, r3
 661
 662     xor    r10d, r10d
 663     cmp countcatd, 63
 664     je .sigmap_8x8
     ; Non-8x8: r12 (sigoff_8x8) is unused and serves as the coefficient temp.
     ; 8x8: the count is the constant 63, so r11 (countcat) serves as the temp instead.
 665     SIGMAP_LOOP 0, r12d, countcatd,
 666 .sigmap_8x8:
 667     SIGMAP_LOOP 1, r11d, 63, _8x8
 668 .level_loop_start:
 669 ; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
 670     %define nodectxq r8
 671     %define nodectxd r8d
 672     mov leveloffd, leveloffm
 673     xor nodectxd, nodectxd
 674 .level_loop:
     ; r11d = sign mask (0 or -1), r9d = abs(coeff) computed as (x + mask) ^ mask
 675     mov     r9d, [coeffs+coeffidxq*4]
 676     mov    r11d, r9d
 677     sar    r11d, 31
 678     add     r9d, r11d
 679     movzx   r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
 680     xor     r9d, r11d
 681     add     r1d, leveloffd
 682     cmp     r9d, 1
 683     jg .level_gt1
     ; abs <= 1: code a single 0 bin, then the sign
 684     xor     r2d, r2d
 685     CALL_CABAC
 686     movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
 687     jmp .level_sign
 688 .level_gt1:
 689     mov     r2d, 1
 690     CALL_CABAC
     ; r14d = context index for the remaining magnitude bins
     ; r12d = min(abs, 15) - 2 = number of 1 bins still to code
     ; (the mov between cmp and cmovl does not modify the flags)
 691     movzx  r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
 692     add    r14d, leveloffd
 693     cmp     r9d, 15
 694     mov    r12d, 15
 695     cmovl  r12d, r9d
 696     sub    r12d, 2
 697     jz .level_eq2
 698 .level_gt1_loop:
 699     mov     r1d, r14d
 700     mov     r2d, 1
 701     CALL_CABAC
 702     dec    r12d
 703     jg .level_gt1_loop
 704     cmp     r9d, 15
 705     jge .level_bypass
 706 .level_eq2:
     ; abs < 15: terminate the run of 1 bins with a 0 bin
 707     mov     r1d, r14d
 708     xor     r2d, r2d
 709     CALL_CABAC
 710     jmp .level_gt1_end
 711 .level_bypass:
     ; abs >= 15: code the remainder with cabac_encode_ue_bypass( cb, 0, abs-15 )
 712     lea     r2d, [r9d-15]
 713     xor     r1d, r1d
 714     push     r0
 715 ; we could avoid this if we implemented it in asm, but I don't feel like that
 716 ; right now.
     ; r0 (cb) is saved in both cases, plus r7 and r8 (nodectx) on UNIX64, around the call.
     ; NOTE(review): presumably r7/r8 map to registers the callee preserves on WIN64,
     ; so only r0 needs saving there -- confirm against x86inc's register mapping.
 717 %if UNIX64
 718     push     r7
 719     push     r8
 720 %else
 721     sub      rsp, 32 ; shadow space
 722 %endif
 723     call cabac_encode_ue_bypass
 724 %if UNIX64
 725     pop      r8
 726     pop      r7
 727 %else
 728     add      rsp, 32
 729 %endif
 730     pop      r0
 731 .level_gt1_end:
 732     movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
 733 .level_sign:
     ; cabac_encode_bypass( cb, sign mask ): the callee ANDs the mask with cb.range
 734     mov     r1d, r11d
 735 %if cpuflag(bmi2)
 736     call cabac_encode_bypass_bmi2
 737 %else
 738     call cabac_encode_bypass_asm
 739 %endif
 740 %if WIN64
 741     mov      r0, r3
 742 %endif
     ; pop the next stored coefficient; the loop ends once coeffidx drops below 0
 743     dec coeffidxd
 744     jge .level_loop
 745     ADD     rsp, pad
 746     RET
 747 %endmacro
 748
 749 %if ARCH_X86_64
     ; Emit cabac_block_residual_internal for each cpu flavour. The avx2,bmi2 build
     ; calls the *_bmi2 encode_decision/encode_bypass routines (selected by
     ; cpuflag(bmi2)); the other two call the *_asm ones.
 750 INIT_XMM sse2
 751 CABAC_RESIDUAL coeff_last_sse2
 752 INIT_XMM sse2,lzcnt
 753 CABAC_RESIDUAL coeff_last_sse2_lzcnt
 754 INIT_XMM avx2,bmi2
 755 CABAC_RESIDUAL coeff_last_avx2_lzcnt
 756 %endif