git.sesse.net Git - x264/blob - common/x86/trellis-64.asm

   1 ;*****************************************************************************
   2 ;* trellis-64.asm: x86_64 trellis quantization
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2012-2016 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*
   8 ;* This program is free software; you can redistribute it and/or modify
   9 ;* it under the terms of the GNU General Public License as published by
  10 ;* the Free Software Foundation; either version 2 of the License, or
  11 ;* (at your option) any later version.
  12 ;*
  13 ;* This program is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 ;* GNU General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU General Public License
  19 ;* along with this program; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  21 ;*
  22 ;* This program is also available under a commercial proprietary license.
  23 ;* For more information, contact us at licensing@x264.com.
  24 ;*****************************************************************************
  25
  26 ; This is a pretty straight-forward translation of the C code, except:
  27 ; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
  28 ; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
  29 ;   4x parallel, handling 4 node_ctxs of the same coef (even if some of those
  30 ;   nodes are invalid).
  31 ; * Interprocedural register allocation. Eliminates argument-passing overhead
  32 ;   to trellis_coef* subroutines. Also reduces codesize.
  33
  34 ; Optimizations that I tried, and rejected because they were not faster:
  35 ; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
  36 ;   Costs too much icache compared to the negligible speedup.
  37 ; * There are only 21 possible sets of live node_ctxs; we could keep track of
  38 ;   exactly which set we're in and feed that (along with abs_level) into a jump
  39 ;   table instead of the switch to select a trellis_coef subroutine. This would
  40 ;   eliminate all branches about which node_ctxs are live, but costs either a
  41 ;   bunch of icache or a bunch of call/ret, and the jump table itself is
  42 ;   unpredictable.
  43 ; * Separate versions of trellis_coef* depending on whether we're doing the 1st
  44 ;   or the 2nd of the two abs_level candidates. This would eliminate some
  45 ;   branches about if(score is better).
  46 ; * Special case more values of coef. I had a coef2 at some intermediate point
  47 ;   in the optimization process, but it didn't end up worthwhile in conjunction
  48 ;   with all the other optimizations.
  49 ; * Unroll or simd writeback. I don't know why this didn't help.
  50
  51 %include "x86inc.asm"
  52 %include "x86util.asm"
  53
  54 SECTION_RODATA
  55
  56 pd_8: times 4 dd 8
  57 pd_m16: times 4 dd -16
  58 pd_0123: dd 0, 1, 2, 3
  59 pd_4567: dd 4, 5, 6, 7
  60 sq_1: dq 1, 0
  61 pq_128: times 2 dq 128
  62 pq_ffffffff: times 2 dq 0xffffffff
  63
  64 cextern cabac_entropy
  65 cextern cabac_transition
  66 cextern cabac_size_unary
  67 cextern cabac_transition_unary
  68 cextern dct4_weight_tab
  69 cextern dct8_weight_tab
  70 cextern dct4_weight2_tab
  71 cextern dct8_weight2_tab
  72 cextern last_coeff_flag_offset_8x8
  73 cextern significant_coeff_flag_offset_8x8
  74 cextern coeff_flag_offset_chroma_422_dc
  75
  76 SECTION .text
  77
  78 %define TRELLIS_SCORE_BIAS 1<<60
  79 %define SIZEOF_NODE 16
  80 %define CABAC_SIZE_BITS 8
  81 %define LAMBDA_BITS 4
  82
  83 %macro SQUARE 2 ; dst, tmp
  84     ; could use pmuldq here, to eliminate the abs. but that would involve
  85     ; templating a sse4 version of all of trellis, for negligible speedup.
  86 %if cpuflag(ssse3)
  87     pabsd   m%1, m%1
  88     pmuludq m%1, m%1
  89 %elif HIGH_BIT_DEPTH
  90     ABSD    m%2, m%1
  91     SWAP     %1, %2
  92     pmuludq m%1, m%1
  93 %else
  94     pmuludq m%1, m%1
  95     pand    m%1, [pq_ffffffff]
  96 %endif
  97 %endmacro
  98
  99 %macro LOAD_DUP 2 ; dst, src
 100 %if cpuflag(ssse3)
 101     movddup    %1, %2
 102 %else
 103     movd       %1, %2
 104     punpcklqdq %1, %1
 105 %endif
 106 %endmacro
 107
 108 ;-----------------------------------------------------------------------------
 109 ; int trellis_cabac_4x4_psy(
 110 ;     const int *unquant_mf, const uint8_t *zigzag, int lambda2,
 111 ;     int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
 112 ;     uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
 113 ;     uint64_t level_state0, uint16_t level_state1,
 114 ;     int b_ac, dctcoef *fenc_dct, int psy_trellis )
 115 ;-----------------------------------------------------------------------------
 116 %macro TRELLIS 4
 117 %define num_coefs %2
 118 %define dc %3
 119 %define psy %4
 120 cglobal %1, 4,15,9
 121     %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
 122     %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
 123     SUB  rsp, pad
 124     DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
 125 %if WIN64
 126     %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
 127 %else
 128     %define level_statem rsp+stack_offset+32
 129 %endif
 130     %define b_acm r11m ; 4x4 only
 131     %define b_interlacedm r11m ; 8x8 only
 132     %define i_coefsm1 r11m ; dc only
 133     %define fenc_dctm r12m
 134     %define psy_trellism r13m
 135 %if num_coefs == 64
 136     shl dword b_interlacedm, 6
 137     %define dct_weight1_tab dct8_weight_tab
 138     %define dct_weight2_tab dct8_weight2_tab
 139 %else
 140     %define dct_weight1_tab dct4_weight_tab
 141     %define dct_weight2_tab dct4_weight2_tab
 142 %endif
 143
 144     %define stack rsp
 145     %define last_nnzm [stack+0]
 146     %define zigzagm   [stack+8]
 147     mov     last_nnzm, iid
 148     mov     zigzagm,   zigzagq
 149 %if WIN64 == 0
 150     %define orig_coefsm  [stack+16]
 151     %define quant_coefsm [stack+24]
 152     mov     orig_coefsm,  orig_coefsq
 153     mov     quant_coefsm, quant_coefsq
 154 %endif
 155     %define unquant_mfm   [stack+32]
 156     %define levelgt1_ctxm [stack+40]
 157     %define ssd            stack+48
 158     %define cost_siglast   stack+80
 159     %define level_tree     stack+96
 160
 161     ; trellis_node_t is layed out differently than C.
 162     ; struct-of-arrays rather than array-of-structs, for simd.
 163     %define nodes_curq r7
 164     %define nodes_prevq r8
 165     %define node_score(x) x*8
 166     %define node_level_idx(x) 64+x*4
 167     %define node_cabac_state(x) 96+x*4
 168     lea nodes_curq, [level_tree + level_tree_size]
 169     lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
 170     mov        r6, TRELLIS_SCORE_BIAS
 171     mov       [nodes_curq + node_score(0)], r6
 172     mov dword [nodes_curq + node_level_idx(0)], 0
 173     movd      mm0, [level_statem + 0]
 174     punpcklbw mm0, [level_statem + 4]
 175     punpcklwd mm0, [level_statem + 8]
 176     %define level_state_packed mm0 ; version for copying into node.cabac_state
 177     pcmpeqb    m7, m7 ; TRELLIS_SCORE_MAX
 178     movq [nodes_curq + node_score(1)], m7
 179     mova [nodes_curq + node_score(2)], m7
 180
 181     %define levels_usedq r4
 182     %define levels_usedd r4d
 183     mov dword [level_tree], 0
 184     mov       levels_usedd, 1
 185
 186     %define abs_levelq r9
 187     %define abs_leveld r9d
 188     %define abs_coefq r14
 189     %define zigzagiq r5
 190     %define zigzagid r5d
 191
 192 %if num_coefs == 8
 193     mov dword levelgt1_ctxm, 8
 194 %else
 195     mov dword levelgt1_ctxm, 9
 196 %endif
 197 %if psy
 198     LOAD_DUP m6, psy_trellism
 199     %define psy_trellis m6
 200 %elif dc
 201     LOAD_DUP   m6, [unquant_mfq]
 202     paddd      m6, m6
 203     %define unquant_mf m6
 204 %endif
 205 %ifdef PIC
 206 %if dc == 0
 207     mov unquant_mfm, unquant_mfq
 208 %endif
 209     ; Keep a single offset register to PICify all global constants.
 210     ; They're all relative to "beginning of this asm file's .text section",
 211     ; even tables that aren't in this file.
 212     ; (Any address in .text would work, this one was just convenient.)
 213     lea r0, [$$]
 214     %define GLOBAL +r0-$$
 215 %else
 216     %define GLOBAL
 217 %endif
 218
 219     TRELLIS_LOOP 0 ; node_ctx 0..3
 220     TRELLIS_LOOP 1 ; node_ctx 1..7
 221
 222 .writeback:
 223     ; int level = bnode->level_idx;
 224     ; for( int i = b_ac; i <= last_nnz; i++ )
 225     ;     dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
 226     ;     level = level_tree[level].next;
 227     mov    iid, last_nnzm
 228     add zigzagq, iiq
 229     neg    iiq
 230 %if num_coefs == 16 && dc == 0
 231     mov    r2d, b_acm
 232     add    iiq, r2
 233 %endif
 234     %define dctq r10
 235     mov    r0d, [nodes_curq + node_level_idx(0) + rax*4]
 236 .writeback_loop:
 237     movzx   r2, byte [zigzagq + iiq]
 238 %if cpuflag(ssse3)
 239     movd    m0, [level_tree + r0*4]
 240     movzx   r0, word [level_tree + r0*4]
 241     psrld   m0, 16
 242     movd    m1, [dctq + r2*SIZEOF_DCTCOEF]
 243 %if HIGH_BIT_DEPTH
 244     psignd  m0, m1
 245     movd [dctq + r2*SIZEOF_DCTCOEF], m0
 246 %else
 247     psignw  m0, m1
 248     movd   r4d, m0
 249     mov  [dctq + r2*SIZEOF_DCTCOEF], r4w
 250 %endif
 251 %else
 252     mov    r5d, [level_tree + r0*4]
 253 %if HIGH_BIT_DEPTH
 254     mov    r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
 255 %else
 256     movsx  r4d, word [dctq + r2*SIZEOF_DCTCOEF]
 257 %endif
 258     movzx  r0d, r5w
 259     sar    r4d, 31
 260     shr    r5d, 16
 261     xor    r5d, r4d
 262     sub    r5d, r4d
 263 %if HIGH_BIT_DEPTH
 264     mov  [dctq + r2*SIZEOF_DCTCOEF], r5d
 265 %else
 266     mov  [dctq + r2*SIZEOF_DCTCOEF], r5w
 267 %endif
 268 %endif
 269     inc    iiq
 270     jle .writeback_loop
 271
 272     mov eax, 1
 273 .return:
 274     ADD rsp, pad
 275     RET
 276
 277 %if num_coefs == 16 && dc == 0
 278 .return_zero:
 279     pxor       m0, m0
 280     mova [r10+ 0], m0
 281     mova [r10+16], m0
 282 %if HIGH_BIT_DEPTH
 283     mova [r10+32], m0
 284     mova [r10+48], m0
 285 %endif
 286     jmp .return
 287 %endif
 288 %endmacro ; TRELLIS
 289
 290
 291
 292 %macro TRELLIS_LOOP 1 ; ctx_hi
 293 .i_loop%1:
 294     ; if( !quant_coefs[i] )
 295     mov   r6, quant_coefsm
 296 %if HIGH_BIT_DEPTH
 297     mov   abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
 298 %else
 299     movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
 300 %endif
 301
 302     ; int sigindex  = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
 303     ;                 num_coefs == 8  ? coeff_flag_offset_chroma_422_dc[i] : i;
 304     mov    r10, cabac_state_sigm
 305 %if num_coefs == 64
 306     mov    r6d, b_interlacedm
 307 %ifdef PIC
 308     add    r6d, iid
 309     movzx  r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
 310 %else
 311     movzx  r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
 312 %endif
 313     movzx  r10, byte [r10 + r6]
 314 %elif num_coefs == 8
 315     movzx  r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
 316     movzx  r10, byte [r10 + r13]
 317 %else
 318     movzx  r10, byte [r10 + iiq]
 319 %endif
 320
 321     test  abs_leveld, abs_leveld
 322     jnz %%.nonzero_quant_coef
 323
 324 %if %1 == 0
 325     ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
 326     ;               * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 327     ; nodes_cur[0].score -= cost_sig0;
 328     movzx  r10, word [cabac_entropy + r10*2 GLOBAL]
 329     imul   r10, lambda2q
 330     shr    r10, CABAC_SIZE_BITS - LAMBDA_BITS
 331     sub   [nodes_curq + node_score(0)], r10
 332 %endif
 333     ZERO_LEVEL_IDX %1, cur
 334     jmp .i_continue%1
 335
 336 %%.nonzero_quant_coef:
 337     ; int sign_coef = orig_coefs[zigzag[i]];
 338     ; int abs_coef = abs( sign_coef );
 339     ; int q = abs( quant_coefs[i] );
 340     movzx   zigzagid, byte [zigzagq+iiq]
 341     movd    m0, abs_leveld
 342     mov     r6, orig_coefsm
 343 %if HIGH_BIT_DEPTH
 344     LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
 345 %else
 346     LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
 347     psrad    m1, 16     ; sign_coef
 348 %endif
 349     punpcklqdq m0, m0 ; quant_coef
 350 %if cpuflag(ssse3)
 351     pabsd   m0, m0
 352     pabsd   m2, m1 ; abs_coef
 353 %else
 354     pxor    m8, m8
 355     pcmpgtd m8, m1 ; sign_mask
 356     pxor    m0, m8
 357     pxor    m2, m1, m8
 358     psubd   m0, m8
 359     psubd   m2, m8
 360 %endif
 361     psubd   m0, [sq_1] ; abs_level
 362     movd  abs_leveld, m0
 363
 364     xchg  nodes_curq, nodes_prevq
 365
 366     ; if( i < num_coefs-1 )
 367     ;     int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
 368     ;                     num_coefs == 8  ? coeff_flag_offset_chroma_422_dc[i] : i
 369     ;     cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
 370     ;     cost_sig1       = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
 371     ;     cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
 372     ;     cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
 373 %if %1 == 0
 374 %if dc && num_coefs != 8
 375     cmp    iid, i_coefsm1
 376 %else
 377     cmp    iid, num_coefs-1
 378 %endif
 379     je %%.zero_siglast
 380 %endif
 381     movzx  r11, word [cabac_entropy + r10*2 GLOBAL]
 382     xor    r10, 1
 383     movzx  r12, word [cabac_entropy + r10*2 GLOBAL]
 384     mov   [cost_siglast+0], r11d
 385     mov    r10, cabac_state_lastm
 386 %if num_coefs == 64
 387     movzx  r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
 388     movzx  r10, byte [r10 + r6]
 389 %elif num_coefs == 8
 390     movzx  r10, byte [r10 + r13]
 391 %else
 392     movzx  r10, byte [r10 + iiq]
 393 %endif
 394     movzx  r11, word [cabac_entropy + r10*2 GLOBAL]
 395     add    r11, r12
 396     mov   [cost_siglast+4], r11d
 397 %if %1 == 0
 398     xor    r10, 1
 399     movzx  r10, word [cabac_entropy + r10*2 GLOBAL]
 400     add    r10, r12
 401     mov   [cost_siglast+8], r10d
 402 %endif
 403 %%.skip_siglast:
 404
 405     ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
 406     ; int d = abs_coef - unquant_abs_level;
 407     ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
 408 %if dc
 409     pmuludq m0, unquant_mf
 410 %else
 411 %ifdef PIC
 412     mov    r10, unquant_mfm
 413     LOAD_DUP m3, [r10 + zigzagiq*4]
 414 %else
 415     LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
 416 %endif
 417     pmuludq m0, m3
 418 %endif
 419     paddd   m0, [pq_128]
 420     psrld   m0, 8 ; unquant_abs_level
 421 %if psy || dc == 0
 422     mova    m4, m0
 423 %endif
 424     psubd   m0, m2
 425     SQUARE   0, 3
 426 %if dc
 427     psllq   m0, 8
 428 %else
 429     LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
 430     pmuludq m0, m5
 431 %endif
 432
 433 %if psy
 434     test   iid, iid
 435     jz %%.dc_rounding
 436     ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
 437     ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
 438     ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
 439     ; ssd1[k] -= psy_weight * psy_value;
 440     mov     r6, fenc_dctm
 441 %if HIGH_BIT_DEPTH
 442     LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
 443 %else
 444     LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
 445     psrad   m3, 16 ; orig_coef
 446 %endif
 447 %if cpuflag(ssse3)
 448     psignd  m4, m1 ; SIGN(unquant_abs_level, sign_coef)
 449 %else
 450     PSIGN d, m4, m8
 451 %endif
 452     psubd   m3, m1 ; predicted_coef
 453     paddd   m4, m3
 454 %if cpuflag(ssse3)
 455     pabsd   m4, m4
 456 %else
 457     ABSD    m3, m4
 458     SWAP     4, 3
 459 %endif
 460     LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
 461     pmuludq m1, psy_trellis
 462     pmuludq m4, m1
 463     psubq   m0, m4
 464 %if %1
 465 %%.dc_rounding:
 466 %endif
 467 %endif
 468 %if %1 == 0
 469     mova [ssd], m0
 470 %endif
 471
 472 %if dc == 0 && %1 == 0
 473     test   iid, iid
 474     jnz %%.skip_dc_rounding
 475 %%.dc_rounding:
 476     ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
 477     ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
 478     ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
 479     psrad   m1, 31 ; sign_coef>>31
 480     paddd   m4, [pd_8]
 481     paddd   m4, m1
 482     pand    m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
 483     psubd   m4, m2 ; d
 484     SQUARE   4, 3
 485     pmuludq m4, m5
 486     mova [ssd], m4
 487 %%.skip_dc_rounding:
 488 %endif
 489     mova [ssd+16], m0
 490
 491     %assign stack_offset_bak stack_offset
 492     cmp abs_leveld, 1
 493     jl %%.switch_coef0
 494 %if %1 == 0
 495     mov    r10, [ssd] ; trellis_coef* args
 496 %endif
 497     movq   r12, m0
 498     ; for( int j = 0; j < 8; j++ )
 499     ;     nodes_cur[j].score = TRELLIS_SCORE_MAX;
 500 %if cpuflag(ssse3)
 501     mova [nodes_curq + node_score(0)], m7
 502     mova [nodes_curq + node_score(2)], m7
 503 %else ; avoid store-forwarding stalls on k8/k10
 504 %if %1 == 0
 505     movq [nodes_curq + node_score(0)], m7
 506 %endif
 507     movq [nodes_curq + node_score(1)], m7
 508     movq [nodes_curq + node_score(2)], m7
 509     movq [nodes_curq + node_score(3)], m7
 510 %endif
 511     mova [nodes_curq + node_score(4)], m7
 512     mova [nodes_curq + node_score(6)], m7
 513     je %%.switch_coef1
 514 %%.switch_coefn:
 515     call trellis_coefn.entry%1
 516     call trellis_coefn.entry%1b
 517     jmp .i_continue1
 518 %%.switch_coef1:
 519     call trellis_coef1.entry%1
 520     call trellis_coefn.entry%1b
 521     jmp .i_continue1
 522 %%.switch_coef0:
 523     call trellis_coef0_%1
 524     call trellis_coef1.entry%1b
 525
 526 .i_continue%1:
 527     dec iid
 528 %if num_coefs == 16 && dc == 0
 529     cmp iid, b_acm
 530 %endif
 531     jge .i_loop%1
 532
 533     call trellis_bnode_%1
 534 %if %1 == 0
 535 %if num_coefs == 16 && dc == 0
 536     jz .return_zero
 537 %else
 538     jz .return
 539 %endif
 540     jmp .writeback
 541
 542 %%.zero_siglast:
 543     xor  r6d, r6d
 544     mov [cost_siglast+0], r6
 545     mov [cost_siglast+8], r6d
 546     jmp %%.skip_siglast
 547 %endif
 548 %endmacro ; TRELLIS_LOOP
 549
 550 ; just a synonym for %if
 551 %macro IF0 1+
 552 %endmacro
 553 %macro IF1 1+
 554     %1
 555 %endmacro
 556
 557 %macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
 558     ; for( int j = 0; j < 8; j++ )
 559     ;     nodes_cur[j].level_idx = levels_used;
 560     ;     level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
 561     ;     levels_used++;
 562     add  levels_usedd, 3
 563     and  levels_usedd, ~3 ; allow aligned stores
 564     movd       m0, levels_usedd
 565     pshufd     m0, m0, 0
 566     IF%1 mova  m1, m0
 567          paddd m0, [pd_0123]
 568     IF%1 paddd m1, [pd_4567]
 569          mova  m2, [nodes_%2q + node_level_idx(0)]
 570     IF%1 mova  m3, [nodes_%2q + node_level_idx(4)]
 571          mova [nodes_curq + node_level_idx(0)], m0
 572     IF%1 mova [nodes_curq + node_level_idx(4)], m1
 573          mova [level_tree + (levels_usedq+0)*4], m2
 574     IF%1 mova [level_tree + (levels_usedq+4)*4], m3
 575     add  levels_usedd, (1+%1)*4
 576 %endmacro
 577
 578 INIT_XMM sse2
 579 TRELLIS trellis_cabac_4x4, 16, 0, 0
 580 TRELLIS trellis_cabac_8x8, 64, 0, 0
 581 TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
 582 TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
 583 TRELLIS trellis_cabac_dc, 16, 1, 0
 584 TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
 585 INIT_XMM ssse3
 586 TRELLIS trellis_cabac_4x4, 16, 0, 0
 587 TRELLIS trellis_cabac_8x8, 64, 0, 0
 588 TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
 589 TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
 590 TRELLIS trellis_cabac_dc, 16, 1, 0
 591 TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
 592
 593
 594
 595 %define stack rsp+gprsize
 596 %define scoreq r14
 597 %define bitsq r13
 598 %define bitsd r13d
 599
 600 INIT_XMM
 601 %macro clocal 1
 602     ALIGN 16
 603     global mangle(x264_%1)
 604     mangle(x264_%1):
 605     %1:
 606     %assign stack_offset stack_offset_bak+gprsize
 607 %endmacro
 608
 609 %macro TRELLIS_BNODE 1 ; ctx_hi
 610 clocal trellis_bnode_%1
 611     ; int j = ctx_hi?1:0;
 612     ; trellis_node_t *bnode = &nodes_cur[j];
 613     ; while( ++j < (ctx_hi?8:4) )
 614     ;     if( nodes_cur[j].score < bnode->score )
 615     ;         bnode = &nodes_cur[j];
 616 %assign j %1
 617     mov   rax, [nodes_curq + node_score(j)]
 618     lea   rax, [rax*8 + j]
 619 %rep 3+3*%1
 620 %assign j j+1
 621     mov   r11, [nodes_curq + node_score(j)]
 622     lea   r11, [r11*8 + j]
 623     cmp   rax, r11
 624     cmova rax, r11
 625 %endrep
 626     mov   r10, dctm
 627     and   eax, 7
 628     ret
 629 %endmacro ; TRELLIS_BNODE
 630 TRELLIS_BNODE 0
 631 TRELLIS_BNODE 1
 632
 633
 634 %macro TRELLIS_COEF0 1 ; ctx_hi
 635 clocal trellis_coef0_%1
 636     ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 637     mov  r11d, [cost_siglast+0]
 638     imul  r11, lambda2q
 639     shr   r11, CABAC_SIZE_BITS - LAMBDA_BITS
 640     add   r11, [ssd+16]
 641 %if %1 == 0
 642     ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
 643     mov  scoreq, [nodes_prevq + node_score(0)]
 644     add  scoreq, [ssd]
 645     sub  scoreq, r11
 646     mov  [nodes_curq + node_score(0)], scoreq
 647 %endif
 648     ; memcpy
 649     mov  scoreq, [nodes_prevq + node_score(1)]
 650     mov  [nodes_curq + node_score(1)], scoreq
 651     mova m1, [nodes_prevq + node_score(2)]
 652     mova [nodes_curq + node_score(2)], m1
 653 %if %1
 654     mova m1, [nodes_prevq + node_score(4)]
 655     mova [nodes_curq + node_score(4)], m1
 656     mova m1, [nodes_prevq + node_score(6)]
 657     mova [nodes_curq + node_score(6)], m1
 658 %endif
 659     mov  r6d, [nodes_prevq + node_cabac_state(3)]
 660     mov  [nodes_curq + node_cabac_state(3)], r6d
 661 %if %1
 662     mova m1, [nodes_prevq + node_cabac_state(4)]
 663     mova [nodes_curq + node_cabac_state(4)], m1
 664 %endif
 665     ZERO_LEVEL_IDX %1, prev
 666     ret
 667 %endmacro ; TRELLIS_COEF0
 668 TRELLIS_COEF0 0
 669 TRELLIS_COEF0 1
 670
 671
 672
 673 %macro START_COEF 1 ; gt1
 674     ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
 675     mov  scoreq, [nodes_prevq + node_score(j)]
 676 %if j > 0
 677     test scoreq, scoreq
 678     js .ctx %+ nextj_if_invalid
 679 %endif
 680
 681     ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
 682 %if j >= 3
 683     movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
 684     movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
 685 %else
 686     movzx r6d, byte [level_statem + coeff_abs_level1_offs]
 687 %endif
 688 %if %1
 689     xor   r6d, 1
 690 %endif
 691     movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
 692
 693     ; n.score += ssd;
 694     ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
 695 %if j == 0
 696     add  scoreq, r10
 697     add  bitsd, [cost_siglast+8]
 698 %else
 699     add  scoreq, r12
 700     add  bitsd, [cost_siglast+4]
 701 %endif
 702 %endmacro ; START_COEF
 703
 704 %macro END_COEF 1
 705     ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 706     imul bitsq, lambda2q
 707     shr  bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
 708     add  scoreq, bitsq
 709
 710     ; if( n.score < nodes_cur[node_ctx].score )
 711     ;     SET_LEVEL( n, abs_level );
 712     ;     nodes_cur[node_ctx] = n;
 713     cmp scoreq, [nodes_curq + node_score(node_ctx)]
 714     jae .ctx %+ nextj_if_valid
 715     mov [nodes_curq + node_score(node_ctx)], scoreq
 716 %if j == 2 || (j <= 3 && node_ctx == 4)
 717     ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
 718     movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
 719 %elif j >= 3
 720     ; if we have updated before, then copy cabac_state from the parent node
 721     mov  r6d, [nodes_prevq + node_cabac_state(j)]
 722     mov [nodes_curq + node_cabac_state(node_ctx)], r6d
 723 %endif
 724 %if j >= 3 ; skip the transition if we're not going to reuse the context
 725     mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
 726 %endif
 727 %if %1 && node_ctx == 7
 728     mov  r6d, levelgt1_ctxm
 729     mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
 730 %endif
 731     mov  r6d, [nodes_prevq + node_level_idx(j)]
 732 %if %1
 733     mov r11d, abs_leveld
 734     shl r11d, 16
 735     or   r6d, r11d
 736 %else
 737     or   r6d, 1<<16
 738 %endif
 739     mov [level_tree + levels_usedq*4], r6d
 740     mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
 741     inc levels_usedd
 742 %endmacro ; END_COEF
 743
 744
 745
 746 %macro COEF1 2
 747     %assign j %1
 748     %assign nextj_if_valid %1+1
 749     %assign nextj_if_invalid %2
 750 %if j < 4
 751     %assign coeff_abs_level1_offs j+1
 752 %else
 753     %assign coeff_abs_level1_offs 0
 754 %endif
 755 %if j < 3
 756     %assign node_ctx j+1
 757 %else
 758     %assign node_ctx j
 759 %endif
 760 .ctx %+ j:
 761     START_COEF 0
 762     add  bitsd, 1 << CABAC_SIZE_BITS
 763     END_COEF 0
 764 %endmacro ; COEF1
 765
 766 %macro COEFN 2
 767     %assign j %1
 768     %assign nextj_if_valid %2
 769     %assign nextj_if_invalid %2
 770 %if j < 4
 771     %assign coeff_abs_level1_offs j+1
 772     %assign coeff_abs_levelgt1_offs 5
 773 %else
 774     %assign coeff_abs_level1_offs 0
 775     %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
 776 %endif
 777 %if j < 4
 778     %assign node_ctx 4
 779 %elif j < 7
 780     %assign node_ctx j+1
 781 %else
 782     %assign node_ctx 7
 783 %endif
 784 .ctx %+ j:
 785     START_COEF 1
 786     ; if( abs_level >= 15 )
 787     ;     bits += bs_size_ue_big(...)
 788     add  bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
 789     ; n.cabac_state[levelgt1_ctx]
 790 %if j == 7 ; && compiling support for 4:2:2
 791     mov    r6d, levelgt1_ctxm
 792     %define coeff_abs_levelgt1_offs r6
 793 %endif
 794 %if j == 7
 795     movzx  r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
 796 %else
 797     movzx  r10, byte [level_statem + coeff_abs_levelgt1_offs]
 798 %endif
 799     ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
 800     add   r10d, r1d
 801     movzx  r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
 802     add  bitsd, r6d
 803 %if node_ctx == 7
 804     movzx  r10, byte [cabac_transition_unary + r10-128 GLOBAL]
 805 %endif
 806     END_COEF 1
 807 %endmacro ; COEFN
 808
 809
 810
 811 clocal trellis_coef1
 812 .entry0b: ; ctx_lo, larger of the two abs_level candidates
 813     mov  r10, [ssd+8]
 814     sub  r10, r11
 815     mov  r12, [ssd+24]
 816     sub  r12, r11
 817 .entry0: ; ctx_lo, smaller of the two abs_level candidates
 818     COEF1 0, 4
 819     COEF1 1, 4
 820     COEF1 2, 4
 821     COEF1 3, 4
 822 .ctx4:
 823     rep ret
 824 .entry1b: ; ctx_hi, larger of the two abs_level candidates
 825     mov  r12, [ssd+24]
 826     sub  r12, r11
 827 .entry1: ; ctx_hi, smaller of the two abs_level candidates
 828 trellis_coef1_hi:
 829     COEF1 1, 2
 830     COEF1 2, 3
 831     COEF1 3, 4
 832     COEF1 4, 5
 833     COEF1 5, 6
 834     COEF1 6, 7
 835     COEF1 7, 8
 836 .ctx8:
 837     rep ret
 838
 839 %macro COEFN_PREFIX 1
 840     ; int prefix = X264_MIN( abs_level - 1, 14 );
 841     mov  r1d, abs_leveld
 842     cmp  abs_leveld, 15
 843     jge .level_suffix%1
 844     xor  r5d, r5d
 845 .skip_level_suffix%1:
 846     shl  r1d, 7
 847 %endmacro
 848
 849 %macro COEFN_SUFFIX 1
 850 .level_suffix%1:
 851     ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
 852     lea  r5d, [abs_levelq-14]
 853     bsr  r5d, r5d
 854     shl  r5d, CABAC_SIZE_BITS+1
 855     add  r5d, 1<<CABAC_SIZE_BITS
 856     ; int prefix = X264_MIN( abs_level - 1, 14 );
 857     mov  r1d, 15
 858     jmp .skip_level_suffix%1
 859 %endmacro
 860
 861 clocal trellis_coefn
 862 .entry0b:
 863     mov  r10, [ssd+8]
 864     mov  r12, [ssd+24]
 865     inc  abs_leveld
 866 .entry0:
 867     ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
 868     ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
 869     ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
 870     ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
 871     ; The C version has to be fully separate since C doesn't support multiple
 872     ; entrypoints. But return-on-first-failure isn't very important here (as
 873     ; opposed to coef1), so I might as well reduce codesize.
 874     COEFN_PREFIX 0
 875     COEFN 0, 1
 876     COEFN 1, 2
 877     COEFN 2, 3
 878     COEFN 3, 8
 879 .ctx8:
 880     mov zigzagq, zigzagm ; unspill since r1 was clobbered
 881     ret
 882 .entry1b:
 883     mov  r12, [ssd+24]
 884     inc  abs_leveld
 885 .entry1:
 886     COEFN_PREFIX 1
 887     COEFN 4, 5
 888     COEFN 5, 6
 889     COEFN 6, 7
 890     COEFN 7, 1
 891     jmp .ctx1
 892     COEFN_SUFFIX 0
 893     COEFN_SUFFIX 1