1 ;*****************************************************************************
2 ;* trellis-64.asm: x86_64 trellis quantization
3 ;*****************************************************************************
4 ;* Copyright (C) 2012-2016 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at licensing@x264.com.
24 ;*****************************************************************************
26 ; This is a pretty straight-forward translation of the C code, except:
27 ; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
28 ; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
29 ; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those
31 ; * Interprocedural register allocation. Eliminates argument-passing overhead
32 ; to trellis_coef* subroutines. Also reduces codesize.
34 ; Optimizations that I tried, and rejected because they were not faster:
35 ; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
36 ; Costs too much icache compared to the negligible speedup.
37 ; * There are only 21 possible sets of live node_ctxs; we could keep track of
38 ; exactly which set we're in and feed that (along with abs_level) into a jump
39 ; table instead of the switch to select a trellis_coef subroutine. This would
40 ; eliminate all branches about which node_ctxs are live, but costs either a
41 ; bunch of icache or a bunch of call/ret, and the jump table itself is
43 ; * Separate versions of trellis_coef* depending on whether we're doing the 1st
44 ; or the 2nd of the two abs_level candidates. This would eliminate some
45 ; branches about if(score is better).
46 ; * Special case more values of coef. I had a coef2 at some intermediate point
47 ; in the optimization process, but it didn't end up worthwhile in conjunction
48 ; with all the other optimizations.
49 ; * Unroll or simd writeback. I don't know why this didn't help.
52 %include "x86util.asm"
57 pd_m16: times 4 dd -16
58 pd_0123: dd 0, 1, 2, 3
59 pd_4567: dd 4, 5, 6, 7
61 pq_128: times 2 dq 128
62 pq_ffffffff: times 2 dq 0xffffffff
65 cextern cabac_transition
66 cextern cabac_size_unary
67 cextern cabac_transition_unary
68 cextern dct4_weight_tab
69 cextern dct8_weight_tab
70 cextern dct4_weight2_tab
71 cextern dct8_weight2_tab
72 cextern last_coeff_flag_offset_8x8
73 cextern significant_coeff_flag_offset_8x8
74 cextern coeff_flag_offset_chroma_422_dc
78 %define TRELLIS_SCORE_BIAS 1<<60
79 %define SIZEOF_NODE 16
80 %define CABAC_SIZE_BITS 8
83 %macro SQUARE 2 ; dst, tmp
84 ; could use pmuldq here, to eliminate the abs. but that would involve
85 ; templating a sse4 version of all of trellis, for negligible speedup.
95 pand m%1, [pq_ffffffff]
99 %macro LOAD_DUP 2 ; dst, src
108 ;-----------------------------------------------------------------------------
109 ; int trellis_cabac_4x4_psy(
110 ; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
111 ; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
112 ; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
113 ; uint64_t level_state0, uint16_t level_state1,
114 ; int b_ac, dctcoef *fenc_dct, int psy_trellis )
115 ;-----------------------------------------------------------------------------
121 %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
122 %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
124 DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
126 %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
128 %define level_statem rsp+stack_offset+32
130 %define b_acm r11m ; 4x4 only
131 %define b_interlacedm r11m ; 8x8 only
132 %define i_coefsm1 r11m ; dc only
133 %define fenc_dctm r12m
134 %define psy_trellism r13m
136 shl dword b_interlacedm, 6
137 %define dct_weight1_tab dct8_weight_tab
138 %define dct_weight2_tab dct8_weight2_tab
140 %define dct_weight1_tab dct4_weight_tab
141 %define dct_weight2_tab dct4_weight2_tab
145 %define last_nnzm [stack+0]
146 %define zigzagm [stack+8]
150 %define orig_coefsm [stack+16]
151 %define quant_coefsm [stack+24]
152 mov orig_coefsm, orig_coefsq
153 mov quant_coefsm, quant_coefsq
155 %define unquant_mfm [stack+32]
156 %define levelgt1_ctxm [stack+40]
158 %define cost_siglast stack+80
159 %define level_tree stack+96
161 ; trellis_node_t is layed out differently than C.
162 ; struct-of-arrays rather than array-of-structs, for simd.
163 %define nodes_curq r7
164 %define nodes_prevq r8
165 %define node_score(x) x*8
166 %define node_level_idx(x) 64+x*4
167 %define node_cabac_state(x) 96+x*4
168 lea nodes_curq, [level_tree + level_tree_size]
169 lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
170 mov r6, TRELLIS_SCORE_BIAS
171 mov [nodes_curq + node_score(0)], r6
172 mov dword [nodes_curq + node_level_idx(0)], 0
173 movd mm0, [level_statem + 0]
174 punpcklbw mm0, [level_statem + 4]
175 punpcklwd mm0, [level_statem + 8]
176 %define level_state_packed mm0 ; version for copying into node.cabac_state
177 pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
178 movq [nodes_curq + node_score(1)], m7
179 mova [nodes_curq + node_score(2)], m7
181 %define levels_usedq r4
182 %define levels_usedd r4d
183 mov dword [level_tree], 0
186 %define abs_levelq r9
187 %define abs_leveld r9d
188 %define abs_coefq r14
193 mov dword levelgt1_ctxm, 8
195 mov dword levelgt1_ctxm, 9
198 LOAD_DUP m6, psy_trellism
199 %define psy_trellis m6
201 LOAD_DUP m6, [unquant_mfq]
203 %define unquant_mf m6
207 mov unquant_mfm, unquant_mfq
209 ; Keep a single offset register to PICify all global constants.
210 ; They're all relative to "beginning of this asm file's .text section",
211 ; even tables that aren't in this file.
212 ; (Any address in .text would work, this one was just convenient.)
214 %define GLOBAL +r0-$$
219 TRELLIS_LOOP 0 ; node_ctx 0..3
220 TRELLIS_LOOP 1 ; node_ctx 1..7
223 ; int level = bnode->level_idx;
224 ; for( int i = b_ac; i <= last_nnz; i++ )
225 ; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
226 ; level = level_tree[level].next;
230 %if num_coefs == 16 && dc == 0
235 mov r0d, [nodes_curq + node_level_idx(0) + rax*4]
237 movzx r2, byte [zigzagq + iiq]
239 movd m0, [level_tree + r0*4]
240 movzx r0, word [level_tree + r0*4]
242 movd m1, [dctq + r2*SIZEOF_DCTCOEF]
245 movd [dctq + r2*SIZEOF_DCTCOEF], m0
249 mov [dctq + r2*SIZEOF_DCTCOEF], r4w
252 mov r5d, [level_tree + r0*4]
254 mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
256 movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
264 mov [dctq + r2*SIZEOF_DCTCOEF], r5d
266 mov [dctq + r2*SIZEOF_DCTCOEF], r5w
277 %if num_coefs == 16 && dc == 0
292 %macro TRELLIS_LOOP 1 ; ctx_hi
294 ; if( !quant_coefs[i] )
297 mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
299 movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
302 ; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
303 ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
304 mov r10, cabac_state_sigm
306 mov r6d, b_interlacedm
309 movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
311 movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
313 movzx r10, byte [r10 + r6]
315 movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
316 movzx r10, byte [r10 + r13]
318 movzx r10, byte [r10 + iiq]
321 test abs_leveld, abs_leveld
322 jnz %%.nonzero_quant_coef
325 ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
326 ; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
327 ; nodes_cur[0].score -= cost_sig0;
328 movzx r10, word [cabac_entropy + r10*2 GLOBAL]
330 shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
331 sub [nodes_curq + node_score(0)], r10
333 ZERO_LEVEL_IDX %1, cur
336 %%.nonzero_quant_coef:
337 ; int sign_coef = orig_coefs[zigzag[i]];
338 ; int abs_coef = abs( sign_coef );
339 ; int q = abs( quant_coefs[i] );
340 movzx zigzagid, byte [zigzagq+iiq]
344 LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
346 LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
347 psrad m1, 16 ; sign_coef
349 punpcklqdq m0, m0 ; quant_coef
352 pabsd m2, m1 ; abs_coef
355 pcmpgtd m8, m1 ; sign_mask
361 psubd m0, [sq_1] ; abs_level
364 xchg nodes_curq, nodes_prevq
366 ; if( i < num_coefs-1 )
367 ; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
368 ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i
369 ; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
370 ; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
371 ; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
372 ; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
374 %if dc && num_coefs != 8
381 movzx r11, word [cabac_entropy + r10*2 GLOBAL]
383 movzx r12, word [cabac_entropy + r10*2 GLOBAL]
384 mov [cost_siglast+0], r11d
385 mov r10, cabac_state_lastm
387 movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
388 movzx r10, byte [r10 + r6]
390 movzx r10, byte [r10 + r13]
392 movzx r10, byte [r10 + iiq]
394 movzx r11, word [cabac_entropy + r10*2 GLOBAL]
396 mov [cost_siglast+4], r11d
399 movzx r10, word [cabac_entropy + r10*2 GLOBAL]
401 mov [cost_siglast+8], r10d
405 ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
406 ; int d = abs_coef - unquant_abs_level;
407 ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
409 pmuludq m0, unquant_mf
413 LOAD_DUP m3, [r10 + zigzagiq*4]
415 LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
420 psrld m0, 8 ; unquant_abs_level
429 LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
436 ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
437 ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
438 ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
439 ; ssd1[k] -= psy_weight * psy_value;
442 LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
444 LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
445 psrad m3, 16 ; orig_coef
448 psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
452 psubd m3, m1 ; predicted_coef
460 LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
461 pmuludq m1, psy_trellis
472 %if dc == 0 && %1 == 0
474 jnz %%.skip_dc_rounding
476 ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
477 ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
478 ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
479 psrad m1, 31 ; sign_coef>>31
482 pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
491 %assign stack_offset_bak stack_offset
495 mov r10, [ssd] ; trellis_coef* args
498 ; for( int j = 0; j < 8; j++ )
499 ; nodes_cur[j].score = TRELLIS_SCORE_MAX;
501 mova [nodes_curq + node_score(0)], m7
502 mova [nodes_curq + node_score(2)], m7
503 %else ; avoid store-forwarding stalls on k8/k10
505 movq [nodes_curq + node_score(0)], m7
507 movq [nodes_curq + node_score(1)], m7
508 movq [nodes_curq + node_score(2)], m7
509 movq [nodes_curq + node_score(3)], m7
511 mova [nodes_curq + node_score(4)], m7
512 mova [nodes_curq + node_score(6)], m7
515 call trellis_coefn.entry%1
516 call trellis_coefn.entry%1b
519 call trellis_coef1.entry%1
520 call trellis_coefn.entry%1b
523 call trellis_coef0_%1
524 call trellis_coef1.entry%1b
528 %if num_coefs == 16 && dc == 0
533 call trellis_bnode_%1
535 %if num_coefs == 16 && dc == 0
544 mov [cost_siglast+0], r6
545 mov [cost_siglast+8], r6d
548 %endmacro ; TRELLIS_LOOP
550 ; just a synonym for %if
557 %macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
558 ; for( int j = 0; j < 8; j++ )
559 ; nodes_cur[j].level_idx = levels_used;
560 ; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
563 and levels_usedd, ~3 ; allow aligned stores
564 movd m0, levels_usedd
568 IF%1 paddd m1, [pd_4567]
569 mova m2, [nodes_%2q + node_level_idx(0)]
570 IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
571 mova [nodes_curq + node_level_idx(0)], m0
572 IF%1 mova [nodes_curq + node_level_idx(4)], m1
573 mova [level_tree + (levels_usedq+0)*4], m2
574 IF%1 mova [level_tree + (levels_usedq+4)*4], m3
575 add levels_usedd, (1+%1)*4
579 TRELLIS trellis_cabac_4x4, 16, 0, 0
580 TRELLIS trellis_cabac_8x8, 64, 0, 0
581 TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
582 TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
583 TRELLIS trellis_cabac_dc, 16, 1, 0
584 TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
586 TRELLIS trellis_cabac_4x4, 16, 0, 0
587 TRELLIS trellis_cabac_8x8, 64, 0, 0
588 TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
589 TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
590 TRELLIS trellis_cabac_dc, 16, 1, 0
591 TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
595 %define stack rsp+gprsize
603 global mangle(x264_%1)
606 %assign stack_offset stack_offset_bak+gprsize
609 %macro TRELLIS_BNODE 1 ; ctx_hi
610 clocal trellis_bnode_%1
611 ; int j = ctx_hi?1:0;
612 ; trellis_node_t *bnode = &nodes_cur[j];
613 ; while( ++j < (ctx_hi?8:4) )
614 ; if( nodes_cur[j].score < bnode->score )
615 ; bnode = &nodes_cur[j];
617 mov rax, [nodes_curq + node_score(j)]
621 mov r11, [nodes_curq + node_score(j)]
629 %endmacro ; TRELLIS_BNODE
634 %macro TRELLIS_COEF0 1 ; ctx_hi
635 clocal trellis_coef0_%1
636 ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
637 mov r11d, [cost_siglast+0]
639 shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
642 ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
643 mov scoreq, [nodes_prevq + node_score(0)]
646 mov [nodes_curq + node_score(0)], scoreq
649 mov scoreq, [nodes_prevq + node_score(1)]
650 mov [nodes_curq + node_score(1)], scoreq
651 mova m1, [nodes_prevq + node_score(2)]
652 mova [nodes_curq + node_score(2)], m1
654 mova m1, [nodes_prevq + node_score(4)]
655 mova [nodes_curq + node_score(4)], m1
656 mova m1, [nodes_prevq + node_score(6)]
657 mova [nodes_curq + node_score(6)], m1
659 mov r6d, [nodes_prevq + node_cabac_state(3)]
660 mov [nodes_curq + node_cabac_state(3)], r6d
662 mova m1, [nodes_prevq + node_cabac_state(4)]
663 mova [nodes_curq + node_cabac_state(4)], m1
665 ZERO_LEVEL_IDX %1, prev
667 %endmacro ; TRELLIS_COEF0
673 %macro START_COEF 1 ; gt1
674 ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
675 mov scoreq, [nodes_prevq + node_score(j)]
678 js .ctx %+ nextj_if_invalid
681 ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
683 movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
684 movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
686 movzx r6d, byte [level_statem + coeff_abs_level1_offs]
691 movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
694 ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
697 add bitsd, [cost_siglast+8]
700 add bitsd, [cost_siglast+4]
702 %endmacro ; START_COEF
705 ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
707 shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
710 ; if( n.score < nodes_cur[node_ctx].score )
711 ; SET_LEVEL( n, abs_level );
712 ; nodes_cur[node_ctx] = n;
713 cmp scoreq, [nodes_curq + node_score(node_ctx)]
714 jae .ctx %+ nextj_if_valid
715 mov [nodes_curq + node_score(node_ctx)], scoreq
716 %if j == 2 || (j <= 3 && node_ctx == 4)
717 ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
718 movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
720 ; if we have updated before, then copy cabac_state from the parent node
721 mov r6d, [nodes_prevq + node_cabac_state(j)]
722 mov [nodes_curq + node_cabac_state(node_ctx)], r6d
724 %if j >= 3 ; skip the transition if we're not going to reuse the context
725 mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
727 %if %1 && node_ctx == 7
728 mov r6d, levelgt1_ctxm
729 mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
731 mov r6d, [nodes_prevq + node_level_idx(j)]
739 mov [level_tree + levels_usedq*4], r6d
740 mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
748 %assign nextj_if_valid %1+1
749 %assign nextj_if_invalid %2
751 %assign coeff_abs_level1_offs j+1
753 %assign coeff_abs_level1_offs 0
762 add bitsd, 1 << CABAC_SIZE_BITS
768 %assign nextj_if_valid %2
769 %assign nextj_if_invalid %2
771 %assign coeff_abs_level1_offs j+1
772 %assign coeff_abs_levelgt1_offs 5
774 %assign coeff_abs_level1_offs 0
775 %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
786 ; if( abs_level >= 15 )
787 ; bits += bs_size_ue_big(...)
788 add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
789 ; n.cabac_state[levelgt1_ctx]
790 %if j == 7 ; && compiling support for 4:2:2
791 mov r6d, levelgt1_ctxm
792 %define coeff_abs_levelgt1_offs r6
795 movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
797 movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
799 ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
801 movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
804 movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL]
812 .entry0b: ; ctx_lo, larger of the two abs_level candidates
817 .entry0: ; ctx_lo, smaller of the two abs_level candidates
824 .entry1b: ; ctx_hi, larger of the two abs_level candidates
827 .entry1: ; ctx_hi, smaller of the two abs_level candidates
839 %macro COEFN_PREFIX 1
840 ; int prefix = X264_MIN( abs_level - 1, 14 );
845 .skip_level_suffix%1:
849 %macro COEFN_SUFFIX 1
851 ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
852 lea r5d, [abs_levelq-14]
854 shl r5d, CABAC_SIZE_BITS+1
855 add r5d, 1<<CABAC_SIZE_BITS
856 ; int prefix = X264_MIN( abs_level - 1, 14 );
858 jmp .skip_level_suffix%1
867 ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
868 ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
869 ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
870 ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
871 ; The C version has to be fully separate since C doesn't support multiple
872 ; entrypoints. But return-on-first-failure isn't very important here (as
873 ; opposed to coef1), so I might as well reduce codesize.
880 mov zigzagq, zigzagm ; unspill since r1 was clobbered