1 ;*****************************************************************************
2 ;* cabac-a.asm: x86 cabac
3 ;*****************************************************************************
4 ;* Copyright (C) 2008-2015 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
29 %include "x86util.asm"
; Context-selection tables for CABAC coefficient-level coding.
; coeff_abs_level1_ctx[node_ctx]:   ctx offset for the "abs level > 1" decision
;                                   (indexed by nodectx below, see the level loop).
; coeff_abs_levelgt1_ctx[node_ctx]: ctx offset once a level > 1 has been coded.
; coeff_abs_level_transition:       node_ctx state machine; the second row
;                                   (accessed at +8 below) is the transition
;                                   taken after coding a level > 1.
33 coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0
34 coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9
35 coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
36 db 4, 4, 4, 4, 5, 6, 7, 7
; Builds a table of pointers to the coeff_last* functions, one entry per
; ctx_block_cat. %1-%3 select the cpu-variant suffix for each size class
; (funccpu1/2/3); %4.. are the per-category sizes (4/15/16/64).
; NOTE(review): macro body is elided here (no %endmacro visible) — the loop
; selecting funccpu1/2/3 per size is in the omitted lines; confirm in full source.
39 %macro COEFF_LAST_TABLE 17
45 dq mangle(x264_coeff_last%4_ %+ funccpu1)
47 dq mangle(x264_coeff_last%4_ %+ funccpu2)
49 dq mangle(x264_coeff_last%4_ %+ funccpu3)
; External coeff_last implementations (defined in quant asm elsewhere in x264);
; referenced by the function-pointer tables built below.
55 cextern coeff_last4_mmx2
56 cextern coeff_last4_mmx2_lzcnt
57 cextern coeff_last15_sse2
58 cextern coeff_last15_sse2_lzcnt
59 cextern coeff_last16_sse2
60 cextern coeff_last16_sse2_lzcnt
61 cextern coeff_last64_sse2
62 cextern coeff_last64_sse2_lzcnt
63 cextern coeff_last64_avx2_lzcnt
; Dispatch tables indexed by ctx_block_cat (loaded via [%2+gprsize*r2] in the
; residual macros below). The three variants differ only in which cpu flavor
; each size class uses (baseline SSE2, SSE2+LZCNT, AVX2+LZCNT for last64).
68 coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
69 coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
70 coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
; External CABAC lookup tables and helpers (defined in C / other asm files):
; range/state-transition tables used by the encode_decision kernel, the
; context-offset tables used by the block_residual functions, and the
; C fallback for coding large exp-golomb bypass values.
75 cextern cabac_range_lps
76 cextern cabac_transition
77 cextern cabac_renorm_shift
79 cextern cabac_size_unary
80 cextern cabac_transition_unary
81 cextern significant_coeff_flag_offset
82 cextern significant_coeff_flag_offset_8x8
83 cextern last_coeff_flag_offset
84 cextern last_coeff_flag_offset_8x8
85 cextern coeff_abs_level_m1_offset
87 cextern cabac_encode_ue_bypass
; Fields of the `cb` struc mirroring x264_cabac_t layout — the enclosing
; `struc cb` / `endstruc` and the remaining fields (.low, .range, .queue,
; .state) are elided from this excerpt; offsets must match the C struct.
99 .bytes_outstanding: resd 1
104 .bits_encoded: resd 1
; Zero-extending byte load from a global table, abstracting over PIC mode:
; the three movzx forms are the bodies of elided %if branches (64-bit PIC with
; the $$ base cached in r7, PIC with a caller-supplied temp %5, and non-PIC
; absolute addressing). Only one is assembled per build.
108 %macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
111 movzx %1, byte [%2+%3+r7-$$]
114 movzx %1, byte [%2+%3+%5-$$]
117 movzx %1, byte [%2+%3+%4]
; Per-ABI temp-register assignments for the cabac kernels (the %if conditions
; selecting between them — presumably WIN64 / x86-64 SysV / x86-32 — are
; elided from this excerpt).
122 ; t3 must be ecx, since it's used for shift.
124 DECLARE_REG_TMP 3,1,2,0,5,6,4,4
126 DECLARE_REG_TMP 0,1,2,3,4,5,6,6
128 DECLARE_REG_TMP 0,4,2,1,3,5,6,2
; void cabac_encode_decision( x264_cabac_t *cb, int ctx, int b )
; Codes one binary decision: looks up range_lps from the current range and
; state, updates the context state, renormalizes via the cabac_renorm_shift
; table, and writes range/queue back. Defined inside a %1-parameterized macro
; (asm/bmi2 variants); much of the body (low update, branch structure,
; .update_queue_low tail) is elided from this excerpt.
131 cglobal cabac_encode_decision_%1, 1,7
133 mov t5d, [r0+cb.range]
134 movzx t6d, byte [r0+cb.state+t1]
135 movifnidn t0, r0 ; WIN64
; Table lookups: new range for the LPS, and the next state for this context.
147 LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
148 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
156 mov [t0+cb.state+t1], t4b
; Renormalization: shift count looked up rather than computed with bsr.
166 LOAD_GLOBAL t3d, cabac_renorm_shift, t3
173 mov [t0+cb.range], t4d
174 add t3d, [t0+cb.queue]
178 mov [t0+cb.queue], t3d
; void cabac_encode_bypass( x264_cabac_t *cb, int b )
; Codes one bypass (equiprobable) bin: adds range to low iff b is nonzero
; (r1d is used as a mask against range) and bumps the bit queue. The low
; update and the .putbyte tail-call are elided from this excerpt.
181 cglobal cabac_encode_bypass_%1, 2,3
183 and r1d, [r0+cb.range]
185 movifnidn t0, r0 ; WIN64
186 mov t3d, [r0+cb.queue]
188 %if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
194 mov [t0+cb.queue], t3d
; void cabac_encode_terminal( x264_cabac_t *cb )
; Codes the end-of-slice terminal bin with value 0: range -= 2, then at most
; a single renormalization step (see the shortcut comment). The branch
; between the fast path and the renorm path is elided from this excerpt.
204 cglobal cabac_encode_terminal_%1, 1,3
205 sub dword [r0+cb.range], 2
206 ; shortcut: the renormalization shift in terminal
207 ; can only be 0 or 1 and is zero over 99% of the time.
208 test dword [r0+cb.range], 0x100
; Slow path: one renorm step — shift low/range left and queue one more bit.
212 shl dword [r0+cb.low], 1
213 shl dword [r0+cb.range], 1
214 inc dword [r0+cb.queue]
219 movifnidn t0, r0 ; WIN64
220 mov t3d, [r0+cb.queue]
; .putbyte: flush a completed byte from the cabac low/queue into the output,
; handling 0xff carry propagation via bytes_outstanding (a run of 0xff bytes
; is held back until a non-carrying byte resolves them). The byte extraction
; and the outstanding-bytes write loop are elided from this excerpt; on exit
; it rejoins encode_decision's queue/low update path.
225 ; alive: t0=cb t3=queue t6=low
227 DECLARE_REG_TMP 3,6,1,0,2,5,4
244 mov t5d, [t0+cb.bytes_outstanding]
245 cmp t2b, 0xff ; FIXME is a 32bit op faster?
254 jge .loop_outstanding
259 mov [t0+cb.bytes_outstanding], t5d
260 jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
; Bit-size accounting for one coefficient's abs level in the RD path:
; instead of actually encoding, it accumulates entropy-table costs
; (cabac_entropy, cabac_size_unary) into the running bit count and updates
; the cabac context states (r8+ctx) exactly as real encoding would.
; r2 = node_ctx, r11 receives the next ctx offset. Branches between the
; level==1 / 1<level<15 / level>=15 cases are elided from this excerpt.
267 ; %2 = node_ctx init?
268 %macro COEFF_ABS_LEVEL_GT1 2
272 movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
275 movzx r9d, byte [r8+ctx]
276 ; if( coeff_abs > 1 )
279 ; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
280 movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
281 movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
287 movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
292 ; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
293 movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
295 movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
301 movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
304 ; if( coeff_abs < 15 )
308 ; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
309 movzx r9d, byte [r8+ctx]
311 movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
312 ; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
313 movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
319 ; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
320 movzx r9d, byte [r8+ctx]
321 movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
322 ; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
323 movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
333 ; bs_size_ue_big(coeff_abs-15)<<8
335 ; (ilog2(coeff_abs-14)+1) << 8
341 movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
; Load dct coefficient at index r6 into %1. This movzx form is the 16-bit
; (8-bit-depth) dctcoef case; the HIGH_BIT_DEPTH 32-bit branch is elided.
346 %macro LOAD_DCTCOEF 1
350 movzx %1, word [dct+r6*2]
; Copy |coefficients| from %1 to the stack so the originals stay untouched.
; First group: 32-bit coefficients (ABSD, 64-byte stride per iteration);
; second group: 16-bit coefficients (ABSW, 32-byte stride). The %if
; HIGH_BIT_DEPTH split and the %rep loop over i are elided from this excerpt.
354 %macro ABS_DCTCOEFS 2
358 ABSD m0, [%1+ 0+i*64], m4
359 ABSD m1, [%1+16+i*64], m5
360 ABSD m2, [%1+32+i*64], m4
361 ABSD m3, [%1+48+i*64], m5
362 mova [rsp+ 0+i*64], m0
363 mova [rsp+16+i*64], m1
364 mova [rsp+32+i*64], m2
365 mova [rsp+48+i*64], m3
367 ABSW m0, [%1+ 0+i*32], m2
368 ABSW m1, [%1+16+i*32], m3
369 mova [rsp+ 0+i*32], m0
370 mova [rsp+16+i*32], m1
; 8x8-mode ctx index computation: r11 = sig_offset table byte at coefficient
; index r6 (r4 points into significant_coeff_flag_offset_8x8, set up in
; CABAC_RESIDUAL_RD below), and the corresponding last_coeff offset lookup.
; The enclosing macro definitions are elided from this excerpt.
378 movzx r11d, byte [r4+r6]
384 movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
388 ;-----------------------------------------------------------------------------
389 ; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
390 ; int ctx_block_cat, x264_cabac_t *cb );
391 ;-----------------------------------------------------------------------------
; RD-mode residual coder: accumulates the bit cost of the block into
; cb.bits_encoded (via the entropy tables) instead of emitting bits, while
; updating the cabac context states. %1 selects 8x8 vs generic block mode,
; %2 is the coeff_last dispatch table. The cglobal line, %if branches, loop
; labels, and epilogue are elided from this excerpt.
394 %macro CABAC_RESIDUAL_RD 2
396 %define func cabac_block_residual_8x8_rd_internal
400 %define func cabac_block_residual_rd_internal
408 %define GLOBAL +r12-$$
; Stack pad sized for the abs()'d coefficient copy, keeping 16-alignment.
414 %assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
416 shl r1d, 4 ; MB_INTERLACED*16
418 lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
421 movzx r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig
422 movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last
423 movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level
425 ; abs() all the coefficients; copy them to the stack to avoid
426 ; changing the originals.
427 ; overreading is okay; it's all valid aligned data anyways.
; Preserve any sub-16-byte AC offset of the dct pointer when redirecting it
; to the stack copy (masking with ~SIZEOF_DCTCOEF isolates that offset).
431 mov r4, r0 ; r4 = dct
432 mov r6, ~SIZEOF_DCTCOEF
433 and r6, r4 ; handle AC coefficient case
435 sub r4, r6 ; calculate our new dct pointer
436 add r4, rsp ; restore AC coefficient offset
438 mov r1, [%2+gprsize*r2 GLOBAL]
439 ; for improved OOE performance, run coeff_last on the original coefficients.
440 call r1 ; coeff_last[ctx_block_cat]( dct )
441 ; we know on 64-bit that the SSE2 versions of this function only
442 ; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
443 ; don't need r2 in 8x8 mode.
444 mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded
445 ; pre-add some values to simplify addressing
449 add r8, r3 ; precalculate cabac state pointers
451 ; if( last != count_cat_m1[ctx_block_cat] )
455 cmp r6b, [count_cat_m1+r2 GLOBAL]
459 ; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
460 ; so we'll use r11 for this.
462 %define siglast_ctx r11
464 %define siglast_ctx r6
467 ; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
468 ; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
; Cost-only "encode": fetch state, add entropy cost (r1), store next state.
470 movzx r1d, byte [r5+siglast_ctx]
471 movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
473 movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
474 mov [r5+siglast_ctx], r9b
478 movzx r1d, byte [r7+siglast_ctx]
479 movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
481 movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
482 mov [r7+siglast_ctx], r9b
486 COEFF_ABS_LEVEL_GT1 last, 1
487 ; for( int i = last-1 ; i >= 0; i-- )
494 movzx r9d, byte [r5+siglast_ctx]
497 ; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
498 movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
499 movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
500 mov [r5+siglast_ctx], r10b
506 ; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
507 movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
509 movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
510 mov [r5+siglast_ctx], r10b
512 ; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
514 movzx r9d, byte [r7+siglast_ctx]
515 movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
516 movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
517 mov [r7+siglast_ctx], r10b
519 COEFF_ABS_LEVEL_GT1 coeff, 0
; Write the accumulated bit count back (r3 was pre-advanced by cb.state,
; hence the compensating -cb.state in the displacement).
523 mov [r3+cb.bits_encoded-cb.state], r0d
; Instantiate the RD residual coders: (0 = generic, 1 = 8x8) x (sse2,
; sse2_lzcnt coeff_last tables). The INIT_XMM/cpuflag directives selecting
; each target are elided from this excerpt.
530 CABAC_RESIDUAL_RD 0, coeff_last_sse2
531 CABAC_RESIDUAL_RD 1, coeff_last_sse2
533 CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
534 CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
536 CABAC_RESIDUAL_RD 0, coeff_last_sse2
537 CABAC_RESIDUAL_RD 1, coeff_last_sse2
539 CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
540 CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
543 ;-----------------------------------------------------------------------------
544 ; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
545 ; int ctx_block_cat, x264_cabac_t *cb );
546 ;-----------------------------------------------------------------------------
; Fragment of the (elided) CALL_CABAC helper macro: dispatches one real
; cabac_encode_decision call, choosing the bmi2 or plain-asm variant
; (presumably via a cpuflag %if — selection lines elided), then restores
; WIN64 register state afterwards.
550 call cabac_encode_decision_bmi2
552 call cabac_encode_decision_asm
554 %if WIN64 ; move cabac back
; Significance-map coding loop (real encoding, not RD costing): for each
; coefficient index r10 up to count_m1 (%3), code the sig flag; for nonzero
; coefficients, stash the value in the on-stack coeffs array and code the
; last flag. %1 selects 8x8 mode (table-based ctx via sigoff_8x8 /
; last_coeff_flag_offset_8x8) vs direct ctx (lea with sigoffd/lastoffd);
; %4 suffixes the labels. Loop entry, %if branches, and the final-coefficient
; path's surrounding lines are elided from this excerpt.
563 %macro SIGMAP_LOOP 3-4
568 movsx %2, word [dct+r10*2]
571 movzx r1d, byte [sigoff_8x8 + r10]
574 lea r1d, [sigoffd + r10d]
577 jz .sigmap_%4zero ; if( l[i] )
579 mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i];
581 CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
583 movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
586 lea r1d, [lastoffd + r10d]
588 cmp r10d, lastm ; if( i == last )
591 CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
592 jmp .sigmap_%4loop_endcheck
595 CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
596 .sigmap_%4loop_endcheck:
599 jne .sigmap_%4loop ; if( ++i == count_m1 )
; Final coefficient (i == count_m1): always significant, no sig flag coded.
603 movsx %2, word [dct+r10*2]
606 mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]
608 .sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
613 jmp .level_loop_start
; Full (non-RD) residual coder: runs coeff_last via the %1 dispatch table,
; codes the significance map with SIGMAP_LOOP, then walks the saved coeffs
; array backwards coding each level (unary part via cabac_encode_decision,
; escape >= 15 via the C helper cabac_encode_ue_bypass) and the sign via
; bypass. Prologue/epilogue, the level loop labels, and most branches are
; elided from this excerpt.
617 %macro CABAC_RESIDUAL 1
618 cglobal cabac_block_residual_internal, 4,15
620 ; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
622 %define lastm [rsp+4*1]
623 %define GLOBAL +r7-$$
628 %assign pad gprsize+4*2+4*64-(stack_offset&15)
; Named register/stack-slot roles for the rest of the function.
636 %define leveloffq r10
637 %define leveloffd r10d
638 %define leveloffm [rsp+4*0]
639 %define countcatd r11d
640 %define sigoff_8x8 r12
641 %define coeffidxq r13
642 %define coeffidxd r13d
644 %define coeffs rsp+4*2
; Load the per-ctx_block_cat context offsets and coefficient count.
646 lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
648 movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
649 movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
650 movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
651 movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
654 mov leveloffm, leveloffd
656 mov r1, [%1+gprsize*r2 GLOBAL]
659 ; put cabac in r0; needed for cabac_encode_decision
665 SIGMAP_LOOP 0, r12d, countcatd,
667 SIGMAP_LOOP 1, r11d, 63, _8x8
669 ; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
; Level loop: iterate the saved nonzero coefficients from last to first,
; tracking nodectx through the coeff_abs_level_transition state machine.
672 mov leveloffd, leveloffm
673 xor nodectxd, nodectxd
675 mov r9d, [coeffs+coeffidxq*4]
679 movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
686 movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
691 movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
; Escape path for abs level >= 15: call the C exp-golomb bypass coder.
715 ; we could avoid this if we implemented it in asm, but I don't feel like that
721 sub rsp, 32 ; shadow space
723 call cabac_encode_ue_bypass
732 movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
736 call cabac_encode_bypass_bmi2
738 call cabac_encode_bypass_asm
; Instantiate the full residual coder for each coeff_last dispatch table
; (the INIT_XMM/cpuflag directives selecting each target are elided).
751 CABAC_RESIDUAL coeff_last_sse2
753 CABAC_RESIDUAL coeff_last_sse2_lzcnt
755 CABAC_RESIDUAL coeff_last_avx2_lzcnt