1 ;*****************************************************************************
2 ;* cabac-a.asm: x86 cabac
3 ;*****************************************************************************
4 ;* Copyright (C) 2008-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
; External lookup tables, declared here and defined outside this file.
; All three are read one byte at a time through LOAD_GLOBAL in
; cabac_encode_decision_asm.
32 cextern cabac_range_lps
33 cextern cabac_transition
34 cextern cabac_renorm_shift
; Mapping of the temporaries (t0, t1, ...) used by the code below onto
; numbered registers; each declaration lists eight entries.
; NOTE(review): three alternative mappings follow, but the conditionals that
; select exactly one of them are not visible in this excerpt (presumably one
; mapping per target ABI -- confirm against the full file).
36 ; t3 must be ecx, since it's used for shift.
38 DECLARE_REG_TMP 3,1,2,0,6,5,4,2
41 DECLARE_REG_TMP 0,1,2,3,4,5,6,6
44 DECLARE_REG_TMP 0,4,2,1,3,5,6,2
; One dword of storage (resd 1). The byte-output code at the end of this file
; loads and stores it as [t0+cb.bytes_outstanding].
; NOTE(review): the enclosing structure declaration and its other members are
; not visible in this excerpt.
52 .bytes_outstanding: resd 1
; Macro body fragment (presumably LOAD_GLOBAL, which is invoked below with
; four operands): a zero-extending byte load from a global table.
;   operand 1 = destination register
;   operand 2 = table symbol (used only by the second form)
;   operand 3 = first address term (used only by the second form)
;   operand 4 = second address term
; NOTE(review): the two movzx forms look like alternatives chosen by a
; conditional that is not visible here; the first addresses the table through
; r7, which must have been set up by a line outside this excerpt (the "lea"
; the comment below refers to, presumably) -- confirm against the full file.
63 ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
68 movzx %1, byte [r7+%4]
70 movzx %1, byte [%2+%3+%4]
; cabac_encode_decision_asm
; Visible effects: reads cb.range and one byte of cb.state, consults the
; cabac_range_lps / cabac_transition / cabac_renorm_shift tables, then writes
; back the state byte, cb.range and cb.queue.
; Register roles visible here: t0 = base address of the cb structure,
; t1 = byte offset into cb.state.
; NOTE(review): several instructions between the visible lines are missing
; from this excerpt, so the values held in t2..t6 at each table lookup cannot
; be determined here. The byte-output code at the end of the file jumps to a
; local label .update_queue_low of this function; that label is not visible.
74 cglobal cabac_encode_decision_asm, 0,7
; t5d = current range
77 mov t5d, [t0+cb.range]
; t6d = state byte selected by t1, zero-extended
78 movzx t6d, byte [t0+cb.state+t1]
; t5d = byte at cabac_range_lps - 4 + t5 + t4*2
87 LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
; t4d = byte at cabac_transition + t2 + t6*2
88 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
; store the low byte of t4 into the same state slot that was read above
96 mov [t0+cb.state+t1], t4b
; t3d = byte at cabac_renorm_shift + t3
100 LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
; write t4d back as the new range
106 mov [t0+cb.range], t4d
; t3d += cb.queue, then stored back as the new queue value
107 add t3d, [t0+cb.queue]
111 mov [t0+cb.queue], t3d
; cabac_encode_bypass_asm
; Visible effects: masks t3d with cb.range, reloads t3d from cb.queue and
; later stores t3d back to cb.queue. t0 is the base address of the cb
; structure.
; NOTE(review): the instructions that load t3 initially, consume the masked
; value and modify the queue are not visible in this excerpt, and neither is
; the branch guarded by the conditional below nor its closing directive.
114 cglobal cabac_encode_bypass_asm, 0,3
; t3d &= cb.range
118 and t3d, [t0+cb.range]
; t3d is reused from here on to hold the queue value
120 mov t3d, [t0+cb.queue]
122 %ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
; write the queue value back
128 mov [t0+cb.queue], t3d
; cabac_encode_terminal_asm
; Visible effects: subtracts 2 from cb.range in memory, tests bit 8 (0x100)
; of the result, and contains a one-bit renormalization sequence that doubles
; cb.low and cb.range and increments cb.queue. t0 is the base address of the
; cb structure.
; NOTE(review): the branches and labels that decide when the renormalization
; sequence runs, and the return paths, are not visible in this excerpt.
135 cglobal cabac_encode_terminal_asm, 0,3
; cb.range -= 2, directly in memory
137 sub dword [t0+cb.range], 2
138 ; shortcut: the renormalization shift in terminal
139 ; can only be 0 or 1 and is zero over 99% of the time.
; sets ZF according to bit 8 of the updated range; the consuming branch is
; not visible here
140 test dword [t0+cb.range], 0x100
; renormalize by one bit: low <<= 1, range <<= 1, queue += 1
144 shl dword [t0+cb.low], 1
145 shl dword [t0+cb.range], 1
146 inc dword [t0+cb.queue]
; t3d = queue; the code that follows lists t3=queue among its live inputs
151 mov t3d, [t0+cb.queue]
; Byte-output tail (presumably the .putbyte path mentioned in
; cabac_encode_bypass_asm -- confirm; the label itself is not visible).
; Visible effects: loads cb.bytes_outstanding into t5d, compares the low byte
; of t2 with 0xff, contains the back-edge of a loop labelled
; .loop_outstanding, stores t5d back to cb.bytes_outstanding, and finally
; jumps to the .update_queue_low label of cabac_encode_decision_asm.
; NOTE(review): the instructions that compute t2, the branch that consumes
; the 0xff comparison, the loop body and the .loop_outstanding label are not
; visible in this excerpt.
155 ; alive: t0=cb t3=queue t6=low
; Re-declares the temporaries with a seven-entry mapping.
; NOTE(review): presumably guarded by a conditional that is not visible here;
; confirm that the live-in registers listed above keep their meaning.
157 DECLARE_REG_TMP 3,6,1,0,2,5,4
; t5d = current bytes_outstanding value
167 mov t5d, [t0+cb.bytes_outstanding]
168 cmp t2b, 0xff ; FIXME is a 32bit op faster?
; loop back-edge; the flags tested here are set by an instruction that is not
; visible in this excerpt
177 jge .loop_outstanding
; write t5d back to bytes_outstanding
182 mov [t0+cb.bytes_outstanding], t5d
; unconditional jump into cabac_encode_decision_asm; nothing follows it in
; this excerpt
183 jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)