1 ;*****************************************************************************
2 ;* cabac-a.asm: x86 cabac
3 ;*****************************************************************************
4 ;* Copyright (C) 2008-2010 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at licensing@x264.com.
26 ;*****************************************************************************
32 cextern cabac_range_lps
33 cextern cabac_transition
34 cextern cabac_renorm_shift
; Temporary-register mappings: DECLARE_REG_TMP presumably binds the tN aliases
; (t0, t1, ...) used by the code below to concrete registers.
; Three alternative mappings are listed here; the preprocessor conditionals
; that select one of them are not visible in this view -- presumably one per
; target ABI/architecture (TODO confirm against the full file).
36 ; t3 must be ecx, since it's used for shift.
38 DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
41 DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
44 DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
; Reserves one dword for the bytes_outstanding field, which the code accesses
; as [t0+cb.bytes_outstanding]. The enclosing declaration (presumably the
; "cb" struc) is not visible in this view.
52 .bytes_outstanding: resd 1
; Two variants of a byte table load, each zero-extending one byte into %1.
; Presumably these are the body of the 4-parameter LOAD_GLOBAL macro used by
; the encoder routines; the %macro header and the conditionals choosing
; between the variants are not visible in this view.
63 ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
; Variant 1: address is r11 + %4. r11 presumably holds a table base address
; set up by an lea that is not shown here (see the remark above) -- confirm.
68 movzx %1, byte [r11+%4]
; Variant 2: address is formed directly as %2 + %3 + %4.
70 movzx %1, byte [%2+%3+%4]
; cabac_encode_decision_asm
; Reads and updates the range, queue and per-context state fields of the
; structure addressed by t0 (field offsets cb.range, cb.queue, cb.state),
; using lookups in the cabac_range_lps, cabac_transition and
; cabac_renorm_shift tables.
; NOTE(review): only part of this routine is visible in this view; labels,
; branches and the arithmetic between the lines below are missing, so the
; comments describe each visible instruction in isolation.
; The "0,7" arguments to cglobal are presumably the auto-loaded argument count
; and the number of general-purpose registers used -- confirm against x86inc.
74 cglobal cabac_encode_decision_asm, 0,7
; t5d = current range value from the structure at t0.
77 mov t5d, [t0+cb.range]
; t4d = zero-extended state byte at index t1 within cb.state.
78 movzx t4d, byte [t0+cb.state+t1]
; Byte lookup relative to cabac_range_lps-4, offset by t5 and t4*4; result
; replaces t5d.
84 LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4
; Byte lookup in cabac_transition, offset by t2 and t6*2; result goes to t4d.
85 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
; Write the low byte of t4 back to the state entry at index t1.
93 mov [t0+cb.state+t1], t4b
; Byte lookup in cabac_renorm_shift indexed by t3; result goes to t3d.
97 LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
; t3d += stored queue value.
100 add t3d, [t0+cb.queue]
; Store t4d as the new range (the code that computes it is not visible here).
101 mov [t0+cb.range], t4d
; Store t3d as the new queue value.
105 mov [t0+cb.queue], t3d
; cabac_encode_bypass_asm
; Operates on the range and queue fields of the structure addressed by t0.
; NOTE(review): only part of this routine is visible in this view; the
; instructions that set up t3d and that consume the results are missing.
; The "0,3" arguments to cglobal are presumably the auto-loaded argument count
; and the number of general-purpose registers used -- confirm against x86inc.
108 cglobal cabac_encode_bypass_asm, 0,3
; Mask t3d with the current range value.
113 and t3d, [t0+cb.range]
; t3d = current queue value (overwrites the masked value; its consumer is not
; visible here).
115 mov t3d, [t0+cb.queue]
; UNIX64-specific path; the body of this conditional and its %else/%endif are
; not visible in this view.
117 %ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
; Store t3d back as the queue value.
123 mov [t0+cb.queue], t3d
; cabac_encode_terminal_asm
; Subtracts 2 from the range field of the structure addressed by t0 and, when
; a one-bit renormalization is needed, shifts low and range left by one and
; increments queue. The lines after that belong to byte-output code that
; updates bytes_outstanding and finally jumps into
; cabac_encode_decision_asm's .update_queue_low label.
; NOTE(review): labels, branches and returns are not visible in this view, so
; the exact boundary between this routine and the shared byte-output code
; cannot be determined from here.
; The "0,3" arguments to cglobal are presumably the auto-loaded argument count
; and the number of general-purpose registers used -- confirm against x86inc.
130 cglobal cabac_encode_terminal_asm, 0,3
; range -= 2, performed directly in memory.
132 sub dword [t0+cb.range], 2
133 ; shortcut: the renormalization shift in terminal
134 ; can only be 0 or 1 and is zero over 99% of the time.
; Tests bit 8 (0x100) of range; the branch that acts on the result is not
; visible here.
135 test dword [t0+cb.range], 0x100
; One-bit renormalization, done in memory: low <<= 1, range <<= 1, queue += 1.
139 shl dword [t0+cb.low], 1
140 shl dword [t0+cb.range], 1
141 inc dword [t0+cb.queue]
; t3d = queue, matching the register contract stated in the comment below.
146 mov t3d, [t0+cb.queue]
151 ; alive: t0=cb t3=queue t6=low
; Re-declares the temporary-register mapping for the code that follows. This
; list has 8 entries whereas the mappings at the top of the file have 9; the
; conditional guarding this line (if any) is not visible -- TODO confirm.
153 DECLARE_REG_TMP 3,4,1,0,2,5,6,10
; t5d = number of outstanding bytes recorded in the structure.
163 mov t5d, [t0+cb.bytes_outstanding]
; Compares the low byte of t2 with 0xff; the branch taken on the result is
; not visible here.
164 cmp t2b, 0xff ; FIXME is a 32bit op faster?
; Signed >= branch back to .loop_outstanding; neither the label nor the
; instruction that sets the flags is visible in this view.
173 jge .loop_outstanding
; Store the updated outstanding-byte count.
178 mov [t0+cb.bytes_outstanding], t5d
; Jump (not call) to the .update_queue_low local label of
; cabac_encode_decision_asm; mangle() presumably applies the platform symbol
; prefix. The target label is not visible in this view.
179 jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)