1 ;*****************************************************************************
2 ;* cabac-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
; External lookup tables, declared here and defined outside this file.
; In the code visible in this file they are read only through LOAD_GLOBAL.
29 cextern cabac_range_lps
30 cextern cabac_transition
31 cextern cabac_renorm_shift
33 ; t3 must be ecx, since it's used for shift.
; NOTE(review): three alternative temp-register mappings follow. The
; conditional directives that presumably select one of them per target are
; not visible in this excerpt -- confirm against the full file.
35 DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
38 DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
41 DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
; One dword field; the code below accesses it as cb.bytes_outstanding.
; NOTE(review): the enclosing struc header and its other fields are not
; visible in this excerpt.
49 .bytes_outstanding: resd 1
; NOTE(review): presumably the body of the LOAD_GLOBAL macro (invoked below
; with four arguments); the macro header and the conditional choosing between
; the two movzx forms are not visible in this excerpt.
; Both forms are zero-extending byte loads into the first macro argument.
60 ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
65 movzx %1, byte [r11+%4] ; r11-based form; presumably r11 was set up by the lea mentioned above -- confirm
67 movzx %1, byte [%2+%3+%4] ; form that addresses the table symbol directly
; cabac_encode_decision_asm
; NOTE(review): this excerpt omits instructions between the lines shown, so
; the comments below describe only the visible operations. t0 is used as the
; base address of the cb structure. LOAD_GLOBAL is read as a zero-extending
; byte load, per the movzx forms visible earlier in the file.
71 cglobal cabac_encode_decision_asm, 0,7
74 mov t5d, [t0+cb.range] ; t5d = cb.range
75 movzx t4d, byte [t0+cb.state+t1] ; t4d = byte at cb.state + t1, zero-extended
81 LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4 ; t5d = byte at cabac_range_lps - 4 + t5 + t4*4
82 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 ; t4d = byte at cabac_transition + t2 + t6*2
90 mov [t0+cb.state+t1], t4b ; write the low byte of t4 back to cb.state + t1
94 LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 ; t3d = byte at cabac_renorm_shift + t3
97 add t3d, [t0+cb.queue] ; t3d += cb.queue
98 mov [t0+cb.range], t4d ; cb.range = t4d
102 mov [t0+cb.queue], t3d ; cb.queue = t3d
; cabac_encode_bypass_asm
; NOTE(review): only part of the body is visible in this excerpt; the
; comments describe the visible operations only. The directives closing the
; UNIX64 conditional below are not visible here.
105 cglobal cabac_encode_bypass_asm, 0,3
110 and t3d, [t0+cb.range] ; t3d &= cb.range
112 mov t3d, [t0+cb.queue] ; t3d = cb.queue
114 %ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
120 mov [t0+cb.queue], t3d ; cb.queue = t3d
; cabac_encode_terminal_asm
; NOTE(review): only part of the body is visible in this excerpt; in
; particular the branch that consumes the result of the test below is not shown.
127 cglobal cabac_encode_terminal_asm, 0,3
129 sub dword [t0+cb.range], 2 ; cb.range -= 2
130 ; shortcut: the renormalization shift in terminal
131 ; can only be 0 or 1 and is zero over 99% of the time.
132 test dword [t0+cb.range], 0x100 ; check bit 8 of cb.range
136 shl dword [t0+cb.low], 1 ; cb.low <<= 1
137 shl dword [t0+cb.range], 1 ; cb.range <<= 1
138 inc dword [t0+cb.queue] ; cb.queue += 1
143 mov t3d, [t0+cb.queue] ; t3d = cb.queue
148 ; alive: t0=cb t3=queue t6=low
; NOTE(review): presumably the shared .putbyte path referred to in
; cabac_encode_bypass_asm; its entry label and most of its body are not
; visible in this excerpt. Comments describe the visible operations only.
150 DECLARE_REG_TMP 3,4,1,0,2,5,6,10
160 mov t5d, [t0+cb.bytes_outstanding] ; t5d = cb.bytes_outstanding
161 cmp t2b, 0xff ; FIXME is a 32bit op faster?
170 jge .loop_outstanding ; signed >= branch; the flag-setting instruction and the target label are not visible here
175 mov [t0+cb.bytes_outstanding], t5d ; cb.bytes_outstanding = t5d
176 jmp mangle(x264_cabac_encode_decision_asm.update_queue_low) ; continue at .update_queue_low in cabac_encode_decision_asm