;*****************************************************************************
-;* cabac-a.asm: h264 encoder library
+;* cabac-a.asm: x86 cabac
;*****************************************************************************
-;* Copyright (C) 2008 x264 project
+;* Copyright (C) 2008-2010 x264 project
;*
-;* Author: Loren Merritt <lorenm@u.washington.edu>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Fiona Glaser <fiona@x264.com>
+;* Holger Lubitz <holger@lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
-SECTION_RODATA
-
SECTION .text
-cextern x264_cabac_range_lps
-cextern x264_cabac_transition
-cextern x264_cabac_renorm_shift
-
; DEF_TMP (removed by this change): takes 16 arguments; for each of the first 8
; it defines t<arg>, t<arg>d and t<arg>b as aliases of r<arg+8>, r<arg+8>d and
; r<arg+8>b (%rotate 1 steps both halves of the argument list in lockstep).
-%macro DEF_TMP 16
- %rep 8
- %define t%1d r%9d
- %define t%1b r%9b
- %define t%1 r%9
- %rotate 1
- %endrep
-%endmacro
+cextern cabac_range_lps
+cextern cabac_transition
+cextern cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
; Per-ABI choice of the temporaries t0..t8 used by every routine below.
; Each new list has nine entries; the ninth (t8) is only referenced by
; cabac_encode_bypass_asm. DECLARE_REG_TMP is defined in x86inc.asm (not
; visible here); presumably entry N names the rN register that tN aliases,
; as the removed DEF_TMP did -- TODO confirm.
; 'pointer' selects the reservation directive used for pointer-sized
; fields: resq on the two 64-bit branches, resd otherwise.
-%ifdef ARCH_X86_64
- DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
+%ifdef WIN64
+ DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
+ %define pointer resq
+%elifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
%define pointer resq
%else
- DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
%define pointer resd
%endif
endstruc
; LOAD_GLOBAL dst, table, base, index
;   Zero-extending byte load: dst = byte [table + base + index].
;   %1 = destination register
;   %2 = table symbol (may carry a constant bias, e.g. sym-4)
;   %3 = register added to the table address, or the literal 0 for none
;   %4 = remaining index expression (may be scaled, e.g. reg*4)
;   When PIC is defined the table address is first formed in r11, so r11 is
;   clobbered; otherwise the symbol is used directly in the address.
%macro LOAD_GLOBAL 4
-%ifdef PIC64
+%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
- lea r11, [%2 GLOBAL]
+ lea r11, [%2]
%ifnidn %3, 0 ; skip the add when the caller passed the literal 0
add r11, %3
%endif
movzx %1, byte [r11+%4]
-%elifdef PIC32
- %ifnidn %3, 0
- lea %1, [%3+%4]
- movzx %1, byte [%2+%1 GLOBAL]
- %else
- movzx %1, byte [%2+%3+%4 GLOBAL]
- %endif
%else
movzx %1, byte [%2+%3+%4]
%endif
%endmacro
; cabac_encode_decision_asm: three arguments, roles as read by the code below:
;   arg0 -> t0: base pointer for the cb.* fields
;   arg1 -> t1: index of the state byte inside cb.state
;   arg2 -> t2: value compared against the state's low bit; it is also the
;               base offset into cabac_transition
; Rewrites cb.state[t1] and cb.range, then either stores cb.low/cb.queue and
; returns, or jumps to cabac_putbyte when the updated queue is >= 0.
; .update_queue_low is also the shared exit that cabac_putbyte jumps back to.
-cglobal x264_cabac_encode_decision_asm, 0,7
- movifnidn t0d, r0m
+cglobal cabac_encode_decision_asm, 0,7
+ movifnidn t0, r0mp ; pointer-sized load of arg0
movifnidn t1d, r1m
- picgetgot t2
- mov t5d, [r0+cb.range]
- movzx t3d, byte [r0+cb.state+t1]
- mov t4d, t5d
+ mov t5d, [t0+cb.range]
+ movzx t4d, byte [t0+cb.state+t1] ; t4 = current state byte
+ mov t3d, t5d ; t3 = range (working copy)
+ mov t6d, t4d ; t6 = state (full value, kept for the transition lookup)
shr t5d, 6 ; t5 = range >> 6
- and t5d, 3
- LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
- sub t4d, t5d
- mov t6d, t3d
- shr t6d, 6
-%ifdef PIC32
- cmp t6d, r2m
-%else
+ shr t4d, 1 ; t4 = state >> 1
movifnidn t2d, r2m
+ LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4 ; t5 = range_lps[-4 + (range>>6) + (state>>1)*4]; the removed code masked range>>6 with 3, so the -4 bias presumably relies on range>>6 being 4..7 -- confirm
+ LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 ; t4 = next state = transition[arg2 + state*2]
+ and t6d, 1 ; t6 = low bit of the old state
+ sub t3d, t5d ; t3 = range - range_lps
cmp t6d, t2d ; flags consumed by both cmovne below (mov/lea leave them intact)
-%endif
- mov t6d, [r0+cb.low]
- lea t7, [t6+t4]
- cmovne t4d, t5d
+ mov t6d, [t0+cb.low]
+ lea t7, [t6+t3] ; t7 = low + (range - range_lps)
+ cmovne t3d, t5d ; arg2 != state low bit: range = range_lps ...
cmovne t6d, t7d ; ... and low = low + (range - range_lps)
-%ifdef PIC32
- mov t1, r2m
- LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
-%else
- LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
-%endif
- movifnidn t1d, r1m
- mov [r0+cb.state+t1], t3b
-.renorm:
- mov t3d, t4d
+ mov [t0+cb.state+t1], t4b ; store the next state
+;cabac_encode_renorm
+ mov t4d, t3d ; t4 = range; t3 is about to become the shift count
shr t3d, 3
- LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+ LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 ; t3 = renorm_shift[range >> 3]
shl t4d, t3b ; range <<= shift
shl t6d, t3b ; low <<= shift
- add t3d, [r0+cb.queue]
- mov [r0+cb.range], t4d
- mov [r0+cb.low], t6d
- mov [r0+cb.queue], t3d
- cmp t3d, 8
+ add t3d, [t0+cb.queue] ; t3 = queue + shift; sets the flags tested by jge
+ mov [t0+cb.range], t4d ; mov does not modify flags
+ jge cabac_putbyte ; queue >= 0 (signed): continue in cabac_putbyte with t0/t3/t6 live
+.update_queue_low: ; shared tail: expects t0=cb, t3=queue, t6=low
+ mov [t0+cb.low], t6d
+ mov [t0+cb.queue], t3d
+ RET
+
; cabac_encode_bypass_asm: two arguments, roles as read by the code below:
;   arg0 -> t0: base pointer for the cb.* fields
;   arg1 -> t3: value whose negation masks cb.range
; Computes low = low*2 + (-arg1 & range) and queue = queue + 1. While the new
; queue is negative both are stored and the routine returns; otherwise control
; passes to cabac_putbyte with t0=cb, t3=queue, t6=low.
+cglobal cabac_encode_bypass_asm, 0,3
+ movifnidn t0, r0mp
+ movifnidn t3d, r1m
+ neg t3d ; t3 = -arg1 (0 for 0, all-ones for 1)
+ mov t8d, [t0+cb.low]
+ and t3d, [t0+cb.range] ; t3 = -arg1 & range
+ lea t8d, [t8*2+t3] ; t8 = low*2 + (-arg1 & range)
+ mov t3d, [t0+cb.queue]
+ inc t3d ; t3 = queue + 1; sets the flags tested by jge
+%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
+ jge cabac_putbyte
+%else
+ jge .putbyte
+%endif
+ mov [t0+cb.low], t8d
+ mov [t0+cb.queue], t3d
+ RET
+.putbyte:
+ PROLOGUE 0,7 ; function was declared with 3 registers; cabac_putbyte is reached from code declared with 7 -- presumably this saves the extra ones (x86inc macro, confirm)
+ movifnidn t6d, t8d ; cabac_putbyte expects low in t6
+ jmp cabac_putbyte
+
; cabac_encode_terminal_asm: one argument, arg0 -> t0 = base pointer for cb.*.
; Subtracts 2 from cb.range. If bit 8 (0x100) of the result is still set the
; routine returns at once; otherwise cb.low and cb.range are doubled in place
; and cb.queue is incremented, and cabac_putbyte is entered when the
; incremented queue is >= 0.
+cglobal cabac_encode_terminal_asm, 0,3
+ movifnidn t0, r0mp
+ sub dword [t0+cb.range], 2
+; shortcut: the renormalization shift in terminal
+; can only be 0 or 1 and is zero over 99% of the time.
+ test dword [t0+cb.range], 0x100
+ je .renorm ; bit 8 clear: a shift of 1 is needed
+ REP_RET
+.renorm:
+ shl dword [t0+cb.low], 1
+ shl dword [t0+cb.range], 1
+ inc dword [t0+cb.queue] ; updates queue in memory and sets the flags for jge
jge .putbyte
REP_RET
.putbyte:
+ PROLOGUE 0,7 ; declared with 3 registers; presumably saves the extra ones cabac_putbyte uses (x86inc macro, confirm)
+ mov t3d, [t0+cb.queue] ; reload the values cabac_putbyte expects: t3 = queue ...
+ mov t6d, [t0+cb.low] ; ... and t6 = low
+ jmp cabac_putbyte
+
; cabac_putbyte: shared tail reached by jmp/jge from the three encode routines
; above, not a standalone function. It splits low at bit (queue+10): the upper
; part becomes 'out', the lower part stays in low, and queue drops by 8.
; If the low byte of out is 0xff nothing is written and cb.bytes_outstanding
; is incremented. Otherwise bits 8..15 of out are added into the byte at
; cb.p-1, the outstanding bytes and then the low byte of out are written at
; cb.p, cb.p is advanced and cb.bytes_outstanding ends up 0.
; Exits through cabac_encode_decision_asm's .update_queue_low, which stores
; low/queue and returns.
+cabac_putbyte:
; alive: t0=cb t3=queue t6=low
- add t3d, 2
- mov t1d, 1
+%ifdef WIN64
+ DECLARE_REG_TMP 3,4,1,0,2,5,6,10 ; t0, t3 and t6 keep the same entries as the file-level WIN64 list; t1, t2 and t4 are permuted (presumably so that t2 is edx for the dh accesses below -- confirm)
+%endif
+ mov t1d, -1
+ add t3d, 10 ; t3 = queue + 10 = shift count
mov t2d, t6d
shl t1d, t3b ; t1 = -1 << (queue+10)
shr t2d, t3b ; out
- dec t1d
- sub t3d, 10
+ not t1d ; t1 = mask of the low (queue+10) bits
+ sub t3d, 18 ; t3 = original queue - 8
and t6d, t1d ; low keeps only the bits below the extracted part
+ mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
- mov [r0+cb.queue], t3d
- mov [r0+cb.low], t6d
- mov t1d, t2d
- mov t4, [r0+cb.p]
- je .postpone
- mov t5d, [r0+cb.bytes_outstanding]
- shr t1d, 8 ; carry
- add [t4-1], t1b
- test t5d, t5d
- jz .no_outstanding
- dec t1d
+ jz .postpone ; low byte of out is 0xff: write nothing, just count it
+ mov t1, [t0+cb.p]
+ add [t1-1], dh ; t2h: bits 8..15 of out are added into the previously written byte
+ dec dh ; dh = t2h - 1, the value stored for each outstanding byte
.loop_outstanding:
- mov [t4], t1b
- inc t4
+ mov [t1], dh
+ inc t1
dec t5d
- jg .loop_outstanding
-.no_outstanding:
- mov [t4], t2b
- inc t4
- mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
- mov [r0+cb.p], t4
- RET
+ jge .loop_outstanding ; runs bytes_outstanding+1 times and leaves t5 = -1 (assuming the count was non-negative)
+ mov [t1-1], t2b ; the extra, last byte written by the loop is replaced by the low byte of out
+ mov [t0+cb.p], t1
.postpone:
- inc dword [r0+cb.bytes_outstanding]
- RET
-
+ inc t5d ; fall-through: -1 -> 0; postponed path: bytes_outstanding + 1
+ mov [t0+cb.bytes_outstanding], t5d
+ jmp mangle(x264_cabac_encode_decision_asm.update_queue_low) ; NOTE(review): the cglobal name above no longer spells the x264_ prefix; presumably cglobal/mangle still produce this symbol -- confirm