git.sesse.net Git - x264/blob - common/x86/cabac-a.asm

   1 ;*****************************************************************************
   2 ;* cabac-a.asm: x86 cabac
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2008-2011 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*          Fiona Glaser <fiona@x264.com>
   8 ;*          Holger Lubitz <holger@lubitz.org>
   9 ;*
  10 ;* This program is free software; you can redistribute it and/or modify
  11 ;* it under the terms of the GNU General Public License as published by
  12 ;* the Free Software Foundation; either version 2 of the License, or
  13 ;* (at your option) any later version.
  14 ;*
  15 ;* This program is distributed in the hope that it will be useful,
  16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;* GNU General Public License for more details.
  19 ;*
  20 ;* You should have received a copy of the GNU General Public License
  21 ;* along with this program; if not, write to the Free Software
  22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23 ;*
  24 ;* This program is also available under a commercial proprietary license.
  25 ;* For more information, contact us at licensing@x264.com.
  26 ;*****************************************************************************
  27
  28 %include "x86inc.asm"
  29
  30 SECTION .text
  31
  32 cextern cabac_range_lps
  33 cextern cabac_transition
  34 cextern cabac_renorm_shift
  35
  36 ; t3 must be ecx, since it's used for shift.
  37 %ifdef WIN64
     ; Per-target choice of the temporaries t0..t7 and of the directive used for
     ; pointer-sized fields of `struc cb`.
     ; NOTE(review): DECLARE_REG_TMP comes from x86inc.asm; presumably its Nth
     ; argument is the register index that tN aliases -- confirm there.
     ; WIN64: t3 gets index 0, and t7 gets index 2, the same index as t2.
  38     DECLARE_REG_TMP 3,1,2,0,6,5,4,2
     ; Pointer fields of cb are reserved as 8-byte quantities on 64-bit targets.
  39     %define pointer resq
  40 %elifdef ARCH_X86_64
     ; Other x86-64 targets: t0..t6 get indices 0..6 in order; t7 gets index 6,
     ; the same index as t6.
  41     DECLARE_REG_TMP 0,1,2,3,4,5,6,6
  42     %define pointer resq
  43 %else
     ; 32-bit x86: t3 gets index 1, and t7 gets index 2, the same index as t2.
  44     DECLARE_REG_TMP 0,4,2,1,3,5,6,2
     ; Pointer fields of cb are reserved as 4-byte quantities here.
  45     %define pointer resd
  46 %endif
  47
  48 struc cb
     ; Field offsets of the CABAC encoder context that the routines in this file
     ; address through t0.
     ; NOTE(review): presumably this has to mirror a C-side struct layout byte for
     ; byte -- confirm against the C declaration before changing anything here.
  49     .low: resd 1                 ; dword; shifted left on renormalisation, finished bytes are extracted from it in cabac_putbyte
  50     .range: resd 1               ; dword; current coding range
  51     .queue: resd 1               ; dword; signed counter, cabac_putbyte is entered once it becomes >= 0
  52     .bytes_outstanding: resd 1   ; dword; count of postponed output bytes (see cabac_putbyte)
  53     .start: pointer 1            ; pointer-sized; not referenced by the code shown in this file
  54     .p: pointer 1                ; pointer-sized; output write position advanced by cabac_putbyte
  55     .end: pointer 1              ; pointer-sized; not referenced by the code shown in this file
     ; Pad with reserved bytes so that the next field starts at a multiple of 16.
  56     align 16, resb 1
  57     .bits_encoded: resd 1        ; dword; not referenced by the code shown in this file
  58     .state: resb 1024            ; 1024 one-byte states, indexed by the second argument of cabac_encode_decision_asm
  59 endstruc
  60
  61 %macro LOAD_GLOBAL 4
     ; LOAD_GLOBAL dst, table, offset, index
     ;   %1 dst    : register that receives the zero-extended byte
     ;   %2 table  : table symbol, optionally with a constant displacement
     ;   %3 offset : register added to the address, or the literal 0 for "none"
     ;   %4 index  : remaining address term (a register or register*scale)
     ; Effect: dst = byte [table + offset + index].
     ; Clobbers: r7, but only when PIC is defined.
  62 %ifdef PIC
  63     ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
     ; The table address is first materialised in r7, then offset and index are applied.
  64     lea   r7, [%2]
     ; Textual comparison: the add is omitted only when the third argument is literally 0.
  65     %ifnidn %3, 0
  66     add   r7, %3
  67     %endif
  68     movzx %1, byte [r7+%4]
  69 %else
     ; Without PIC the whole address fits into one effective-address expression.
  70     movzx %1, byte [%2+%3+%4]
  71 %endif
  72 %endmacro
  73
  74 cglobal cabac_encode_decision_asm, 0,7
     ; Encodes one context-coded bin and renormalises the coder.
     ; Arguments (fetched through r0mp / r1m / r2m):
     ;   arg0 -> t0  : address of the cb context
     ;   arg1 -> t1d : index into cb.state
     ;   arg2 -> t2d : bin value -- presumably 0 or 1, TODO confirm with callers
     ; Writes cb.state[arg1], cb.range, cb.low and cb.queue. When the updated queue
     ; counter is >= 0 control passes to cabac_putbyte, which jumps back to
     ; .update_queue_low below.
     ; NOTE(review): several mov/lea instructions sit between the cmp and the two
     ; cmovne that consume its flags; do not reorder or insert flag-writing code.
  75     movifnidn t0,  r0mp
  76     movifnidn t1d, r1m
  77     mov   t5d, [t0+cb.range]          ; t5d = range
  78     movzx t6d, byte [t0+cb.state+t1]  ; t6d = state byte selected by arg1
  79     mov   t4d, ~1
  80     mov   t3d, t5d                    ; t3d = second copy of range
  81     and   t4d, t6d                    ; t4d = state & ~1
  82     shr   t5d, 6                      ; t5d = range >> 6
  83     movifnidn t2d, r2m
  84 %ifdef WIN64
         ; LOAD_GLOBAL overwrites r7 when PIC is defined, so r7 is saved around its uses on WIN64.
  85     PUSH r7
  86 %endif
         ; t5d = byte at cabac_range_lps + (state & ~1)*2 + (range >> 6) - 4 : the LPS range
  87     LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
         ; t4d = byte at cabac_transition + state*2 + bin : the replacement state
  88     LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
  89     and   t6d, 1                      ; t6d = lowest bit of the old state
  90     sub   t3d, t5d                    ; t3d = range - range_lps
  91     cmp   t6d, t2d                    ; flags are consumed by both cmovne below
  92     mov   t6d, [t0+cb.low]            ; t6d = low (mov leaves the flags alone)
  93     lea    t2, [t6+t3]                ; t2 = low + (range - range_lps) (lea leaves the flags alone)
  94     cmovne t3d, t5d                   ; low state bit != bin: range = range_lps
  95     cmovne t6d, t2d                   ; low state bit != bin: low += range - range_lps
  96     mov   [t0+cb.state+t1], t4b       ; store the replacement state byte
  97 ;cabac_encode_renorm
  98     mov   t4d, t3d                    ; t4d = updated range
  99     shr   t3d, 3
         ; t3d = byte at cabac_renorm_shift + (range >> 3) : the shift count
 100     LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
 101 %ifdef WIN64
 102     POP r7
 103 %endif
 104     shl   t4d, t3b                    ; range <<= shift
 105     shl   t6d, t3b                    ; low   <<= shift
 106     mov   [t0+cb.range], t4d
 107     add   t3d, [t0+cb.queue]          ; t3d = queue + shift
 108     jge cabac_putbyte                 ; signed sum >= 0: continue there with t0=cb, t3=queue, t6=low
 109 .update_queue_low:
         ; Also the common exit reached from the tail of cabac_putbyte.
 110     mov   [t0+cb.low], t6d
 111     mov   [t0+cb.queue], t3d
 112     RET
 113
 114 cglobal cabac_encode_bypass_asm, 0,3
     ; Encodes one bypass bin: low = low*2 + (arg1 & range), queue = queue + 1.
     ; Arguments (fetched through r0mp / r1m):
     ;   arg0 -> t0  : address of the cb context
     ;   arg1 -> t3d : value ANDed with cb.range
     ;                 NOTE(review): presumably callers pass 0 or -1 so that the AND
     ;                 yields 0 or range -- confirm with callers.
     ; Writes cb.low and cb.queue here, or hands both to cabac_putbyte when the
     ; incremented queue counter is >= 0.
 115     movifnidn  t0, r0mp
 116     movifnidn t3d, r1m
 117     mov       t7d, [t0+cb.low]
 118     and       t3d, [t0+cb.range]      ; t3d = arg1 & range
 119     lea       t7d, [t7*2+t3]          ; t7d = low*2 + (arg1 & range)
 120     mov       t3d, [t0+cb.queue]
 121     inc       t3d                     ; t3d = queue + 1; its flags drive the jge below
 122 %ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
 123     jge cabac_putbyte
 124 %else
 125     jge .putbyte
 126 %endif
 127     mov   [t0+cb.low], t7d
 128     mov   [t0+cb.queue], t3d
 129     RET
 130 .putbyte:
         ; NOTE(review): presumably PROLOGUE 0,7 switches this path to the 7-register
         ; frame used by the code behind cabac_putbyte, whose exit is the RET of
         ; cabac_encode_decision_asm (declared 0,7) -- confirm in x86inc.asm.
 131     PROLOGUE 0,7
         ; cabac_putbyte expects low in t6; t6 and t7 share one index on the non-WIN64 x86-64 target.
 132     movifnidn t6d, t7d
 133     jmp cabac_putbyte
 134
 135 cglobal cabac_encode_terminal_asm, 0,3
     ; Subtracts 2 from cb.range and, if bit 8 of the result is clear, renormalises
     ; by exactly one bit (low, range and queue are all updated in memory).
     ; Argument (fetched through r0mp):
     ;   arg0 -> t0 : address of the cb context
     ; Works directly on memory operands; registers other than t0 are loaded only
     ; on the .putbyte path.
 136     movifnidn  t0, r0mp
 137     sub  dword [t0+cb.range], 2
 138 ; shortcut: the renormalization shift in terminal
 139 ; can only be 0 or 1 and is zero over 99% of the time.
 140     test dword [t0+cb.range], 0x100   ; is bit 8 of the reduced range still set?
 141     je .renorm                        ; bit clear: a one-bit shift is required
 142     REP_RET
 143 .renorm:
 144     shl  dword [t0+cb.low], 1
 145     shl  dword [t0+cb.range], 1
 146     inc  dword [t0+cb.queue]          ; flags of the incremented counter drive the jge below
 147     jge .putbyte                      ; counter >= 0 (signed)
 148     REP_RET
 149 .putbyte:
         ; NOTE(review): presumably PROLOGUE 0,7 switches this path to the 7-register
         ; frame used by the code behind cabac_putbyte -- confirm in x86inc.asm.
 150     PROLOGUE 0,7
 151     mov t3d, [t0+cb.queue]            ; cabac_putbyte expects queue in t3
 152     mov t6d, [t0+cb.low]              ; ... and low in t6
         ; No jump here: execution falls through into cabac_putbyte.
 153
 154 cabac_putbyte:
 155     ; alive: t0=cb t3=queue t6=low
         ; Shared tail of the three encode routines. It extracts the bits of low
         ; above bit position queue+10 ("out"), keeps the remaining bits in low and
         ; lowers queue by 8. If the low byte of out is 0xff the byte is postponed
         ; (bytes_outstanding += 1). Otherwise the carry is added to the byte just
         ; before cb.p, bytes_outstanding filler bytes and the low byte of out are
         ; written, cb.p is advanced and bytes_outstanding is reset to 0.
         ; Leaves through cabac_encode_decision_asm's .update_queue_low, which
         ; stores t6d/t3d into cb.low/cb.queue and returns.
         ; No check against cb.end is made in this routine.
 156 %ifdef WIN64
         ; NOTE(review): presumably this remap makes t2 the register whose second
         ; byte is dh, matching the hard-coded dh below -- confirm against the
         ; WIN64 register numbering in x86inc.asm.
 157     DECLARE_REG_TMP 3,6,1,0,2,5,4
 158 %endif
 159     mov   t1d, -1
 160     add   t3d, 10                     ; shift count = queue + 10
 161     mov   t2d, t6d
 162     shl   t1d, t3b                    ; t1d = -1 << (queue + 10)
 163     shr   t2d, t3b ; out
 164     not   t1d                         ; t1d = mask covering the low (queue + 10) bits
 165     sub   t3d, 18                     ; t3d = queue - 8 (10 was added above)
 166     and   t6d, t1d                    ; drop the extracted bits from low
 167     mov   t5d, [t0+cb.bytes_outstanding]
 168     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
 169     jz    .postpone                   ; low byte of out is 0xff: only count it
 170     mov    t1, [t0+cb.p]
 171     add   [t1-1], dh ; t2h
 172     dec   dh                          ; carry byte - 1: 0xff if it was 0, 0x00 if it was 1
 173 .loop_outstanding:
         ; Executes bytes_outstanding + 1 times; the final store is overwritten after the loop.
 174     mov   [t1], dh
 175     inc   t1
 176     dec   t5d
 177     jge .loop_outstanding
 178     mov   [t1-1], t2b                 ; replace the last filler byte with the low byte of out
 179     mov   [t0+cb.p], t1
 180 .postpone:
         ; t5d is -1 when coming from the loop (result 0), or the old count when
         ; jumping here directly (result count + 1).
 181     inc   t5d
 182     mov   [t0+cb.bytes_outstanding], t5d
 183     jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)