git.sesse.net Git - x264/blob - common/x86/cabac-a.asm

   1 ;*****************************************************************************
   2 ;* cabac-a.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2008 x264 project
   5 ;*
   6 ;* Author: Loren Merritt <lorenm@u.washington.edu>
   7 ;*
   8 ;* This program is free software; you can redistribute it and/or modify
   9 ;* it under the terms of the GNU General Public License as published by
  10 ;* the Free Software Foundation; either version 2 of the License, or
  11 ;* (at your option) any later version.
  12 ;*
  13 ;* This program is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 ;* GNU General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU General Public License
  19 ;* along with this program; if not, write to the Free Software
  20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  21 ;*****************************************************************************
  22
  23 %include "x86inc.asm"
  24
  25 SECTION_RODATA
  26
  27 SECTION .text
  28
  29 cextern x264_cabac_range_lps
  30 cextern x264_cabac_transition
  31 cextern x264_cabac_renorm_shift
  32
   33 %macro DEF_TMP 16
      ; DEF_TMP T0..T7, R0..R7
      ; Takes 8 temp indices followed by 8 register indices and, for every pair
      ; (T, R), defines three aliases: tTd -> rRd, tTb -> rRb and tT -> rR.
      ; Inside the loop %1 is the current temp index and %9 is the register
      ; index eight arguments further on; %rotate 1 then advances both to the
      ; next pair.
      ; The r* names are not defined in this file (expected from x86inc.asm).
   34     %rep 8
   35         %define t%1d r%9d
   36         %define t%1b r%9b
   37         %define t%1  r%9
   38         %rotate 1
   39     %endrep
   40 %endmacro
  41
   42 ; t3 must be ecx, since it is the count register of the variable shifts
      ; (shl/shr reg, t3b) in the code below.
      ; NOTE(review): this assumes x86inc.asm numbers ecx/rcx as r3 on x86_64
      ; and as r1 on x86_32 - confirm there.
      ; x86_64: t0..t6 -> r0..r6, t7 -> r10.
      ; x86_32: t1 -> r3, t3 -> r1 and t7 -> r3, i.e. t7 shares its register
      ;         with t1, so t1 does not survive a write to t7.
      ; `pointer` is the reservation directive used for one pointer-sized
      ; field of struc cb (resq on x86_64, resd otherwise).
   43 %ifdef ARCH_X86_64
   44     DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
   45     %define pointer resq
   46 %else
   47     DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
   48     %define pointer resd
   49 %endif
  50
   51 struc cb
      ; Field offsets of the structure that x264_cabac_encode_decision reaches
      ; through r0. The per-field notes describe how the code in this file
      ; uses each field.
      ; NOTE(review): presumably this has to mirror the layout of the C-side
      ; CABAC context struct - confirm against the C header.
   52     .low: resd 1 ; 32-bit; shifted left on renorm, output bits are taken from its top
   53     .range: resd 1 ; 32-bit; reduced by the range_lps lookup, shifted left on renorm
   54     .queue: resd 1 ; 32-bit; grows by the renorm shift, a byte is extracted once it is >= 8
   55     .bytes_outstanding: resd 1 ; 32-bit; count of postponed 0xff output bytes
   56     .start: pointer 1 ; pointer-sized; not referenced by the code in this file
   57     .p: pointer 1 ; pointer-sized; address where the next output byte is stored
   58     .end: pointer 1 ; pointer-sized; limit compared against before storing output
      ; pad with single reserved bytes up to the next multiple of 16
   59     align 16, resb 1
   60     .bits_encoded: resd 1 ; 32-bit; not referenced by the code in this file
   61     .state: resb 460 ; 460 one-byte context states, indexed by the function's second argument
   62 endstruc
  63
   64 %macro LOAD_GLOBAL 4
      ; LOAD_GLOBAL dst, table, base, index
      ; Zero-extending byte load: dst = byte [table + base + index].
      ;   %1 dst   - destination register
      ;   %2 table - symbol of the byte table
      ;   %3 base  - register added to the address, or the literal 0 for none
      ;   %4 index - further address term (the callers below pass a register,
      ;              optionally scaled)
      ; Side effects: the PIC64 variant clobbers r11; the PIC32 variant with a
      ; non-zero base first builds base+index in %1 and then indexes with it.
      ; GLOBAL is not defined in this file (presumably the PIC addressing
      ; suffix provided by x86inc.asm - confirm).
   65 %ifdef PIC64
   66     ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
   67     lea   r11, [%2 GLOBAL]
   68     %ifnidn %3, 0
   69     add   r11, %3
   70     %endif
   71     movzx %1, byte [r11+%4]
   72 %elifdef PIC32
   73     %ifnidn %3, 0
   74     lea   %1, [%3+%4]
   75     movzx %1, byte [%2+%1 GLOBAL]
   76     %else
   77     movzx %1, byte [%2+%3+%4 GLOBAL]
   78     %endif
   79 %else
      ; non-PIC: the table symbol can be used directly in the address
   80     movzx %1, byte [%2+%3+%4]
   81 %endif
   82 %endmacro
  83
  84 cglobal x264_cabac_encode_decision, 0,7
  85     movifnidn t0d, r0m
  86     movifnidn t1d, r1m
  87     picgetgot t2
  88     mov   t5d, [r0+cb.range]
  89     movzx t3d, byte [r0+cb.state+t1]
  90     mov   t4d, t5d
  91     shr   t5d, 6
  92     and   t5d, 3
  93     LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
  94     sub   t4d, t5d
  95     mov   t6d, t3d
  96     shr   t6d, 6
  97 %ifdef PIC32
  98     cmp   t6d, r2m
  99 %else
 100     movifnidn t2d, r2m
 101     cmp   t6d, t2d
 102 %endif
 103     mov   t6d, [r0+cb.low]
 104     lea   t7,  [t6+t4]
 105     cmovne t4d, t5d
 106     cmovne t6d, t7d
 107 %ifdef PIC32
 108     mov   t1,  r2m
 109     LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
 110 %else
 111     LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
 112 %endif
 113     movifnidn t1d, r1m
 114     mov   [r0+cb.state+t1], t3b
 115 .renorm:
 116     mov   t3d, t4d
 117     shr   t3d, 3
 118     LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
 119     shl   t4d, t3b
 120     shl   t6d, t3b
 121     add   t3d, [r0+cb.queue]
 122     mov   [r0+cb.range], t4d
 123     mov   [r0+cb.low], t6d
 124     mov   [r0+cb.queue], t3d
 125     cmp   t3d, 8
 126     jge .putbyte
 127 .ret:
 128     REP_RET
 129 .putbyte:
 130     ; alive: t0=cb t3=queue t6=low
 131     add   t3d, 2
 132     mov   t1d, 1
 133     mov   t2d, t6d
 134     shl   t1d, t3b
 135     shr   t2d, t3b ; out
 136     dec   t1d
 137     sub   t3d, 10
 138     and   t6d, t1d
 139     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
 140     mov   [r0+cb.queue], t3d
 141     mov   [r0+cb.low], t6d
 142     mov   t1d, t2d
 143     mov   t4,  [r0+cb.p]
 144     je .postpone
 145     mov   t5d, [r0+cb.bytes_outstanding]
 146     shr   t1d, 8 ; carry
 147     lea   t6, [t4+t5+1]
 148     cmp   t6, [r0+cb.end]
 149     jge .ret
 150     add   [t4-1], t1b
 151     test  t5d, t5d
 152     jz .no_outstanding
 153     dec   t1d
 154 .loop_outstanding:
 155     mov   [t4], t1b
 156     inc   t4
 157     dec   t5d
 158     jg .loop_outstanding
 159 .no_outstanding:
 160     mov   [t4], t2b
 161     inc   t4
 162     mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
 163     mov   [r0+cb.p], t4
 164     RET
 165 .postpone:
 166     inc   dword [r0+cb.bytes_outstanding]
 167     RET
 168