MMX version of high bit depth plane_copy

[x264] / common / x86 / cabac-a.asm
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 183164f0b9af01caa60c95554eeb990e60b95820..c70c671c9d7fbb08dc85006a1974664ac7856818 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -1,9 +1,11 @@
  ;*****************************************************************************
-;* cabac-a.asm: h264 encoder library
+;* cabac-a.asm: x86 cabac
  ;*****************************************************************************
-;* Copyright (C) 2008 x264 project
+;* Copyright (C) 2008-2010 x264 project
  ;*
-;* Author: Loren Merritt <lorenm@u.washington.edu>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Fiona Glaser <fiona@x264.com>
+;*          Holger Lubitz <holger@lubitz.org>
  ;*
  ;* This program is free software; you can redistribute it and/or modify
  ;* it under the terms of the GNU General Public License as published by
@@ -17,34 +19,29 @@
  ;*
  ;* You should have received a copy of the GNU General Public License
  ;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
  ;*****************************************************************************
  
  %include "x86inc.asm"
  
-SECTION_RODATA
-
  SECTION .text
  
-cextern x264_cabac_range_lps
-cextern x264_cabac_transition
-cextern x264_cabac_renorm_shift
-
-%macro DEF_TMP 16
-    %rep 8
-        %define t%1d r%9d
-        %define t%1b r%9b
-        %define t%1  r%9
-        %rotate 1
-    %endrep
-%endmacro
+cextern cabac_range_lps
+cextern cabac_transition
+cextern cabac_renorm_shift
  
  ; t3 must be ecx, since it's used for shift.
-%ifdef ARCH_X86_64
-    DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
+%ifdef WIN64
+    DECLARE_REG_TMP 3,1,2,0,4,5,6,10,2
+    %define pointer resq
+%elifdef ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,10,6
      %define pointer resq
  %else
-    DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
+    DECLARE_REG_TMP 0,4,2,1,3,5,6,2,2
      %define pointer resd
  %endif
  
@@ -62,103 +59,121 @@ struc cb
  endstruc
  
  %macro LOAD_GLOBAL 4
-%ifdef PIC64
+%ifdef PIC
      ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
-    lea   r11, [%2 GLOBAL]
+    lea   r11, [%2]
      %ifnidn %3, 0
      add   r11, %3
      %endif
      movzx %1, byte [r11+%4]
-%elifdef PIC32
-    %ifnidn %3, 0
-    lea   %1, [%3+%4]
-    movzx %1, byte [%2+%1 GLOBAL]
-    %else
-    movzx %1, byte [%2+%3+%4 GLOBAL]
-    %endif
  %else
      movzx %1, byte [%2+%3+%4]
  %endif
  %endmacro
  
-cglobal x264_cabac_encode_decision_asm, 0,7
-    movifnidn t0d, r0m
+cglobal cabac_encode_decision_asm, 0,7
+    movifnidn t0,  r0mp
      movifnidn t1d, r1m
-    picgetgot t2
-    mov   t5d, [r0+cb.range]
-    movzx t3d, byte [r0+cb.state+t1]
-    mov   t4d, t5d
+    mov   t5d, [t0+cb.range]
+    movzx t4d, byte [t0+cb.state+t1]
+    mov   t3d, t5d
+    mov   t6d, t4d
      shr   t5d, 6
-    and   t5d, 3
-    LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
-    sub   t4d, t5d
-    mov   t6d, t3d
-    shr   t6d, 6
-%ifdef PIC32
-    cmp   t6d, r2m
-%else
+    shr   t4d, 1
      movifnidn t2d, r2m
+    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*4
+    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
+    and   t6d, 1
+    sub   t3d, t5d
      cmp   t6d, t2d
-%endif
-    mov   t6d, [r0+cb.low]
-    lea   t7,  [t6+t4]
-    cmovne t4d, t5d
+    mov   t6d, [t0+cb.low]
+    lea   t7,  [t6+t3]
+    cmovne t3d, t5d
      cmovne t6d, t7d
-%ifdef PIC32
-    mov   t1,  r2m
-    LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
-%else
-    LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
-%endif
-    movifnidn t1d, r1m
-    mov   [r0+cb.state+t1], t3b
-.renorm:
-    mov   t3d, t4d
+    mov   [t0+cb.state+t1], t4b
+;cabac_encode_renorm
+    mov   t4d, t3d
      shr   t3d, 3
-    LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+    LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
      shl   t4d, t3b
      shl   t6d, t3b
-    add   t3d, [r0+cb.queue]
-    mov   [r0+cb.range], t4d
-    mov   [r0+cb.low], t6d
-    mov   [r0+cb.queue], t3d
-    cmp   t3d, 8
+    add   t3d, [t0+cb.queue]
+    mov   [t0+cb.range], t4d
+    jge cabac_putbyte
+.update_queue_low:
+    mov   [t0+cb.low], t6d
+    mov   [t0+cb.queue], t3d
+    RET
+
+cglobal cabac_encode_bypass_asm, 0,3
+    movifnidn  t0, r0mp
+    movifnidn t3d, r1m
+    neg       t3d
+    mov       t8d, [t0+cb.low]
+    and       t3d, [t0+cb.range]
+    lea       t8d, [t8*2+t3]
+    mov       t3d, [t0+cb.queue]
+    inc       t3d
+%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
+    jge cabac_putbyte
+%else
+    jge .putbyte
+%endif
+    mov   [t0+cb.low], t8d
+    mov   [t0+cb.queue], t3d
+    RET
+.putbyte:
+    PROLOGUE 0,7
+    movifnidn t6d, t8d
+    jmp cabac_putbyte
+
+cglobal cabac_encode_terminal_asm, 0,3
+    movifnidn  t0, r0mp
+    sub  dword [t0+cb.range], 2
+; shortcut: the renormalization shift in terminal
+; can only be 0 or 1 and is zero over 99% of the time.
+    test dword [t0+cb.range], 0x100
+    je .renorm
+    REP_RET
+.renorm:
+    shl  dword [t0+cb.low], 1
+    shl  dword [t0+cb.range], 1
+    inc  dword [t0+cb.queue]
      jge .putbyte
      REP_RET
  .putbyte:
+    PROLOGUE 0,7
+    mov t3d, [t0+cb.queue]
+    mov t6d, [t0+cb.low]
+    jmp cabac_putbyte
+
+cabac_putbyte:
      ; alive: t0=cb t3=queue t6=low
-    add   t3d, 2
-    mov   t1d, 1
+%ifdef WIN64
+    DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+%endif
+    mov   t1d, -1
+    add   t3d, 10
      mov   t2d, t6d
      shl   t1d, t3b
      shr   t2d, t3b ; out
-    dec   t1d
-    sub   t3d, 10
+    not   t1d
+    sub   t3d, 18
      and   t6d, t1d
+    mov   t5d, [t0+cb.bytes_outstanding]
      cmp   t2b, 0xff ; FIXME is a 32bit op faster?
-    mov   [r0+cb.queue], t3d
-    mov   [r0+cb.low], t6d
-    mov   t1d, t2d
-    mov   t4,  [r0+cb.p]
-    je .postpone
-    mov   t5d, [r0+cb.bytes_outstanding]
-    shr   t1d, 8 ; carry
-    add   [t4-1], t1b
-    test  t5d, t5d
-    jz .no_outstanding
-    dec   t1d
+    jz    .postpone
+    mov   t1,  [t0+cb.p]
+    add   [t1-1], dh ; t2h
+    dec   dh
  .loop_outstanding:
-    mov   [t4], t1b
-    inc   t4
+    mov   [t1], dh
+    inc   t1
      dec   t5d
-    jg .loop_outstanding
-.no_outstanding:
-    mov   [t4], t2b
-    inc   t4
-    mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
-    mov   [r0+cb.p], t4
-    RET
+    jge .loop_outstanding
+    mov   [t1-1], t2b
+    mov   [t0+cb.p], t1
  .postpone:
-    inc   dword [r0+cb.bytes_outstanding]
-    RET
-
+    inc   t5d
+    mov   [t0+cb.bytes_outstanding], t5d
+    jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)