Deduplicate asm constants, automate name prefixing

author Fiona Glaser <fiona@x264.com>
Fri, 30 Apr 2010 18:36:19 +0000 (11:36 -0700)
committer Fiona Glaser <fiona@x264.com>
Thu, 6 May 2010 05:08:14 +0000 (22:08 -0700)
diff --git a/Makefile b/Makefile

index fc5d7d40797a31a5a52cb9d904052a2c4d2b1a79..0b43a3e006bd3dca842fa8bc36fbb5cea09ff948 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -50,8 +50,8 @@ endif
  
  # MMX/SSE optims
  ifneq ($(AS),)
-X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
-          pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
+X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
+          mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
            cpu-a.asm dct-32.asm
  X86SRC = $(X86SRC0:%=common/x86/%)
  
diff --git a/common/common.c b/common/common.c

index 7bc6d6a2f309f732e22416159f1aa453ad21fbed..848c6deb31dfd09a05f339995f5af512542d00e4 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -22,7 +22,6 @@
   *****************************************************************************/
  
  #include "common.h"
-#include "cpu.h"
  
  #include <stdarg.h>
  #include <ctype.h>
diff --git a/common/common.h b/common/common.h

index 2f35244d0e8f5a75c75ff60a89a8273959914c91..91d50301245b1180d6c25412497b3aa80769c726 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -110,6 +110,7 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
  #include "dct.h"
  #include "cabac.h"
  #include "quant.h"
+#include "cpu.h"
  
  /****************************************************************************
   * General functions
diff --git a/common/cpu.c b/common/cpu.c

index db2d4578d66e96217ddcae0e9eca5d2b6e1773a3..d13e76630d8c718485e27a9bdcaaf74678361c77 100644 (file)
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -324,13 +324,6 @@ uint32_t x264_cpu_detect( void )
  
  #endif
  
-#ifndef HAVE_MMX
-void x264_emms( void )
-{
-}
-#endif
-
-
  int x264_cpu_num_processors( void )
  {
  #if !defined(HAVE_PTHREAD)
diff --git a/common/cpu.h b/common/cpu.h

index 6901e1e18c99f8be9e6177b2b147c040a649e65a..3b61f0516ba9f578a510912cd32fa0511416b68e 100644 (file)
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -23,7 +23,14 @@
  
  uint32_t x264_cpu_detect( void );
  int      x264_cpu_num_processors( void );
-void     x264_emms( void );
+void     x264_cpu_emms( void );
+void     x264_cpu_sfence( void );
+#ifdef HAVE_MMX
+#define x264_emms() x264_cpu_emms()
+#else
+#define x264_emms()
+#endif
+#define x264_sfence x264_cpu_sfence
  void     x264_cpu_mask_misalign_sse( void );
  
  /* kluge:
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm

index a04221c7d07a48b885559ec6c0e3074ca3f906aa..9708f2a93a442579e41fe36f55f80b6490576a60 100644 (file)
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -24,13 +24,11 @@
  
  %include "x86inc.asm"
  
-SECTION_RODATA
-
  SECTION .text
  
-cextern x264_cabac_range_lps
-cextern x264_cabac_transition
-cextern x264_cabac_renorm_shift
+cextern cabac_range_lps
+cextern cabac_transition
+cextern cabac_renorm_shift
  
  ; t3 must be ecx, since it's used for shift.
  %ifdef WIN64
@@ -70,7 +68,7 @@ endstruc
  %endif
  %endmacro
  
-cglobal x264_cabac_encode_decision_asm, 0,7
+cglobal cabac_encode_decision_asm, 0,7
      movifnidn t0,  r0mp
      movifnidn t1d, r1m
      mov   t5d, [t0+cb.range]
@@ -78,8 +76,8 @@ cglobal x264_cabac_encode_decision_asm, 0,7
      mov   t3d, t5d
      shr   t5d, 6
      movifnidn t2d, r2m
-    LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4
-    LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2
+    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t6*4
+    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
      shr   t6d, 6
      sub   t3d, t5d
      cmp   t6d, t2d
@@ -88,17 +86,17 @@ cglobal x264_cabac_encode_decision_asm, 0,7
      cmovne t3d, t5d
      cmovne t6d, t7d
      mov   [t0+cb.state+t1], t4b
-;x264_cabac_encode_renorm
+;cabac_encode_renorm
      mov   t4d, t3d
      shr   t3d, 3
-    LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+    LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
      shl   t4d, t3b
      shl   t6d, t3b
      add   t3d, [t0+cb.queue]
      mov   [t0+cb.range], t4d
      cmp   t3d, 8
      jl .update_queue_low
-;x264_cabac_putbyte
+;cabac_putbyte
      ; alive: t0=cb t3=queue t6=low
  %ifdef WIN64
      DECLARE_REG_TMP 3,4,1,0,2,5,6,10
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm

new file mode 100755 (executable)

index 0000000..57a0ae9
--- /dev/null
+++ b/common/x86/const-a.asm
@@ -0,0 +1,54 @@
+;*****************************************************************************
+;* const-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2010 x264 project
+;*
+;* Author: Loren Merritt <lorenm@u.washington.edu>
+;*         Fiona Glaser <fiona@x264.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+; Constants shared between the x86 asm files; users import them with cextern. (const is defined outside this file.)
+const pb_01,       times  8 db 0,1                           ; byte pair 0,1 repeated 8 times
+const pb_0,        times 16 db 0
+const pb_a1,       times 16 db 0xa1
+const pb_1,        times 16 db 1
+const pb_3,        times 16 db 3
+const hsub_mul,    times  8 db 1, -1                         ; byte pair 1,-1 repeated 8 times
+const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+
+const pw_1,        times 8 dw 1
+const pw_2,        times 8 dw 2
+const pw_4,        times 8 dw 4
+const pw_8,        times 8 dw 8
+const pw_16,       times 8 dw 16
+const pw_32,       times 8 dw 32
+const pw_64,       times 8 dw 64
+const pw_32_0,     dw 32,32,32,32, 0,0,0,0                   ; four words of 32, then four words of 0
+const pw_8000,     times 8 dw 0x8000
+const pw_3fff,     times 8 dw 0x3fff
+
+const pd_1,        times 4 dd 1
+const pd_128,      times 4 dd 128
+const pw_00ff,     times 8 dw 0x00ff
+const pw_ff00,     times 8 dw 0xff00
+
+const pb_reverse,  db 7, 6, 5, 4, 3, 2, 1, 0                 ; 8 bytes; every entry above is exactly 16 bytes
+const sw_64,       dd 64                                     ; one dword; keep short entries last (presumably for 16-byte alignment - confirm)
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm

index 285111a9c91417d011ee83e99a6ab290c82f91fe..b4b211b5cc28d609ce18fc62488b65551b73fb70 100644 (file)
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -29,9 +29,9 @@ SECTION .text
  %ifdef ARCH_X86_64
  
  ;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
  ;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid, 5,7
+cglobal cpu_cpuid, 5,7
      push    rbx
      mov     r11,   r1
      mov     r10,   r2
@@ -49,10 +49,10 @@ cglobal x264_cpu_cpuid, 5,7
  %else
  
  ;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid_test( void )
+; int cpu_cpuid_test( void )
  ; return 0 if unsupported
  ;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid_test
+cglobal cpu_cpuid_test
      pushfd
      push    ebx
      push    ebp
@@ -75,9 +75,9 @@ cglobal x264_cpu_cpuid_test
      ret
  
  ;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+; int cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
  ;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid, 0,6
+cglobal cpu_cpuid, 0,6
      mov     eax,    r0m
      cpuid
      mov     esi,    r1m
@@ -91,9 +91,9 @@ cglobal x264_cpu_cpuid, 0,6
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_stack_align( void (*func)(void*), void *arg );
+; void stack_align( void (*func)(void*), void *arg );
  ;-----------------------------------------------------------------------------
-cglobal x264_stack_align
+cglobal stack_align
      push ebp
      mov  ebp, esp
      sub  esp, 8
@@ -110,16 +110,23 @@ cglobal x264_stack_align
  %endif
  
  ;-----------------------------------------------------------------------------
-; void x264_emms( void )
+; void cpu_emms( void )
  ;-----------------------------------------------------------------------------
-cglobal x264_emms
+cglobal cpu_emms
      emms
      ret
  
  ;-----------------------------------------------------------------------------
-; void x264_cpu_mask_misalign_sse(void)
+; void cpu_sfence( void )
  ;-----------------------------------------------------------------------------
-cglobal x264_cpu_mask_misalign_sse
+cglobal cpu_sfence
+    sfence
+    ret
+
+;-----------------------------------------------------------------------------
+; void cpu_mask_misalign_sse( void )
+;-----------------------------------------------------------------------------
+cglobal cpu_mask_misalign_sse
      sub   rsp, 4
      stmxcsr [rsp]
      or dword [rsp], 1<<17
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm

index 3350e40b4dd47eb9d29c2fb28e7b2db09c20b499..14d10c56caa0c9bd10ec5f3a6a62cfe64b1158d2 100644 (file)
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -27,13 +27,11 @@
  %include "x86inc.asm"
  %include "x86util.asm"
  
-SECTION_RODATA
-
-pw_32: times 8 dw 32
-hsub_mul: times 8 db 1, -1
-
  SECTION .text
  
+cextern pw_32
+cextern hsub_mul
+
  ; in: m0..m7
  ; out: 0,4,6 in mem, rest in regs
  %macro DCT8_1D 9
@@ -188,10 +186,10 @@ dct8_mmx:
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
  ;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_mmx, 3,3
-global x264_sub8x8_dct8_mmx.skip_prologue
+cglobal sub8x8_dct8_mmx, 3,3
+global sub8x8_dct8_mmx.skip_prologue
  .skip_prologue:
      INIT_MMX
      call load_diff_4x8_mmx
@@ -254,10 +252,10 @@ idct8_mmx:
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_mmx, 2,2
-global x264_add8x8_idct8_mmx.skip_prologue
+cglobal add8x8_idct8_mmx, 2,2
+global add8x8_idct8_mmx.skip_prologue
  .skip_prologue:
      INIT_MMX
      add word [r1], 32
@@ -344,9 +342,9 @@ global x264_add8x8_idct8_mmx.skip_prologue
  
  INIT_XMM
  %macro DCT_SUB8 1
-cglobal x264_sub8x8_dct_%1, 3,3
+cglobal sub8x8_dct_%1, 3,3
      add r2, 4*FDEC_STRIDE
-global x264_sub8x8_dct_%1.skip_prologue
+global sub8x8_dct_%1.skip_prologue
  .skip_prologue:
  %ifnidn %1, sse2
      mova m7, [hsub_mul]
@@ -375,11 +373,11 @@ global x264_sub8x8_dct_%1.skip_prologue
      ret
  
  ;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
  ;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_%1, 3,3
+cglobal sub8x8_dct8_%1, 3,3
      add r2, 4*FDEC_STRIDE
-global x264_sub8x8_dct8_%1.skip_prologue
+global sub8x8_dct8_%1.skip_prologue
  .skip_prologue:
  %ifidn %1, sse2
      LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
@@ -419,11 +417,11 @@ DCT_SUB8 sse2
  DCT_SUB8 ssse3
  
  ;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct_sse2, 2,2
+cglobal add8x8_idct_sse2, 2,2
      add r0, 4*FDEC_STRIDE
-global x264_add8x8_idct_sse2.skip_prologue
+global add8x8_idct_sse2.skip_prologue
  .skip_prologue:
      UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
      SBUTTERFLY qdq, 0, 1, 4
@@ -456,11 +454,11 @@ global x264_add8x8_idct_sse2.skip_prologue
      ret
  
  ;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2
+cglobal add8x8_idct8_sse2, 2,2
      add r0, 4*FDEC_STRIDE
-global x264_add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_sse2.skip_prologue
  .skip_prologue:
      UNSPILL r1, 1,2,3,5,6,7
      IDCT8_1D   0,1,2,3,4,5,6,7,r1
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm

index ba7741eff8437206ae7a92cf504e56226ab48d8b..cae196373a70b6d0f3da8276a99635df5cc958fc 100644 (file)
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -26,11 +26,10 @@
  %include "x86inc.asm"
  %include "x86util.asm"
  
-SECTION_RODATA
-pw_32: times 8 dw 32
-hsub_mul: times 8 db 1, -1
-
  SECTION .text
+
+cextern pw_32
+cextern hsub_mul
  INIT_XMM
  
  %macro DCT8_1D 10
@@ -140,7 +139,7 @@ INIT_XMM
  %endmacro
  
  %macro DCT_SUB8 1
-cglobal x264_sub8x8_dct_%1, 3,3,11
+cglobal sub8x8_dct_%1, 3,3,11
      add r2, 4*FDEC_STRIDE
  %ifnidn %1, sse2
      mova m7, [hsub_mul]
@@ -149,7 +148,7 @@ cglobal x264_sub8x8_dct_%1, 3,3,11
      call .skip_prologue
      RET
  %endif
-global x264_sub8x8_dct_%1.skip_prologue
+global sub8x8_dct_%1.skip_prologue
  .skip_prologue:
      SWAP 7, 9
      LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
@@ -165,9 +164,9 @@ global x264_sub8x8_dct_%1.skip_prologue
      ret
  
  ;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
  ;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_%1, 3,3,11
+cglobal sub8x8_dct8_%1, 3,3,11
      add r2, 4*FDEC_STRIDE
  %ifnidn %1, sse2
      mova m7, [hsub_mul]
@@ -176,7 +175,7 @@ cglobal x264_sub8x8_dct8_%1, 3,3,11
      call .skip_prologue
      RET
  %endif
-global x264_sub8x8_dct8_%1.skip_prologue
+global sub8x8_dct8_%1.skip_prologue
  .skip_prologue:
      SWAP 7, 10
      LOAD_DIFF8x4  0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
@@ -205,16 +204,16 @@ DCT_SUB8 sse2
  DCT_SUB8 ssse3
  
  ;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2, 2,2,11
+cglobal add8x8_idct8_sse2, 2,2,11
      add r0, 4*FDEC_STRIDE
      pxor m7, m7
  %ifdef WIN64
      call .skip_prologue
      RET
  %endif
-global x264_add8x8_idct8_sse2.skip_prologue
+global add8x8_idct8_sse2.skip_prologue
  .skip_prologue:
      SWAP 7, 9
      movdqa  m0, [r1+0x00]
@@ -237,16 +236,16 @@ global x264_add8x8_idct8_sse2.skip_prologue
      ret
  
  ;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct_sse2, 2,2,11
+cglobal add8x8_idct_sse2, 2,2,11
      add  r0, 4*FDEC_STRIDE
      pxor m7, m7
  %ifdef WIN64
      call .skip_prologue
      RET
  %endif
-global x264_add8x8_idct_sse2.skip_prologue
+global add8x8_idct_sse2.skip_prologue
  .skip_prologue:
      SWAP 7, 9
      mova   m0, [r1+ 0]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm

index 5dd51e5a6d3760ead12df22fa524f7dd64731993..2182f3fd77a0d8d0898ac28fa021c757478d0628 100644 (file)
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -35,12 +35,6 @@
  %endmacro
  
  SECTION_RODATA
-pw_32_0: times 4 dw 32
-         times 4 dw 0
-pw_32: times 8 dw 32
-pw_8000: times 8 dw 0x8000
-hsub_mul: times 8 db 1, -1
-
  pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
  pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
  pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
@@ -48,11 +42,16 @@ pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
  pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
  pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
  pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
  
  SECTION .text
  
+cextern pw_32_0
+cextern pw_32
+cextern pw_8000
+cextern hsub_mul
+cextern pb_1
+cextern pw_1
+
  %macro WALSH4_1D 5
      SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
      SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
@@ -73,9 +72,9 @@ SECTION .text
  
  INIT_MMX
  ;-----------------------------------------------------------------------------
-; void x264_dct4x4dc_mmx( int16_t d[4][4] )
+; void dct4x4dc( int16_t d[4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1
+cglobal dct4x4dc_mmx, 1,1
      movq   m3, [r0+24]
      movq   m2, [r0+16]
      movq   m1, [r0+ 8]
@@ -95,9 +94,9 @@ cglobal x264_dct4x4dc_mmx, 1,1
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_idct4x4dc_mmx( int16_t d[4][4] )
+; void idct4x4dc( int16_t d[4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx, 1,1
+cglobal idct4x4dc_mmx, 1,1
      movq   m3, [r0+24]
      movq   m2, [r0+16]
      movq   m1, [r0+ 8]
@@ -113,9 +112,9 @@ cglobal x264_idct4x4dc_mmx, 1,1
  
  %macro SUB_DCT4 1
  ;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
  ;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_%1, 3,3
+cglobal sub4x4_dct_%1, 3,3
  %ifidn %1, mmx
  .skip_prologue:
      LOAD_DIFF  m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
@@ -140,9 +139,9 @@ SUB_DCT4 mmx
  SUB_DCT4 ssse3
  
  ;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
+; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2
+cglobal add4x4_idct_mmx, 2,2
      pxor m7, m7
  .skip_prologue:
      movq  m1, [r1+ 8]
@@ -160,7 +159,7 @@ cglobal x264_add4x4_idct_mmx, 2,2
      RET
  
  INIT_XMM
-cglobal x264_add4x4_idct_sse4, 2,2,6
+cglobal add4x4_idct_sse4, 2,2,6
      mova      m0, [r1+0x00]     ; row1/row0
      mova      m2, [r1+0x10]     ; row3/row2
      mova      m1, m0            ; row1/row0
@@ -213,7 +212,7 @@ cglobal x264_add4x4_idct_sse4, 2,2,6
  
  INIT_MMX
  ;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
  ;-----------------------------------------------------------------------------
  %macro SUB_NxN_DCT 6
  cglobal %1, 3,3,11
@@ -249,7 +248,7 @@ cglobal %1, 3,3,11
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
  ;-----------------------------------------------------------------------------
  %macro ADD_NxN_IDCT 6-7
  cglobal %1, 2,2,11
@@ -280,33 +279,33 @@ cglobal %1, 2,2,11
  %endmacro
  
  %ifndef ARCH_X86_64
-SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx.skip_prologue,  32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx.skip_prologue,  32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
-
-cextern x264_sub8x8_dct8_mmx.skip_prologue
-cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx.skip_prologue,  128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT  sub8x8_dct_mmx,    sub4x4_dct_mmx.skip_prologue,  32, 4, 0, 0
+ADD_NxN_IDCT add8x8_idct_mmx,   add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT  sub16x16_dct_mmx,  sub8x8_dct_mmx.skip_prologue,  32, 8, 4, 4
+ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
+
+cextern sub8x8_dct8_mmx.skip_prologue
+cextern add8x8_idct8_mmx.skip_prologue
+SUB_NxN_DCT  sub16x16_dct8_mmx,  sub8x8_dct8_mmx.skip_prologue,  128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
  %endif
  
  INIT_XMM
  
-cextern x264_sub8x8_dct_sse2.skip_prologue
-cextern x264_sub8x8_dct_ssse3.skip_prologue
-SUB_NxN_DCT  x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT  x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
-cextern x264_add8x8_idct_sse2.skip_prologue
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
+cextern sub8x8_dct_sse2.skip_prologue
+cextern sub8x8_dct_ssse3.skip_prologue
+SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2.skip_prologue,  128, 8, 0, 0
+SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+cextern add8x8_idct_sse2.skip_prologue
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
  
-cextern x264_sub8x8_dct8_sse2.skip_prologue
-cextern x264_add8x8_idct8_sse2.skip_prologue
-SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2.skip_prologue,  128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern add8x8_idct8_sse2.skip_prologue
+SUB_NxN_DCT  sub16x16_dct8_sse2,  sub8x8_dct8_sse2.skip_prologue,  128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
  
-cextern x264_sub8x8_dct8_ssse3.skip_prologue
-SUB_NxN_DCT  x264_sub16x16_dct8_ssse3,  x264_sub8x8_dct8_ssse3.skip_prologue,  128, 8, 0, 0
+cextern sub8x8_dct8_ssse3.skip_prologue
+SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
  
  
  ;-----------------------------------------------------------------------------
@@ -331,7 +330,7 @@ SUB_NxN_DCT  x264_sub16x16_dct8_ssse3,  x264_sub8x8_dct8_ssse3.skip_prologue,  1
      movq      [%3+FDEC_STRIDE*3], %1
  %endmacro
  
-cglobal x264_add8x8_idct_dc_mmx, 2,2
+cglobal add8x8_idct_dc_mmx, 2,2
      movq      mm0, [r1]
      pxor      mm1, mm1
      add        r0, FDEC_STRIDE*4
@@ -350,7 +349,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
      ADD_DC    mm2, mm3, r0
      RET
  
-cglobal x264_add8x8_idct_dc_ssse3, 2,2
+cglobal add8x8_idct_dc_ssse3, 2,2
      movq      xmm0, [r1]
      pxor      xmm1, xmm1
      add         r0, FDEC_STRIDE*4
@@ -388,7 +387,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
      movhps    [r0+FDEC_STRIDE* 3], xmm5
      RET
  
-cglobal x264_add16x16_idct_dc_mmx, 2,3
+cglobal add16x16_idct_dc_mmx, 2,3
      mov       r2, 4
  .loop:
      movq      mm0, [r1]
@@ -431,7 +430,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
      movdqa    [r0+%1+FDEC_STRIDE*3], xmm7
  %endmacro
  
-cglobal x264_add16x16_idct_dc_sse2, 2,2,8
+cglobal add16x16_idct_dc_sse2, 2,2,8
      call .loop
      add       r0, FDEC_STRIDE*4
  %ifdef WIN64
@@ -465,7 +464,7 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2,8
      IDCT_DC_STORE 0, xmm2, xmm3
      ret
  
-cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
+cglobal add16x16_idct_dc_ssse3, 2,2,8
      call .loop
      add       r0, FDEC_STRIDE*4
  %ifdef WIN64
@@ -531,7 +530,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
  %endmacro
  
  INIT_MMX
-cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+cglobal sub8x8_dct_dc_mmxext, 3,3
      DCTDC_2ROW_MMX m0, m4, 0
      DCTDC_2ROW_MMX m5, m6, 2
      paddw     m0, m5
@@ -567,7 +566,7 @@ INIT_XMM
  %endif
  %endmacro
  
-cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+cglobal sub8x8_dct_dc_sse2, 3,3,8
      pxor     m7, m7
      DCTDC_2ROW_SSE2 0, 0, m4
      DCTDC_2ROW_SSE2 2, 1, m4
@@ -586,10 +585,10 @@ cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
  %macro SCAN_8x8 1
-cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
+cglobal zigzag_scan_8x8_frame_%1, 2,2,8
      movdqa    xmm0, [r1]
      movdqa    xmm1, [r1+16]
      movdq2q    mm0, xmm0
@@ -703,9 +702,9 @@ SCAN_8x8 sse2
  SCAN_8x8 ssse3
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
+cglobal zigzag_scan_8x8_frame_mmxext, 2,2
      movq       mm0, [r1]
      movq       mm1, [r1+2*8]
      movq       mm2, [r1+2*14]
@@ -798,9 +797,9 @@ cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
+cglobal zigzag_scan_4x4_frame_mmx, 2,2
      movq       mm0, [r1]
      movq       mm1, [r1+8]
      movq       mm2, [r1+16]
@@ -828,9 +827,9 @@ cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
+cglobal zigzag_scan_4x4_frame_ssse3, 2,2
      movdqa    xmm1, [r1+16]
      movdqa    xmm0, [r1]
      pshufb    xmm1, [pb_scan4frameb]
@@ -845,10 +844,10 @@ cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
  ;-----------------------------------------------------------------------------
  ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmxext, 2,3
      pshufw     mm0, [r1+4], 0xd2
      movq       mm1, [r1+16]
      movq       mm2, [r1+24]
@@ -862,7 +861,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
  
  ; Output order:
@@ -875,7 +874,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
  ; 45 46 47 51 56 57 52 53
  ; 54 55 58 59 60 61 62 63
  
-cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
+cglobal zigzag_scan_8x8_field_mmxext, 2,3
      movq       mm0, [r1+2*0]        ; 03 02 01 00
      movq       mm1, [r1+2*4]        ; 07 06 05 04
      movq       mm2, [r1+2*8]        ; 11 10 09 08
@@ -954,13 +953,13 @@ cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
  ;-----------------------------------------------------------------------------
  %macro ZIGZAG_SUB_4x4 2
  %ifidn %1, ac
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
  %else
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
  %endif
      movd      xmm0, [r1+0*FENC_STRIDE]
      movd      xmm1, [r1+1*FENC_STRIDE]
@@ -1020,7 +1019,7 @@ ZIGZAG_SUB_4x4   , field
  ZIGZAG_SUB_4x4 ac, field
  
  ;-----------------------------------------------------------------------------
-; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
  ;-----------------------------------------------------------------------------
  
  %macro INTERLEAVE 1
@@ -1047,7 +1046,7 @@ ZIGZAG_SUB_4x4 ac, field
  %endmacro
  
  INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
+cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
      INTERLEAVE  0
      INTERLEAVE  8
      INTERLEAVE 16
@@ -1095,7 +1094,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
  %endmacro
  
  INIT_XMM
-cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
      INTERLEAVE_XMM  0
      INTERLEAVE_XMM 16
      packsswb m2, m3
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm

index 00d041888ad1d698b9955b5ff94cb6763f57bd10..53b57f659a75048ef5c636d9f77196cc21ce7d91 100644 (file)
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -22,14 +22,13 @@
  
  %include "x86inc.asm"
  
-SECTION_RODATA
-pb_00: times 16 db 0x00
-pb_01: times 16 db 0x01
-pb_03: times 16 db 0x03
-pb_a1: times 16 db 0xa1
-
  SECTION .text
  
+cextern pb_0
+cextern pb_1
+cextern pb_3
+cextern pb_a1
+
  ; expands to [base],...,[base+7*stride]
  %define PASS8ROWS(base, base3, stride, stride3) \
      [base], [base+stride], [base+stride*2], [base3], \
@@ -234,11 +233,11 @@ SECTION .text
  %macro DEBLOCK_P0_Q0 0
      mova    m5, m1
      pxor    m5, m2       ; p0^q0
-    pand    m5, [pb_01]  ; (p0^q0)&1
+    pand    m5, [pb_1]   ; (p0^q0)&1
      pcmpeqb m4, m4
      pxor    m3, m4
      pavgb   m3, m0       ; (p1 - q1 + 256)>>1
-    pavgb   m3, [pb_03]  ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
      pxor    m4, m1
      pavgb   m4, m2       ; (q0 - p0 + 256)>>1
      pavgb   m3, m5
@@ -263,7 +262,7 @@ SECTION .text
      pavgb   %6, m2
      pavgb   %2, %6       ; avg(p2,avg(p0,q0))
      pxor    %6, %3
-    pand    %6, [pb_01]  ; (p2^avg(p0,q0))&1
+    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
      psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
      mova    %6, %1
      psubusb %6, %5
@@ -275,10 +274,10 @@ SECTION .text
  
  %ifdef ARCH_X86_64
  ;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  ;-----------------------------------------------------------------------------
  INIT_XMM
-cglobal x264_deblock_v_luma_sse2, 5,5,10
+cglobal deblock_v_luma_sse2, 5,5,10
      movd    m8, [r4] ; tc0
      lea     r4, [r1*3]
      dec     r2d        ; alpha-1
@@ -321,10 +320,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  ;-----------------------------------------------------------------------------
  INIT_MMX
-cglobal x264_deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_sse2, 5,7
      movsxd r10, r1d
      lea    r11, [r10+r10*2]
      lea    r6,  [r0-4]
@@ -345,13 +344,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7
  
      ; vertical filter
      ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
      lea    r0, [pix_tmp+0x30]
      mov    r1d, 0x10
  %ifdef WIN64
      mov    [rsp+0x20], r4
  %endif
-    call   x264_deblock_v_luma_sse2
+    call   deblock_v_luma_sse2
  
      ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
      add    r6, 2
@@ -383,9 +382,9 @@ cglobal x264_deblock_h_luma_sse2, 5,7
  
  %macro DEBLOCK_LUMA 3
  ;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_%1, 5,5
      lea     r4, [r1*3]
      dec     r2     ; alpha-1
      neg     r4
@@ -436,10 +435,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  ;-----------------------------------------------------------------------------
  INIT_MMX
-cglobal x264_deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_%1, 0,5
      mov    r0, r0mp
      mov    r3, r1m
      lea    r4, [r3*3]
@@ -462,11 +461,11 @@ cglobal x264_deblock_h_luma_%1, 0,5
      PUSH   dword r2m
      PUSH   dword 16
      PUSH   dword r0
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
  %ifidn %2, v8
      add    dword [esp   ], 8 ; pix_tmp+0x38
      add    dword [esp+16], 2 ; tc0+2
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
  %endif
      ADD    esp, 20
  
@@ -517,9 +516,9 @@ DEBLOCK_LUMA sse2, v, 16
      mova  t3, t2
      mova  t4, t2
      psrlw t2, 1
-    pavgb t2, mpb_00
+    pavgb t2, mpb_0
      pxor  t2, t0
-    pand  t2, mpb_01
+    pand  t2, mpb_1
      psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
  
      mova  t1, p2
@@ -528,21 +527,21 @@ DEBLOCK_LUMA sse2, v, 16
      psubb t2, q1
      paddb t3, t3
      psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
-    pand  t2, mpb_01
+    pand  t2, mpb_1
      psubb t1, t2
      pavgb t1, p1
      pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
      psrlw t3, 2
-    pavgb t3, mpb_00
+    pavgb t3, mpb_0
      pxor  t3, t1
-    pand  t3, mpb_01
+    pand  t3, mpb_1
      psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
  
      mova  t3, p0
      mova  t2, p0
      pxor  t3, q1
      pavgb t2, q1
-    pand  t3, mpb_01
+    pand  t3, mpb_1
      psubb t2, t3
      pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
  
@@ -562,9 +561,9 @@ DEBLOCK_LUMA sse2, v, 16
      paddb t2, t2
      paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
      psrlw t2, 2
-    pavgb t2, mpb_00
+    pavgb t2, mpb_0
      pxor  t2, t1
-    pand  t2, mpb_01
+    pand  t2, mpb_1
      psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
  
      pxor  t0, p1
@@ -603,8 +602,8 @@ DEBLOCK_LUMA sse2, v, 16
      %define mask0 m12
      %define mask1p m13
      %define mask1q [rsp-24]
-    %define mpb_00 m14
-    %define mpb_01 m15
+    %define mpb_0 m14
+    %define mpb_1 m15
  %else
      %define spill(x) [esp+16*x+((stack_offset+4)&15)]
      %define p2 [r4+r1]
@@ -614,14 +613,14 @@ DEBLOCK_LUMA sse2, v, 16
      %define mask0 spill(2)
      %define mask1p spill(3)
      %define mask1q spill(4)
-    %define mpb_00 [pb_00]
-    %define mpb_01 [pb_01]
+    %define mpb_0 [pb_0]
+    %define mpb_1 [pb_1]
  %endif
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_%1, 4,6,16
  %ifndef ARCH_X86_64
      sub     esp, 0x60
  %endif
@@ -638,12 +637,12 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
      mova    q0, [r0]
      mova    q1, [r0+r1]
  %ifdef ARCH_X86_64
-    pxor    mpb_00, mpb_00
-    mova    mpb_01, [pb_01]
+    pxor    mpb_0, mpb_0
+    mova    mpb_1, [pb_1]
      LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
      SWAP    7, 12 ; m12=mask0
-    pavgb   t5, mpb_00
-    pavgb   t5, mpb_01 ; alpha/4+1
+    pavgb   t5, mpb_0
+    pavgb   t5, mpb_1 ; alpha/4+1
      movdqa  p2, [r4+r1]
      movdqa  q2, [r0+2*r1]
      DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
@@ -658,8 +657,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
      LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
      mova    m4, t5
      mova    mask0, m7
-    pavgb   m4, [pb_00]
-    pavgb   m4, [pb_01] ; alpha/4+1
+    pavgb   m4, [pb_0]
+    pavgb   m4, [pb_1] ; alpha/4+1
      DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
      pand    m6, mask0
      DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -681,9 +680,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
  INIT_MMX
  %ifdef ARCH_X86_64
  ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_%1, 4,7
      movsxd r10, r1d
      lea    r11, [r10*3]
      lea    r6,  [r0-4]
@@ -699,7 +698,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
  
      lea    r0,  [pix_tmp+0x40]
      mov    r1,  0x10
-    call   x264_deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_%1
  
      ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
      lea    r5, [r6+r11]
@@ -712,7 +711,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
      add    rsp, 0x88
      RET
  %else
-cglobal x264_deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_%1, 2,4
      lea    r3,  [r1*3]
      sub    r0,  4
      lea    r2,  [r0+r3]
@@ -731,10 +730,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
      PUSH   dword r2m
      PUSH   dword 16
      PUSH   r0
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
  %ifidn %2, v8
      add    dword [rsp], 8 ; pix_tmp+8
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
  %endif
      ADD    esp, 16
  
@@ -785,9 +784,9 @@ INIT_MMX
  %define t6 r6
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_mmxext, 5,6
      CHROMA_V_START
      movq  m0, [t5]
      movq  m1, [t5+r1]
@@ -799,9 +798,9 @@ cglobal x264_deblock_v_chroma_mmxext, 5,6
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_mmxext, 5,7
  %ifdef ARCH_X86_64
      %define buf0 [rsp-24]
      %define buf1 [rsp-16]
@@ -835,7 +834,7 @@ chroma_inter_body_mmxext:
  %macro CHROMA_INTRA_P0 3
      movq    m4, %1
      pxor    m4, %3
-    pand    m4, [pb_01] ; m4 = (p0^q1)&1
+    pand    m4, [pb_1] ; m4 = (p0^q1)&1
      pavgb   %1, %3
      psubusb %1, m4
      pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
@@ -845,9 +844,9 @@ chroma_inter_body_mmxext:
  %define t6 r5
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
      CHROMA_V_START
      movq  m0, [t5]
      movq  m1, [t5+r1]
@@ -859,9 +858,9 @@ cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
  ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
      CHROMA_H_START
      TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
      call chroma_intra_body_mmxext
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm

index 5faf33ca12aff67e461f4d3c0b6cbcaf6cd8e847..2fb720b06aa211ae1bd1325abc0066b3f05c3076 100644 (file)
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -29,15 +29,16 @@
  SECTION_RODATA 32
  
  ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
-pw_1:  times 8 dw  1
-pw_4:  times 8 dw  4
-pw_8:  times 8 dw  8
-pw_32: times 8 dw 32
-pw_64: times 8 dw 64
-sw_64: dd 64
  
  SECTION .text
  
+cextern pw_1
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+cextern sw_64
+
  ;=============================================================================
  ; implicit weighted biprediction
  ;=============================================================================
@@ -129,10 +130,10 @@ SECTION .text
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
+; int pixel_avg_weight_w16( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
  ;-----------------------------------------------------------------------------
  %macro AVG_WEIGHT 2-3 0
-cglobal x264_pixel_avg_weight_w%2_%1
+cglobal pixel_avg_weight_w%2_%1
      BIWEIGHT_START
      AVG_START %3
  %if %2==8 && mmsize==16
@@ -165,7 +166,7 @@ AVG_WEIGHT mmxext, 4
  AVG_WEIGHT mmxext, 8
  AVG_WEIGHT mmxext, 16
  INIT_XMM
-%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
+%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext
  AVG_WEIGHT sse2, 8,  7
  AVG_WEIGHT sse2, 16, 7
  %define BIWEIGHT BIWEIGHT_SSSE3
@@ -293,8 +294,9 @@ AVG_WEIGHT ssse3, 16, 7
  %endrep
  %endmacro
  
-
-;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src,int i_src_stride, x264_weight_t *weight,int h)
+;-----------------------------------------------------------------------------
+;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, x264_weight_t *weight, int h )
+;-----------------------------------------------------------------------------
  
  %ifdef ARCH_X86_64
  %define NUMREGS 6
@@ -307,7 +309,7 @@ AVG_WEIGHT ssse3, 16, 7
  %endif
  
  %macro WEIGHTER 2
-    cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+    cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
      WEIGHT_START %1
      LOAD_HEIGHT
  .loop:
@@ -363,9 +365,11 @@ WEIGHTER 20, ssse3
  %endrep
  %endmacro
  
-;void x264_mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, x264_weight_t *w, int h )
+;-----------------------------------------------------------------------------
+;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, x264_weight_t *w, int h )
+;-----------------------------------------------------------------------------
  %macro OFFSET 3
-    cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+    cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
      mova m2, [r4]
      LOAD_HEIGHT
  .loop:
@@ -402,25 +406,25 @@ OFFSETPN 20, sse2
  ;=============================================================================
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
-;                                 uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
+; void pixel_avg_4x4( uint8_t *dst, int dst_stride,
+;                     uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
  ;-----------------------------------------------------------------------------
  %macro AVGH 3
-cglobal x264_pixel_avg_%1x%2_%3
+cglobal pixel_avg_%1x%2_%3
      mov eax, %2
      cmp dword r6m, 32
-    jne x264_pixel_avg_weight_w%1_%3
+    jne pixel_avg_weight_w%1_%3
  %if mmsize == 16 && %1 == 16
      test dword r4m, 15
-    jz x264_pixel_avg_w%1_sse2
+    jz pixel_avg_w%1_sse2
  %endif
-    jmp x264_pixel_avg_w%1_mmxext
+    jmp pixel_avg_w%1_mmxext
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
-;                                uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
-;                                int height, int weight );
+; void pixel_avg_w4( uint8_t *dst, int dst_stride,
+;                    uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
+;                    int height, int weight );
  ;-----------------------------------------------------------------------------
  
  %macro AVG_END 0
@@ -445,17 +449,17 @@ cglobal %1
  %endmacro
  
  INIT_MMX
-AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
+AVG_FUNC pixel_avg_w4_mmxext, movd, movd
  AVGH 4, 8, mmxext
  AVGH 4, 4, mmxext
  AVGH 4, 2, mmxext
  
-AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
+AVG_FUNC pixel_avg_w8_mmxext, movq, movq
  AVGH 8, 16, mmxext
  AVGH 8, 8,  mmxext
  AVGH 8, 4,  mmxext
  
-cglobal x264_pixel_avg_w16_mmxext
+cglobal pixel_avg_w16_mmxext
      AVG_START
      movq   mm0, [t2  ]
      movq   mm1, [t2+8]
@@ -475,7 +479,7 @@ AVGH 16, 16, mmxext
  AVGH 16, 8,  mmxext
  
  INIT_XMM
-AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
+AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa
  AVGH 16, 16, sse2
  AVGH 16,  8, sse2
  AVGH  8, 16, sse2
@@ -498,12 +502,12 @@ AVGH  4,  2, ssse3
  ;=============================================================================
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
-;                                 uint8_t *src1, int src_stride,
-;                                 uint8_t *src2, int height );
+; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
+;                     uint8_t *src1, int src_stride,
+;                     uint8_t *src2, int height );
  ;-----------------------------------------------------------------------------
  %macro AVG2_W8 2
-cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmxext, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
@@ -524,7 +528,7 @@ AVG2_W8 4, movd
  AVG2_W8 8, movq
  
  %macro AVG2_W16 2
-cglobal x264_pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmxext, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
@@ -550,7 +554,7 @@ cglobal x264_pixel_avg2_w%1_mmxext, 6,7
  AVG2_W16 12, movd
  AVG2_W16 16, movq
  
-cglobal x264_pixel_avg2_w20_mmxext, 6,7
+cglobal pixel_avg2_w20_mmxext, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
@@ -578,7 +582,7 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7
      jg     .height_loop
      REP_RET
  
-cglobal x264_pixel_avg2_w16_sse2, 6,7
+cglobal pixel_avg2_w16_sse2, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
@@ -597,7 +601,7 @@ cglobal x264_pixel_avg2_w16_sse2, 6,7
      REP_RET
  
  %macro AVG2_W20 1
-cglobal x264_pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_%1, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
@@ -647,16 +651,16 @@ AVG2_W20 sse2_misalign
  %endmacro
  
  %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
-cglobal x264_pixel_avg2_w%1_cache%2_%3
+cglobal pixel_avg2_w%1_cache%2_%3
      mov    eax, r2m
      and    eax, 0x1f|(%2>>1)
      cmp    eax, (32-%1)|(%2>>1)
-    jle x264_pixel_avg2_w%1_%3
+    jle pixel_avg2_w%1_%3
  ;w12 isn't needed because w16 is just as fast if there's no cacheline split
  %if %1 == 12
-    jmp x264_pixel_avg2_w16_cache_mmxext
+    jmp pixel_avg2_w16_cache_mmxext
  %else
-    jmp x264_pixel_avg2_w%1_cache_mmxext
+    jmp pixel_avg2_w%1_cache_mmxext
  %endif
  %endmacro
  
@@ -687,7 +691,7 @@ cglobal x264_pixel_avg2_w%1_cache%2_%3
      %2 [r0+%1], mm0
  %endmacro
  
-x264_pixel_avg2_w8_cache_mmxext:
+pixel_avg2_w8_cache_mmxext:
      AVG_CACHELINE_START
      AVG_CACHELINE_LOOP 0, movq
      add    r2, r3
@@ -696,7 +700,7 @@ x264_pixel_avg2_w8_cache_mmxext:
      jg     .height_loop
      REP_RET
  
-x264_pixel_avg2_w16_cache_mmxext:
+pixel_avg2_w16_cache_mmxext:
      AVG_CACHELINE_START
      AVG_CACHELINE_LOOP 0, movq
      AVG_CACHELINE_LOOP 8, movq
@@ -706,7 +710,7 @@ x264_pixel_avg2_w16_cache_mmxext:
      jg .height_loop
      REP_RET
  
-x264_pixel_avg2_w20_cache_mmxext:
+pixel_avg2_w20_cache_mmxext:
      AVG_CACHELINE_START
      AVG_CACHELINE_LOOP 0, movq
      AVG_CACHELINE_LOOP 8, movq
@@ -754,11 +758,11 @@ avg_w16_align%1_%2_ssse3:
      rep ret
  %endmacro
  
-cglobal x264_pixel_avg2_w16_cache64_ssse3
+cglobal pixel_avg2_w16_cache64_ssse3
      mov    eax, r2m
      and    eax, 0x3f
      cmp    eax, 0x30
-    jle x264_pixel_avg2_w16_sse2
+    jle pixel_avg2_w16_sse2
      PROLOGUE 6,7
      lea    r6, [r4+r2]
      and    r4, ~0xf
@@ -807,10 +811,10 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
  
  INIT_MMX
  ;-----------------------------------------------------------------------------
-; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
-;                           uint8_t *src, int i_src_stride, int i_height )
+; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
+;                  uint8_t *src, int i_src_stride, int i_height )
  ;-----------------------------------------------------------------------------
-cglobal x264_mc_copy_w4_mmx, 4,6
+cglobal mc_copy_w4_mmx, 4,6
      cmp     dword r4m, 4
      lea     r5, [r3*3]
      lea     r4, [r1*3]
@@ -822,7 +826,7 @@ cglobal x264_mc_copy_w4_mmx, 4,6
      COPY4 movd, movd, r4, r5
      RET
  
-cglobal x264_mc_copy_w8_mmx, 5,7
+cglobal mc_copy_w8_mmx, 5,7
      lea     r6, [r3*3]
      lea     r5, [r1*3]
  .height_loop:
@@ -833,7 +837,7 @@ cglobal x264_mc_copy_w8_mmx, 5,7
      jg      .height_loop
      REP_RET
  
-cglobal x264_mc_copy_w16_mmx, 5,7
+cglobal mc_copy_w16_mmx, 5,7
      lea     r6, [r3*3]
      lea     r5, [r1*3]
  .height_loop:
@@ -873,11 +877,11 @@ cglobal %1, 5,7
      REP_RET
  %endmacro
  
-COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
  ; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
  ; but with SSE3 the overhead is zero, so there's no reason not to include it.
-COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
-COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
+COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
+COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
  
  
  
@@ -887,11 +891,11 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
  ; FIXME assumes 64 byte cachelines
  
  ;-----------------------------------------------------------------------------
-; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
-;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( uint8_t *pix_y, int stride_y,
+;                     uint8_t *pix_uv, int stride_uv, int mb_x )
  ;-----------------------------------------------------------------------------
  %ifdef ARCH_X86_64
-cglobal x264_prefetch_fenc_mmxext, 5,5
+cglobal prefetch_fenc_mmxext, 5,5
      mov    eax, r4d
      and    eax, 3
      imul   eax, r1d
@@ -910,7 +914,7 @@ cglobal x264_prefetch_fenc_mmxext, 5,5
      RET
  
  %else
-cglobal x264_prefetch_fenc_mmxext
+cglobal prefetch_fenc_mmxext
      mov    r2, [esp+20]
      mov    r1, [esp+8]
      mov    r0, [esp+4]
@@ -935,9 +939,9 @@ cglobal x264_prefetch_fenc_mmxext
  %endif ; ARCH_X86_64
  
  ;-----------------------------------------------------------------------------
-; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( uint8_t *pix, int stride, int parity )
  ;-----------------------------------------------------------------------------
-cglobal x264_prefetch_ref_mmxext, 3,3
+cglobal prefetch_ref_mmxext, 3,3
      dec    r2d
      and    r2d, r1d
      lea    r0,  [r0+r2*8+64]
@@ -982,16 +986,16 @@ cglobal x264_prefetch_ref_mmxext, 3,3
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
-;                             uint8_t *src, int src_stride,
-;                             int dx, int dy,
-;                             int width, int height )
+; void mc_chroma( uint8_t *dst, int dst_stride,
+;                 uint8_t *src, int src_stride,
+;                 int dx, int dy,
+;                 int width, int height )
  ;-----------------------------------------------------------------------------
  %macro MC_CHROMA 1-2 0
-cglobal x264_mc_chroma_%1
+cglobal mc_chroma_%1
  %if mmsize == 16
      cmp dword r6m, 4
-    jle x264_mc_chroma_mmxext
+    jle mc_chroma_mmxext
  %endif
      PROLOGUE 0,6,%2
      MC_CHROMA_START
@@ -1151,7 +1155,7 @@ MC_CHROMA sse2, 8
  
  %macro MC_CHROMA_SSSE3 2
  INIT_MMX
-cglobal x264_mc_chroma_ssse3%1, 0,6,%2
+cglobal mc_chroma_ssse3%1, 0,6,%2
      MC_CHROMA_START
      and       r4d, 7
      and       r5d, 7
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index d372761402d9c8edbec3164f22fe5941ccbeafaa..c4faf75a315b23896250e44bfd352374093dfed7 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -33,14 +33,14 @@ filt_mul15: times 8 db 1, -5
  filt_mul51: times 8 db -5, 1
  hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
  
-pw_1:  times 8 dw 1
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-pd_128: times 4 dd 128
-pw_0x3fff: times 4 dw 0x3fff
-
  SECTION .text
  
+cextern pw_1
+cextern pw_16
+cextern pw_32
+cextern pd_128
+cextern pw_3fff
+
  %macro LOAD_ADD 4
      movh       %4, %3
      movh       %1, %2
@@ -122,9 +122,9 @@ INIT_MMX
  
  %macro HPEL_V 1-2 0
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6,%2
+cglobal hpel_filter_v_%1, 5,6,%2
  %ifdef WIN64
      movsxd   r4, r4d
  %endif
@@ -181,9 +181,9 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
  HPEL_V mmxext
  
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_mmxext, 3,3
+cglobal hpel_filter_c_mmxext, 3,3
      add r0, r2
      lea r1, [r1+r2*2]
      neg r2
@@ -210,9 +210,9 @@ cglobal x264_hpel_filter_c_mmxext, 3,3
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_mmxext, 3,3
+cglobal hpel_filter_h_mmxext, 3,3
      add r0, r2
      add r1, r2
      neg r2
@@ -257,9 +257,9 @@ INIT_XMM
  
  %macro HPEL_C 1
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3,9
+cglobal hpel_filter_c_%1, 3,3,9
      add r0, r2
      lea r1, [r1+r2*2]
      neg r2
@@ -332,9 +332,9 @@ cglobal x264_hpel_filter_c_%1, 3,3,9
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3,8
+cglobal hpel_filter_h_sse2, 3,3,8
      add r0, r2
      add r1, r2
      neg r2
@@ -381,9 +381,9 @@ cglobal x264_hpel_filter_h_sse2, 3,3,8
  
  %ifndef ARCH_X86_64
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_ssse3, 3,3
+cglobal hpel_filter_h_ssse3, 3,3
      add r0, r2
      add r1, r2
      neg r2
@@ -558,10 +558,10 @@ HPEL_V ssse3
  
  %macro HPEL 1
  ;-----------------------------------------------------------------------------
-; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-;                             uint8_t *src, int stride, int width, int height)
+; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+;                   uint8_t *src, int stride, int width, int height)
  ;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_%1, 7,7,16
+cglobal hpel_filter_%1, 7,7,16
  %ifdef WIN64
      movsxd   r4, r4d
      movsxd   r5, r5d
@@ -627,20 +627,16 @@ HPEL sse2
  HPEL ssse3
  %endif
  
-cglobal x264_sfence
-    sfence
-    ret
-
  %undef movntq
  %undef movntps
  %undef sfence
  
  ;-----------------------------------------------------------------------------
-; void x264_plane_copy_core_mmxext( uint8_t *dst, int i_dst,
-;                                   uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( uint8_t *dst, int i_dst,
+;                       uint8_t *src, int i_src, int w, int h)
  ;-----------------------------------------------------------------------------
  ; assumes i_dst and w are multiples of 16, and i_dst>w
-cglobal x264_plane_copy_core_mmxext, 6,7
+cglobal plane_copy_core_mmxext, 6,7
      movsxdifnidn r1, r1d
      movsxdifnidn r3, r3d
      movsxdifnidn r4, r4d
@@ -699,9 +695,9 @@ cglobal x264_plane_copy_core_mmxext, 6,7
  ; memzero SSE will fail for non-mod128.
  
  ;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
  ;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_mmx, 3,3
+cglobal memcpy_aligned_mmx, 3,3
      test r2d, 16
      jz .copy32
      sub r2d, 16
@@ -723,9 +719,9 @@ cglobal x264_memcpy_aligned_mmx, 3,3
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
  ;-----------------------------------------------------------------------------
-cglobal x264_memcpy_aligned_sse2, 3,3
+cglobal memcpy_aligned_sse2, 3,3
      test r2d, 16
      jz .copy32
      sub r2d, 16
@@ -753,10 +749,10 @@ cglobal x264_memcpy_aligned_sse2, 3,3
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void *x264_memzero_aligned( void *dst, size_t n );
+; void *memzero_aligned( void *dst, size_t n );
  ;-----------------------------------------------------------------------------
  %macro MEMZERO 1
-cglobal x264_memzero_aligned_%1, 2,2
+cglobal memzero_aligned_%1, 2,2
      add  r0, r1
      neg  r1
      pxor m0, m0
@@ -779,9 +775,9 @@ MEMZERO sse2
  
  
  ;-----------------------------------------------------------------------------
-; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
  ;-----------------------------------------------------------------------------
-cglobal x264_integral_init4h_sse4, 3,4
+cglobal integral_init4h_sse4, 3,4
      lea     r3, [r0+r2*2]
      add     r1, r2
      neg     r2
@@ -800,7 +796,7 @@ cglobal x264_integral_init4h_sse4, 3,4
      jl .loop
      REP_RET
  
-cglobal x264_integral_init8h_sse4, 3,4
+cglobal integral_init8h_sse4, 3,4
      lea     r3, [r0+r2*2]
      add     r1, r2
      neg     r2
@@ -827,9 +823,9 @@ cglobal x264_integral_init8h_sse4, 3,4
  
  %macro INTEGRAL_INIT_8V 1
  ;-----------------------------------------------------------------------------
-; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, int stride )
  ;-----------------------------------------------------------------------------
-cglobal x264_integral_init8v_%1, 3,3
+cglobal integral_init8v_%1, 3,3
      shl   r1, 1
      add   r0, r1
      lea   r2, [r0+r1*8]
@@ -852,10 +848,10 @@ INIT_XMM
  INTEGRAL_INIT_8V sse2
  
  ;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
  ;-----------------------------------------------------------------------------
  INIT_MMX
-cglobal x264_integral_init4v_mmx, 3,5
+cglobal integral_init4v_mmx, 3,5
      shl   r2, 1
      lea   r3, [r0+r2*4]
      lea   r4, [r0+r2*8]
@@ -877,7 +873,7 @@ cglobal x264_integral_init4v_mmx, 3,5
      REP_RET
  
  INIT_XMM
-cglobal x264_integral_init4v_sse2, 3,5
+cglobal integral_init4v_sse2, 3,5
      shl     r2, 1
      add     r0, r2
      add     r1, r2
@@ -902,7 +898,7 @@ cglobal x264_integral_init4v_sse2, 3,5
      jl .loop
      REP_RET
  
-cglobal x264_integral_init4v_ssse3, 3,5
+cglobal integral_init4v_ssse3, 3,5
      shl     r2, 1
      add     r0, r2
      add     r1, r2
@@ -994,7 +990,7 @@ cglobal x264_integral_init4v_ssse3, 3,5
  ;                              int src_stride, int dst_stride, int width, int height )
  ;-----------------------------------------------------------------------------
  %macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal x264_frame_init_lowres_core_%1, 6,7,%2
+cglobal frame_init_lowres_core_%1, 6,7,%2
  %ifdef WIN64
      movsxd   r5, r5d
  %endif
@@ -1115,7 +1111,7 @@ FRAME_INIT_LOWRES ssse3, 12
  ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
  ;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
  ;-----------------------------------------------------------------------------
-cglobal x264_mbtree_propagate_cost_sse2, 6,6
+cglobal mbtree_propagate_cost_sse2, 6,6
      shl r5d, 1
      lea r0, [r0+r5*2]
      add r1, r5
@@ -1135,7 +1131,7 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
      psrld     xmm0, 8       ; intra*invq>>8
      movq      xmm3, [r3+r5] ; inter
      movq      xmm1, [r1+r5] ; prop
-    pand      xmm3, [pw_0x3fff]
+    pand      xmm3, [pw_3fff]
      punpcklwd xmm1, xmm5
      punpcklwd xmm3, xmm5
      paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index a6cabd5513c101e52ecd99b7e2fa35155263aa66..35e25d8ce6097c3915b7c75750955e72bed079a1 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -44,11 +44,11 @@ DECL_SUF( x264_pixel_avg_4x4,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int
  DECL_SUF( x264_pixel_avg_4x2,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
  
  #define MC_WEIGHT(w,type) \
-    extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+    void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
  
  #define MC_WEIGHT_OFFSET(w,type) \
-    extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
-    extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
      MC_WEIGHT(w,type)
  
  MC_WEIGHT_OFFSET( 4, mmxext )
@@ -68,51 +68,51 @@ MC_WEIGHT( 20, ssse3 )
  #undef MC_OFFSET
  #undef MC_WEIGHT
  
-extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
-extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
-extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+void x264_prefetch_ref_mmxext( uint8_t *, int, int );
+void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
+                            uint8_t *dst, int i_dst_stride,
+                            int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
+                          uint8_t *dst, int i_dst_stride,
+                          int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
+                           uint8_t *dst, int i_dst_stride,
+                           int dx, int dy, int i_width, int i_height );
+void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
                                     uint8_t *dst, int i_dst_stride,
                                     int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
-                                 uint8_t *dst, int i_dst_stride,
-                                 int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
-                                  uint8_t *dst, int i_dst_stride,
-                                  int dx, int dy, int i_width, int i_height );
-extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
-                                  uint8_t *dst, int i_dst_stride,
-                                  int dx, int dy, int i_width, int i_height );
-extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
-extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
-extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
-extern void x264_memzero_aligned_mmx( void * dst, int n );
-extern void x264_memzero_aligned_sse2( void * dst, int n );
-extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
-extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
-extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
-extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                             uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
+void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_mmx( void * dst, int n );
+void x264_memzero_aligned_sse2( void * dst, int n );
+void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                      uint16_t *inter_costs, uint16_t *inv_qscales, int len );
  #define LOWRES(cpu) \
-extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
-                                               int src_stride, int dst_stride, int width, int height );
+void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+                                        int src_stride, int dst_stride, int width, int height );
  LOWRES(mmxext)
  LOWRES(cache32_mmxext)
  LOWRES(sse2)
  LOWRES(ssse3)
  
  #define PIXEL_AVG_W(width,cpu)\
-extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
  /* This declares some functions that don't exist, but that isn't a problem. */
  #define PIXEL_AVG_WALL(cpu)\
  PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
@@ -309,7 +309,6 @@ GET_REF(cache64_ssse3)
  void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
  void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
  void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
-void x264_sfence( void );\
  static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
                               int stride, int width, int height, int16_t *buf )\
  {\
diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm
index 11b1ce0f1b2d22567775c3e2e9bc6e1eba7f79ca..4f66f6b5b1032c4b7b23f735030a25428938d374 100644
--- a/common/x86/pixel-32.asm
+++ b/common/x86/pixel-32.asm
@@ -61,9 +61,9 @@ INIT_MMX
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_mmxext
+cglobal pixel_sa8d_8x8_internal_mmxext
      push   r0
      push   r2
      sub    esp, 0x74
@@ -169,9 +169,9 @@ cglobal x264_pixel_sa8d_8x8_internal_mmxext
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
  ;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
+cglobal intra_sa8d_x3_8x8_core_mmxext
      mov    eax, [esp+4]
      mov    ecx, [esp+8]
      sub    esp, 0x70
@@ -329,10 +329,10 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext
  
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
-;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+;                             const uint8_t *pix2, int stride2, int sums[2][4] )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
+cglobal pixel_ssim_4x4x2_core_mmxext
      push     ebx
      push     edi
      mov      ebx, [esp+16]
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 2c8284d39efe7b5dac9c00e0ffd52287270a8cff..c9493f4340be4f4125f2f91abe1e14ea4c6658f4 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -27,17 +27,14 @@
  %include "x86inc.asm"
  %include "x86util.asm"
  
-SECTION_RODATA
-pw_1:      times 8 dw 1
-pw_00ff:   times 8 dw 0xff
-ssim_c1:   times 4 dd 416    ; .01*.01*255*255*64
-ssim_c2:   times 4 dd 235963 ; .03*.03*255*255*64*63
+SECTION_RODATA 32
  mask_ff:   times 16 db 0xff
             times 16 db 0
+ssim_c1:   times 4 dd 416    ; .01*.01*255*255*64
+ssim_c2:   times 4 dd 235963 ; .03*.03*255*255*64*63
  mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
  mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
  mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
-hsub_mul:  times 8 db 1, -1
  hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
  hmul_8p:   times 8 db 1
             times 4 db 1, -1
@@ -46,6 +43,11 @@ mask_1100: times 2 dd 0, -1
  
  SECTION .text
  
+cextern pw_1
+cextern pw_00ff
+
+cextern hsub_mul
+
  %macro HADDD 2 ; sum junk
  %if mmsize == 16
      movhlps %2, %1
@@ -213,7 +215,7 @@ SECTION .text
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
  %macro SSD 3-4 0
  %if %1 != %2
@@ -221,7 +223,7 @@ SECTION .text
  %else
      %assign function_align 16
  %endif
-cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+cglobal pixel_ssd_%1x%2_%3, 0,0,0
      mov     al, %1*%2/mmsize/2
  
  %if %1 != %2
@@ -365,21 +367,21 @@ SSD  4,  8, ssse3
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, int )
  ;-----------------------------------------------------------------------------
  INIT_MMX
-cglobal x264_pixel_var_16x16_mmxext, 2,3
+cglobal pixel_var_16x16_mmxext, 2,3
      VAR_START 0
      VAR_2ROW 8, 16
      VAR_END
  
-cglobal x264_pixel_var_8x8_mmxext, 2,3
+cglobal pixel_var_8x8_mmxext, 2,3
      VAR_START 0
      VAR_2ROW r1, 4
      VAR_END
  
  INIT_XMM
-cglobal x264_pixel_var_16x16_sse2, 2,3,8
+cglobal pixel_var_16x16_sse2, 2,3,8
      VAR_START 1
      mov      r2d, 8
  .loop:
@@ -392,7 +394,7 @@ cglobal x264_pixel_var_16x16_sse2, 2,3,8
      jg .loop
      VAR_END
  
-cglobal x264_pixel_var_8x8_sse2, 2,4,8
+cglobal pixel_var_8x8_sse2, 2,4,8
      VAR_START 1
      mov      r2d, 2
      lea       r3, [r1*3]
@@ -421,11 +423,11 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+; int pixel_var2_8x8( uint8_t *, int, uint8_t *, int, int * )
  ;-----------------------------------------------------------------------------
  %ifndef ARCH_X86_64
  INIT_MMX
-cglobal x264_pixel_var2_8x8_mmxext, 5,6
+cglobal pixel_var2_8x8_mmxext, 5,6
      VAR_START 0
      mov      r5d, 8
  .loop:
@@ -455,7 +457,7 @@ cglobal x264_pixel_var2_8x8_mmxext, 5,6
  %endif
  
  INIT_XMM
-cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+cglobal pixel_var2_8x8_sse2, 5,6,8
      VAR_START 1
      mov      r5d, 4
  .loop:
@@ -479,7 +481,7 @@ cglobal x264_pixel_var2_8x8_sse2, 5,6,8
      VAR2_END
      RET
  
-cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+cglobal pixel_var2_8x8_ssse3, 5,6,8
      pxor      m5, m5    ; sum
      pxor      m6, m6    ; sum squared
      mova      m7, [hsub_mul]
@@ -692,10 +694,10 @@ cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
  ; for small blocks on x86_32, modify pixel pointer instead.
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
  INIT_MMX
-cglobal x264_pixel_satd_16x4_internal_mmxext
+cglobal pixel_satd_16x4_internal_mmxext
      SATD_4x4_MMX m2,  0, 0
      SATD_4x4_MMX m1,  4, 0
      paddw        m0, m2
@@ -706,69 +708,69 @@ cglobal x264_pixel_satd_16x4_internal_mmxext
      paddw        m0, m1
      ret
  
-cglobal x264_pixel_satd_8x8_internal_mmxext
+cglobal pixel_satd_8x8_internal_mmxext
      SATD_4x4_MMX m2,  0, 0
      SATD_4x4_MMX m1,  4, 1
      paddw        m0, m2
      paddw        m0, m1
-x264_pixel_satd_8x4_internal_mmxext:
+pixel_satd_8x4_internal_mmxext:
      SATD_4x4_MMX m2,  0, 0
      SATD_4x4_MMX m1,  4, 0
      paddw        m0, m2
      paddw        m0, m1
      ret
  
-cglobal x264_pixel_satd_16x16_mmxext, 4,6
+cglobal pixel_satd_16x16_mmxext, 4,6
      SATD_START_MMX
      pxor   m0, m0
  %rep 3
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
      lea  r0, [r0+4*r1]
      lea  r2, [r2+4*r3]
  %endrep
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
      HADDUW m0, m1
      movd  eax, m0
      RET
  
-cglobal x264_pixel_satd_16x8_mmxext, 4,6
+cglobal pixel_satd_16x8_mmxext, 4,6
      SATD_START_MMX
      pxor   m0, m0
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
      lea  r0, [r0+4*r1]
      lea  r2, [r2+4*r3]
-    call x264_pixel_satd_16x4_internal_mmxext
+    call pixel_satd_16x4_internal_mmxext
      SATD_END_MMX
  
-cglobal x264_pixel_satd_8x16_mmxext, 4,6
+cglobal pixel_satd_8x16_mmxext, 4,6
      SATD_START_MMX
      pxor   m0, m0
-    call x264_pixel_satd_8x8_internal_mmxext
+    call pixel_satd_8x8_internal_mmxext
      lea  r0, [r0+4*r1]
      lea  r2, [r2+4*r3]
-    call x264_pixel_satd_8x8_internal_mmxext
+    call pixel_satd_8x8_internal_mmxext
      SATD_END_MMX
  
-cglobal x264_pixel_satd_8x8_mmxext, 4,6
+cglobal pixel_satd_8x8_mmxext, 4,6
      SATD_START_MMX
      pxor   m0, m0
-    call x264_pixel_satd_8x8_internal_mmxext
+    call pixel_satd_8x8_internal_mmxext
      SATD_END_MMX
  
-cglobal x264_pixel_satd_8x4_mmxext, 4,6
+cglobal pixel_satd_8x4_mmxext, 4,6
      SATD_START_MMX
      pxor   m0, m0
-    call x264_pixel_satd_8x4_internal_mmxext
+    call pixel_satd_8x4_internal_mmxext
      SATD_END_MMX
  
-cglobal x264_pixel_satd_4x8_mmxext, 4,6
+cglobal pixel_satd_4x8_mmxext, 4,6
      SATD_START_MMX
      SATD_4x4_MMX m0, 0, 1
      SATD_4x4_MMX m1, 0, 0
      paddw  m0, m1
      SATD_END_MMX
  
-cglobal x264_pixel_satd_4x4_mmxext, 4,6
+cglobal pixel_satd_4x4_mmxext, 4,6
      SATD_START_MMX
      SATD_4x4_MMX m0, 0, 0
      SATD_END_MMX
@@ -808,12 +810,12 @@ cglobal x264_pixel_satd_4x4_mmxext, 4,6
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
  %macro SATDS_SSE2 1
  INIT_XMM
  %ifnidn %1, sse2
-cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
+cglobal pixel_satd_4x4_%1, 4, 6, 6
      SATD_START_MMX
      mova m4, [hmul_4p]
      LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
@@ -829,7 +831,7 @@ cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
      RET
  %endif
  
-cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
+cglobal pixel_satd_4x8_%1, 4, 6, 8
      SATD_START_MMX
  %ifnidn %1, sse2
      mova m7, [hmul_4p]
@@ -869,16 +871,16 @@ cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
      movd eax, m6
      RET
  
-cglobal x264_pixel_satd_8x8_internal_%1
+cglobal pixel_satd_8x8_internal_%1
      LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
      SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
-x264_pixel_satd_8x4_internal_%1:
+pixel_satd_8x4_internal_%1:
      LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
      SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 6
      ret
  
  %ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
-cglobal x264_pixel_satd_16x4_internal_%1
+cglobal pixel_satd_16x4_internal_%1
      LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
      lea  r2, [r2+4*r3]
      lea  r0, [r0+4*r1]
@@ -886,67 +888,67 @@ cglobal x264_pixel_satd_16x4_internal_%1
      SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
      ret
  
-cglobal x264_pixel_satd_16x8_%1, 4,6,12
+cglobal pixel_satd_16x8_%1, 4,6,12
      SATD_START_SSE2 %1, m10, m7
  %ifidn %1, sse2
      mova m7, [pw_00ff]
  %endif
-    jmp x264_pixel_satd_16x8_internal_%1
+    jmp pixel_satd_16x8_internal_%1
  
-cglobal x264_pixel_satd_16x16_%1, 4,6,12
+cglobal pixel_satd_16x16_%1, 4,6,12
      SATD_START_SSE2 %1, m10, m7
  %ifidn %1, sse2
      mova m7, [pw_00ff]
  %endif
-    call x264_pixel_satd_16x4_internal_%1
-    call x264_pixel_satd_16x4_internal_%1
-x264_pixel_satd_16x8_internal_%1:
-    call x264_pixel_satd_16x4_internal_%1
-    call x264_pixel_satd_16x4_internal_%1
+    call pixel_satd_16x4_internal_%1
+    call pixel_satd_16x4_internal_%1
+pixel_satd_16x8_internal_%1:
+    call pixel_satd_16x4_internal_%1
+    call pixel_satd_16x4_internal_%1
      SATD_END_SSE2 %1, m10
  %else
-cglobal x264_pixel_satd_16x8_%1, 4,6,8
+cglobal pixel_satd_16x8_%1, 4,6,8
      SATD_START_SSE2 %1, m6, m7
      BACKUP_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
      RESTORE_AND_INC_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
      SATD_END_SSE2 %1, m6
  
-cglobal x264_pixel_satd_16x16_%1, 4,6,8
+cglobal pixel_satd_16x16_%1, 4,6,8
      SATD_START_SSE2 %1, m6, m7
      BACKUP_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
      RESTORE_AND_INC_POINTERS
-    call x264_pixel_satd_8x8_internal_%1
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
      SATD_END_SSE2 %1, m6
  %endif
  
-cglobal x264_pixel_satd_8x16_%1, 4,6,8
+cglobal pixel_satd_8x16_%1, 4,6,8
      SATD_START_SSE2 %1, m6, m7
-    call x264_pixel_satd_8x8_internal_%1
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
      SATD_END_SSE2 %1, m6
  
-cglobal x264_pixel_satd_8x8_%1, 4,6,8
+cglobal pixel_satd_8x8_%1, 4,6,8
      SATD_START_SSE2 %1, m6, m7
-    call x264_pixel_satd_8x8_internal_%1
+    call pixel_satd_8x8_internal_%1
      SATD_END_SSE2 %1, m6
  
-cglobal x264_pixel_satd_8x4_%1, 4,6,8
+cglobal pixel_satd_8x4_%1, 4,6,8
      SATD_START_SSE2 %1, m6, m7
-    call x264_pixel_satd_8x4_internal_%1
+    call pixel_satd_8x4_internal_%1
      SATD_END_SSE2 %1, m6
  %endmacro ; SATDS_SSE2
  
  %macro SA8D 1
  %ifdef ARCH_X86_64
  ;-----------------------------------------------------------------------------
-; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
      lea  r10, [r0+4*r1]
      lea  r11, [r2+4*r3]
      LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
@@ -970,41 +972,41 @@ cglobal x264_pixel_sa8d_8x8_internal_%1
      paddw m0, m1
      paddw m0, m2
      paddw m0, m8
-    SAVE_MM_PERMUTATION x264_pixel_sa8d_8x8_internal_%1
+    SAVE_MM_PERMUTATION pixel_sa8d_8x8_internal_%1
      ret
  
-cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
+cglobal pixel_sa8d_8x8_%1, 4,6,12
      lea  r4, [3*r1]
      lea  r5, [3*r3]
  %ifnidn %1, sse2
      mova m7, [hmul_8p]
  %endif
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
      HADDW m0, m1
      movd eax, m0
      add eax, 1
      shr eax, 1
      RET
  
-cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
+cglobal pixel_sa8d_16x16_%1, 4,6,12
      lea  r4, [3*r1]
      lea  r5, [3*r3]
  %ifnidn %1, sse2
      mova m7, [hmul_8p]
  %endif
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
+    call pixel_sa8d_8x8_internal_%1 ; pix[0]
      add  r2, 8
      add  r0, 8
      mova m10, m0
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
+    call pixel_sa8d_8x8_internal_%1 ; pix[8]
      lea  r2, [r2+8*r3]
      lea  r0, [r0+8*r1]
      paddusw m10, m0
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
+    call pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
      sub  r2, 8
      sub  r0, 8
      paddusw m10, m0
-    call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
+    call pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
      paddusw m0, m10
      HADDUW m0, m1
      movd eax, m0
@@ -1014,7 +1016,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
  
  %else ; ARCH_X86_32
  %ifnidn %1, mmxext
-cglobal x264_pixel_sa8d_8x8_internal_%1
+cglobal pixel_sa8d_8x8_internal_%1
      %define spill0 [esp+4]
      %define spill1 [esp+20]
      %define spill2 [esp+36]
@@ -1064,13 +1066,13 @@ cglobal x264_pixel_sa8d_8x8_internal_%1
      ret
  %endif ; ifndef mmxext
  
-cglobal x264_pixel_sa8d_8x8_%1, 4,7
+cglobal pixel_sa8d_8x8_%1, 4,7
      mov  r6, esp
      and  esp, ~15
      sub  esp, 48
      lea  r4, [3*r1]
      lea  r5, [3*r3]
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
      HADDW m0, m1
      movd eax, m0
      add  eax, 1
@@ -1078,26 +1080,26 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,7
      mov  esp, r6
      RET
  
-cglobal x264_pixel_sa8d_16x16_%1, 4,7
+cglobal pixel_sa8d_16x16_%1, 4,7
      mov  r6, esp
      and  esp, ~15
      sub  esp, 64
      lea  r4, [3*r1]
      lea  r5, [3*r3]
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
  %ifidn %1, mmxext
      lea  r0, [r0+4*r1]
      lea  r2, [r2+4*r3]
  %endif
      mova [esp+48], m0
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
      mov  r0, [r6+20]
      mov  r2, [r6+28]
      add  r0, 8
      add  r2, 8
      paddusw m0, [esp+48]
      mova [esp+48], m0
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
  %ifidn %1, mmxext
      lea  r0, [r0+4*r1]
      lea  r2, [r2+4*r3]
@@ -1106,7 +1108,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,7
      paddusw m0, [esp+48]
  %endif
      mova [esp+64-mmsize], m0
-    call x264_pixel_sa8d_8x8_internal_%1
+    call pixel_sa8d_8x8_internal_%1
      paddusw m0, [esp+64-mmsize]
  %if mmsize == 16
      HADDUW m0, m1
@@ -1140,9 +1142,9 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,7
  %ifdef ARCH_X86_64
  INIT_XMM
  ;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
+; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
  ;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
+cglobal intra_sa8d_x3_8x8_core_%1, 3,3,16
      ; 8x8 hadamard
      pxor        m8, m8
      movq        m0, [r0+0*FENC_STRIDE]
@@ -1247,7 +1249,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
  ; in: r0 = fenc
  ; out: m0..m3 = hadamard coefs
  INIT_MMX
-cglobal x264_hadamard_load
+cglobal hadamard_load
  ; not really a global, but otherwise cycles get attributed to the wrong function in profiling
      pxor        m7, m7
      movd        m0, [r0+0*FENC_STRIDE]
@@ -1259,7 +1261,7 @@ cglobal x264_hadamard_load
      punpcklbw   m2, m7
      punpcklbw   m3, m7
      HADAMARD4_2D 0, 1, 2, 3, 4
-    SAVE_MM_PERMUTATION x264_hadamard_load
+    SAVE_MM_PERMUTATION hadamard_load
      ret
  
  %macro SCALAR_SUMSUB 4
@@ -1377,9 +1379,9 @@ cglobal x264_hadamard_load
  %macro INTRA_SATDS_MMX 1
  INIT_MMX
  ;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
  ;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_4x4_%1, 2,6
+cglobal intra_satd_x3_4x4_%1, 2,6
  %ifdef ARCH_X86_64
      ; stack is 16 byte aligned because abi says so
      %define  top_1d  rsp-8  ; size 8
@@ -1393,7 +1395,7 @@ cglobal x264_intra_satd_x3_4x4_%1, 2,6
      %define  t0 r2
  %endif
  
-    call x264_hadamard_load
+    call hadamard_load
      SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
      mov         t0d, r0d
      SCALAR_HADAMARD_TOP  0, r0, r3, r4, r5
@@ -1430,9 +1432,9 @@ cglobal x264_intra_satd_x3_4x4_%1, 2,6
  %endif
  
  ;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
  ;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_16x16_%1, 0,7
+cglobal intra_satd_x3_16x16_%1, 0,7
  %ifdef ARCH_X86_64
      %assign  stack_pad  88
  %else
@@ -1466,7 +1468,7 @@ cglobal x264_intra_satd_x3_16x16_%1, 0,7
  .loop_y:
      xor         r4d, r4d
  .loop_x:
-    call x264_hadamard_load
+    call hadamard_load
  
      SUM3x4 %1
      SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
@@ -1507,9 +1509,9 @@ cglobal x264_intra_satd_x3_16x16_%1, 0,7
      RET
  
  ;-----------------------------------------------------------------------------
-; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
  ;-----------------------------------------------------------------------------
-cglobal x264_intra_satd_x3_8x8c_%1, 0,6
+cglobal intra_satd_x3_8x8c_%1, 0,6
      ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
      SUB          rsp, 72
  %define  sums    rsp+48 ; size 24
@@ -1555,7 +1557,7 @@ cglobal x264_intra_satd_x3_8x8c_%1, 0,6
  .loop_y:
      xor         r4d, r4d
  .loop_x:
-    call x264_hadamard_load
+    call hadamard_load
  
      SUM3x4 %1
      SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
@@ -1609,7 +1611,7 @@ cglobal x264_intra_satd_x3_8x8c_%1, 0,6
  
  ; in:  r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
  ; out: [tmp]=hadamard4, m0=satd
-cglobal x264_hadamard_ac_4x4_mmxext
+cglobal hadamard_ac_4x4_mmxext
      movh      m0, [r0]
      movh      m1, [r0+r1]
      movh      m2, [r0+r1*2]
@@ -1631,10 +1633,10 @@ cglobal x264_hadamard_ac_4x4_mmxext
      paddw     m0, m1
      paddw     m2, m3
      paddw     m0, m2
-    SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
+    SAVE_MM_PERMUTATION hadamard_ac_4x4_mmxext
      ret
  
-cglobal x264_hadamard_ac_2x2max_mmxext
+cglobal hadamard_ac_2x2max_mmxext
      mova      m0, [r3+0x00]
      mova      m1, [r3+0x20]
      mova      m2, [r3+0x40]
@@ -1646,30 +1648,30 @@ cglobal x264_hadamard_ac_2x2max_mmxext
      HADAMARD 0, max, 1, 3, 4, 5
      paddw     m7, m0
      paddw     m7, m1
-    SAVE_MM_PERMUTATION x264_hadamard_ac_2x2max_mmxext
+    SAVE_MM_PERMUTATION hadamard_ac_2x2max_mmxext
      ret
  
-cglobal x264_hadamard_ac_8x8_mmxext
+cglobal hadamard_ac_8x8_mmxext
      mova      m6, [mask_ac4]
      pxor      m7, m7
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
      add       r0, 4
      add       r3, 32
      mova      m5, m0
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
      lea       r0, [r0+4*r1]
      add       r3, 64
      paddw     m5, m0
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
      sub       r0, 4
      sub       r3, 32
      paddw     m5, m0
-    call x264_hadamard_ac_4x4_mmxext
+    call hadamard_ac_4x4_mmxext
      paddw     m5, m0
      sub       r3, 40
      mova [rsp+gprsize+8], m5 ; save satd
  %rep 3
-    call x264_hadamard_ac_2x2max_mmxext
+    call hadamard_ac_2x2max_mmxext
  %endrep
      mova      m0, [r3+0x00]
      mova      m1, [r3+0x20]
@@ -1686,33 +1688,33 @@ cglobal x264_hadamard_ac_8x8_mmxext
      paddw     m6, m7
      mova [rsp+gprsize], m6 ; save sa8d
      SWAP      m0, m6
-    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
+    SAVE_MM_PERMUTATION hadamard_ac_8x8_mmxext
      ret
  
  %macro HADAMARD_AC_WXH_MMX 2
-cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
+cglobal pixel_hadamard_ac_%1x%2_mmxext, 2,4
      %assign pad 16-gprsize-(stack_offset&15)
      %define ysub r1
      sub  rsp, 16+128+pad
      lea  r2, [r1*3]
      lea  r3, [rsp+16]
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
  %if %2==16
      %define ysub r2
      lea  r0, [r0+r1*4]
      sub  rsp, 16
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
  %endif
  %if %1==16
      neg  ysub
      sub  rsp, 16
      lea  r0, [r0+ysub*4+8]
      neg  ysub
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
  %if %2==16
      lea  r0, [r0+r1*4]
      sub  rsp, 16
-    call x264_hadamard_ac_8x8_mmxext
+    call hadamard_ac_8x8_mmxext
  %endif
  %endif
      mova    m1, [rsp+0x08]
@@ -1779,7 +1781,7 @@ HADAMARD_AC_WXH_MMX  8,  8
  INIT_XMM
  ; in:  r0=pix, r1=stride, r2=stride*3
  ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
-cglobal x264_hadamard_ac_8x8_%1
+cglobal hadamard_ac_8x8_%1
  %ifdef ARCH_X86_64
      %define spill0 m8
      %define spill1 m9
@@ -1883,7 +1885,7 @@ cglobal x264_hadamard_ac_8x8_%1
      paddw m2, m4
      paddw m0, m2
      mova  [rsp+gprsize+16], m0 ; save sa8d
-    SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
+    SAVE_MM_PERMUTATION hadamard_ac_8x8_%1
      ret
  
  HADAMARD_AC_WXH_SSE2 16, 16, %1
@@ -1892,30 +1894,30 @@ HADAMARD_AC_WXH_SSE2 16,  8, %1
  HADAMARD_AC_WXH_SSE2  8,  8, %1
  %endmacro ; HADAMARD_AC_SSE2
  
-; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
+; struct { int satd; int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
  %macro HADAMARD_AC_WXH_SSE2 3
-cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
+cglobal pixel_hadamard_ac_%1x%2_%3, 2,3,11
      %assign pad 16-gprsize-(stack_offset&15)
      %define ysub r1
      sub  rsp, 48+pad
      lea  r2, [r1*3]
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
  %if %2==16
      %define ysub r2
      lea  r0, [r0+r1*4]
      sub  rsp, 32
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
  %endif
  %if %1==16
      neg  ysub
      sub  rsp, 32
      lea  r0, [r0+ysub*4+8]
      neg  ysub
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
  %if %2==16
      lea  r0, [r0+r1*4]
      sub  rsp, 32
-    call x264_hadamard_ac_8x8_%3
+    call hadamard_ac_8x8_%3
  %endif
  %endif
      mova    m1, [rsp+0x20]
@@ -1947,7 +1949,7 @@ cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
  ; instantiate satds
  
  %ifndef ARCH_X86_64
-cextern x264_pixel_sa8d_8x8_internal_mmxext
+cextern pixel_sa8d_8x8_internal_mmxext
  SA8D mmxext
  %endif
  
@@ -1999,8 +2001,8 @@ HADAMARD_AC_SSE2 sse4
  ;=============================================================================
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
-;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
+;                             const uint8_t *pix2, int stride2, int sums[2][4] )
  ;-----------------------------------------------------------------------------
  
  %macro SSIM_ITER 1
@@ -2033,7 +2035,7 @@ HADAMARD_AC_SSE2 sse4
      paddd     m3, m6
  %endmacro
  
-cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
+cglobal pixel_ssim_4x4x2_core_sse2, 4,4,8
      pxor      m0, m0
      SSIM_ITER 0
      SSIM_ITER 1
@@ -2069,9 +2071,9 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
      RET
  
  ;-----------------------------------------------------------------------------
-; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
+; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_end4_sse2, 3,3,7
+cglobal pixel_ssim_end4_sse2, 3,3,7
      movdqa    m0, [r0+ 0]
      movdqa    m1, [r0+16]
      movdqa    m2, [r0+32]
@@ -2175,10 +2177,10 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
  %define ABS1 ABS1_MMX
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
+; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_ads4_mmxext, 4,7
+cglobal pixel_ads4_mmxext, 4,7
      movq    mm6, [r0]
      movq    mm4, [r0+8]
      pshufw  mm7, mm6, 0
@@ -2215,7 +2217,7 @@ cglobal x264_pixel_ads4_mmxext, 4,7
      movd    [t0], mm1
      ADS_END 1
  
-cglobal x264_pixel_ads2_mmxext, 4,7
+cglobal pixel_ads2_mmxext, 4,7
      movq    mm6, [r0]
      pshufw  mm5, r6m, 0
      pshufw  mm7, mm6, 0
@@ -2236,7 +2238,7 @@ cglobal x264_pixel_ads2_mmxext, 4,7
      movd    [t0], mm4
      ADS_END 1
  
-cglobal x264_pixel_ads1_mmxext, 4,7
+cglobal pixel_ads1_mmxext, 4,7
      pshufw  mm7, [r0], 0
      pshufw  mm6, r6m, 0
      ADS_START 2
@@ -2258,7 +2260,7 @@ cglobal x264_pixel_ads1_mmxext, 4,7
      ADS_END 2
  
  %macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1, 4,7,12
+cglobal pixel_ads4_%1, 4,7,12
      movdqa  xmm4, [r0]
      pshuflw xmm7, xmm4, 0
      pshuflw xmm6, xmm4, 0xAA
@@ -2327,7 +2329,7 @@ cglobal x264_pixel_ads4_%1, 4,7,12
  %endif ; ARCH
      ADS_END 2
  
-cglobal x264_pixel_ads2_%1, 4,7,8
+cglobal pixel_ads2_%1, 4,7,8
      movq    xmm6, [r0]
      movd    xmm5, r6m
      pshuflw xmm7, xmm6, 0
@@ -2353,7 +2355,7 @@ cglobal x264_pixel_ads2_%1, 4,7,8
      movq    [t0], xmm1
      ADS_END 2
  
-cglobal x264_pixel_ads1_%1, 4,7,8
+cglobal pixel_ads1_%1, 4,7,8
      movd    xmm7, [r0]
      movd    xmm6, r6m
      pshuflw xmm7, xmm7, 0
@@ -2385,7 +2387,7 @@ ADS_SSE2 sse2
  %define ABS1 ABS1_SSSE3
  ADS_SSE2 ssse3
  
-; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
  ; {
  ;     int nmv=0, i, j;
  ;     *(uint32_t*)(masks+width) = 0;
@@ -2399,7 +2401,7 @@ ADS_SSE2 ssse3
  ;     }
  ;     return nmv;
  ; }
-cglobal x264_pixel_ads_mvs, 0,7,0
+cglobal pixel_ads_mvs, 0,7,0
  ads_mvs:
  %ifdef ARCH_X86_64
      ; mvs = r4
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm

index a58dbbd89948252eb7e73b10115a2faa9b020517..577a21c1deb1b6868b6b61cd6223cbad7bb2b96a 100644 (file)
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -25,6 +25,24 @@
  %include "x86inc.asm"
  %include "x86util.asm"
  
+SECTION_RODATA
+
+pw_76543210:
+pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
+pb_00s_ff:  times 8 db 0
+pb_0s_ff:   times 7 db 0
+            db 0xff
+
+SECTION .text
+
+cextern pb_1
+cextern pb_3
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_ff00
+cextern pb_reverse
+
  %macro STORE8x8 2
      add r0, 4*FDEC_STRIDE
      movq        [r0 + -4*FDEC_STRIDE], %1
@@ -74,24 +92,6 @@
      movdqa      [r0 +  3*FDEC_STRIDE], %1
  %endmacro
  
-SECTION_RODATA
-
-ALIGN 16
-pb_1:       times 16 db 1
-pb_3:       times 16 db 3
-pw_2:       times 4 dw 2
-pw_4:       times 4 dw 4
-pw_8:       times 8 dw 8
-pw_76543210:
-pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
-pb_00s_ff:  times 8 db 0
-pb_0s_ff:   times 7 db 0
-            db 0xff
-pw_ff00:    times 8 dw 0xff00
-pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
  ; dest, left, right, src, tmp
  ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
  %macro PRED8x8_LOWPASS0 6
@@ -126,7 +126,7 @@ SECTION .text
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void predict_4x4_ddl_mmxext( uint8_t *src )
+; void predict_4x4_ddl( uint8_t *src )
  ;-----------------------------------------------------------------------------
  cglobal predict_4x4_ddl_mmxext, 1,1
      movq    mm1, [r0-FDEC_STRIDE]
@@ -149,7 +149,7 @@ cglobal predict_4x4_ddl_mmxext, 1,1
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_4x4_ddr_mmxext( uint8_t *src )
+; void predict_4x4_ddr( uint8_t *src )
  ;-----------------------------------------------------------------------------
  %macro PREDICT_4x4 1
  cglobal predict_4x4_ddr_%1, 1,1
@@ -233,7 +233,7 @@ PREDICT_4x4 mmxext
  PREDICT_4x4 ssse3
  
  ;-----------------------------------------------------------------------------
-; void predict_4x4_hu_mmxext( uint8_t *src )
+; void predict_4x4_hu( uint8_t *src )
  ;-----------------------------------------------------------------------------
  cglobal predict_4x4_hu_mmxext, 1,1
      movq      mm0, [r0+0*FDEC_STRIDE-8]
@@ -264,7 +264,7 @@ cglobal predict_4x4_hu_mmxext, 1,1
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_4x4_vl_mmxext( uint8_t *src )
+; void predict_4x4_vl( uint8_t *src )
  ;-----------------------------------------------------------------------------
  cglobal predict_4x4_vl_mmxext, 1,1
      movq        mm1, [r0-FDEC_STRIDE]
@@ -426,7 +426,7 @@ PREDICT_FILTER mmxext
  PREDICT_FILTER ssse3
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_v( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_v_mmxext, 2,2
      movq        mm0, [r1+16]
@@ -434,7 +434,7 @@ cglobal predict_8x8_v_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
  ;-----------------------------------------------------------------------------
  
  INIT_MMX
@@ -459,7 +459,7 @@ cglobal predict_8x8_h_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc( uint8_t *src, uint8_t *edge );
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_dc_mmxext, 2,2
      pxor        mm0, mm0
@@ -475,7 +475,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
+; void predict_8x8_dc_top( uint8_t *src, uint8_t *edge );
  ;-----------------------------------------------------------------------------
  %macro PRED8x8_DC 2
  cglobal %1, 2,2
@@ -497,7 +497,7 @@ PRED8x8_DC predict_8x8_dc_left_mmxext, 7
  ; functions if we know sse2 is available.
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_ddl_mmxext, 2,2
      movq        mm5, [r1+16]
@@ -528,7 +528,7 @@ cglobal predict_8x8_ddl_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_ddr_mmxext, 2,2
      movq        mm1, [r1+7]
@@ -557,7 +557,7 @@ cglobal predict_8x8_ddr_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  %define PALIGNR PALIGNR_MMX
  cglobal predict_8x8_hu_mmxext, 2,2
@@ -602,7 +602,7 @@ cglobal predict_8x8_hu_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr_core( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  
  ; fills only some pixels:
@@ -639,7 +639,7 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8c_p_core_mmxext, 1,2
      LOAD_PLANE_ARGS
@@ -667,7 +667,7 @@ ALIGN 4
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
  ;-----------------------------------------------------------------------------
  cglobal predict_16x16_p_core_mmxext, 1,2
      LOAD_PLANE_ARGS
@@ -711,7 +711,7 @@ ALIGN 4
  %endif ; !ARCH_X86_64
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_ddl_sse2, 2,2
      movdqa      xmm3, [r1+16]
@@ -730,7 +730,7 @@ cglobal predict_8x8_ddl_sse2, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_ddr_sse2, 2,2
      movdqu      xmm3, [r1+8]
@@ -756,7 +756,7 @@ cglobal predict_8x8_ddr_sse2, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_vl_sse2, 2,2
      movdqa      xmm4, [r1+16]
@@ -786,7 +786,7 @@ cglobal predict_8x8_vl_sse2, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8_vr_sse2, 2,2,7
      movdqu      xmm0, [r1+8]
@@ -821,7 +821,7 @@ cglobal predict_8x8_vr_sse2, 2,2,7
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  %define PALIGNR PALIGNR_MMX
  cglobal predict_8x8_hd_mmxext, 2,2
@@ -868,7 +868,7 @@ cglobal predict_8x8_hd_mmxext, 2,2
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  %macro PREDICT_8x8_HD 1
  cglobal predict_8x8_hd_%1, 2,2
@@ -907,7 +907,7 @@ INIT_MMX
  %define PALIGNR PALIGNR_MMX
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
+; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
  ;-----------------------------------------------------------------------------
  %macro PREDICT_8x8_HU 1
  cglobal predict_8x8_hu_%1, 2,2
@@ -969,7 +969,7 @@ PREDICT_8x8_HU sse2
  PREDICT_8x8_HU ssse3
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8c_v_mmx( uint8_t *src )
+; void predict_8x8c_v( uint8_t *src )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8c_v_mmx, 1,1
      movq        mm0, [r0 - FDEC_STRIDE]
@@ -977,7 +977,7 @@ cglobal predict_8x8c_v_mmx, 1,1
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8c_h_mmxext( uint8_t *src )
+; void predict_8x8c_h( uint8_t *src )
  ;-----------------------------------------------------------------------------
  
  %macro PRED_8x8C_H 1
@@ -1001,7 +1001,7 @@ PRED_8x8C_H mmxext
  PRED_8x8C_H ssse3
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
+; void predict_8x8c_dc_core( uint8_t *src, int s2, int s3 )
  ;-----------------------------------------------------------------------------
  cglobal predict_8x8c_dc_core_mmxext, 1,1
      movq        mm0, [r0 - FDEC_STRIDE]
@@ -1056,7 +1056,7 @@ cglobal predict_8x8c_dc_top_mmxext, 1,1
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
  ;-----------------------------------------------------------------------------
  
  cglobal predict_8x8c_p_core_sse2, 1,1
@@ -1098,7 +1098,7 @@ call .loop
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
+; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
  ;-----------------------------------------------------------------------------
  cglobal predict_16x16_p_core_sse2, 1,2,8
      movd        xmm0, r1m
@@ -1142,7 +1142,7 @@ ALIGN 4
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_v_mmx( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
  ;-----------------------------------------------------------------------------
  cglobal predict_16x16_v_mmx, 1,2
      movq        mm0, [r0 - FDEC_STRIDE]
@@ -1151,7 +1151,7 @@ cglobal predict_16x16_v_mmx, 1,2
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_v_sse2( uint8_t *src )
+; void predict_16x16_v( uint8_t *src )
  ;-----------------------------------------------------------------------------
  cglobal predict_16x16_v_sse2, 1,1
      movdqa      xmm0, [r0 - FDEC_STRIDE]
@@ -1159,7 +1159,7 @@ cglobal predict_16x16_v_sse2, 1,1
      RET
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_h_mmxext( uint8_t *src )
+; void predict_16x16_h( uint8_t *src )
  ;-----------------------------------------------------------------------------
  
  %macro PRED_16x16_H 1
@@ -1192,7 +1192,7 @@ INIT_XMM
  PRED_16x16_H ssse3
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
  ;-----------------------------------------------------------------------------
  
  %macro PRED16x16_DC 2
@@ -1229,7 +1229,7 @@ cglobal predict_16x16_dc_left_core_mmxext, 1,1
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
+; void predict_16x16_dc_core( uint8_t *src, int i_dc_left )
  ;-----------------------------------------------------------------------------
  
  %macro PRED16x16_DC_SSE2 2
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c

index 602ddcdcfcd08c97c6ecc865980cfd6c6fe14e66..6fa7e3b2a64569a8efb9379a828f1b7b017ad4cd 100644 (file)
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -25,55 +25,55 @@
  #include "predict.h"
  #include "pixel.h"
  
-extern void predict_16x16_v_mmx( uint8_t *src );
-extern void predict_16x16_h_mmxext( uint8_t *src );
-extern void predict_16x16_h_ssse3( uint8_t *src );
-extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_top_mmxext( uint8_t *src );
-extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
-extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
-extern void predict_8x8c_dc_top_mmxext( uint8_t *src );
-extern void predict_8x8c_v_mmx( uint8_t *src );
-extern void predict_8x8c_h_mmxext( uint8_t *src );
-extern void predict_8x8c_h_ssse3( uint8_t *src );
-extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
-extern void predict_8x8_filter_mmxext   ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_8x8_filter_ssse3   ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
-extern void predict_4x4_ddl_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_mmxext( uint8_t *src );
-extern void predict_4x4_vl_mmxext( uint8_t *src );
-extern void predict_4x4_vr_mmxext( uint8_t *src );
-extern void predict_4x4_vr_ssse3( uint8_t *src );
-extern void predict_4x4_hd_mmxext( uint8_t *src );
-extern void predict_4x4_hd_ssse3( uint8_t *src );
-extern void predict_4x4_dc_mmxext( uint8_t *src );
-extern void predict_4x4_ddr_ssse3( uint8_t *src );
-extern void predict_4x4_hu_mmxext( uint8_t *src );
-extern void predict_16x16_dc_top_sse2( uint8_t *src );
-extern void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
-extern void predict_16x16_v_sse2( uint8_t *src );
-extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_16x16_v_mmx( uint8_t *src );
+ void x264_predict_16x16_h_mmxext( uint8_t *src );
+ void x264_predict_16x16_h_ssse3( uint8_t *src );
+ void x264_predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_mmxext( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_top_mmxext( uint8_t *src );
+ void x264_predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c );
+ void x264_predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
+ void x264_predict_8x8c_dc_top_mmxext( uint8_t *src );
+ void x264_predict_8x8c_v_mmx( uint8_t *src );
+ void x264_predict_8x8c_h_mmxext( uint8_t *src );
+ void x264_predict_8x8c_h_ssse3( uint8_t *src );
+ void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_sse2( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hd_ssse3( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_hu_ssse3( uint8_t *src, uint8_t edge[33] );
+ void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_8x8_filter_ssse3( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+ void x264_predict_4x4_ddl_mmxext( uint8_t *src );
+ void x264_predict_4x4_ddr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vl_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_mmxext( uint8_t *src );
+ void x264_predict_4x4_vr_ssse3( uint8_t *src );
+ void x264_predict_4x4_hd_mmxext( uint8_t *src );
+ void x264_predict_4x4_hd_ssse3( uint8_t *src );
+ void x264_predict_4x4_dc_mmxext( uint8_t *src );
+ void x264_predict_4x4_ddr_ssse3( uint8_t *src );
+ void x264_predict_4x4_hu_mmxext( uint8_t *src );
+ void x264_predict_16x16_dc_top_sse2( uint8_t *src );
+ void x264_predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
+ void x264_predict_16x16_v_sse2( uint8_t *src );
+ void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
  
  ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
  ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
@@ -84,7 +84,7 @@ ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
      V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\
  
  #define PREDICT_16x16_P(name)\
-static void predict_16x16_p_##name( uint8_t *src )\
+static void x264_predict_16x16_p_##name( uint8_t *src )\
  {\
      int a, b, c;\
      int H = 0;\
@@ -102,7 +102,7 @@ static void predict_16x16_p_##name( uint8_t *src )\
      b = ( 5 * H + 32 ) >> 6;\
      c = ( 5 * V + 32 ) >> 6;\
      i00 = a - b * 7 - c * 7 + 16;\
-    predict_16x16_p_core_##name( src, i00, b, c );\
+    x264_predict_16x16_p_core_##name( src, i00, b, c );\
  }
  
  #ifndef ARCH_X86_64
@@ -111,7 +111,7 @@ PREDICT_16x16_P( mmxext )
  PREDICT_16x16_P( sse2   )
  
  #ifdef __GNUC__
-static void predict_16x16_p_ssse3( uint8_t *src )
+static void x264_predict_16x16_p_ssse3( uint8_t *src )
  {
      int a, b, c, i00;
      int H, V;
@@ -143,12 +143,12 @@ static void predict_16x16_p_ssse3( uint8_t *src )
      b = ( 5 * H + 32 ) >> 6;
      c = ( 5 * V + 32 ) >> 6;
      i00 = a - b * 7 - c * 7 + 16;
-    predict_16x16_p_core_sse2( src, i00, b, c );
+    x264_predict_16x16_p_core_sse2( src, i00, b, c );
  }
  #endif
  
  #define PREDICT_8x8_P(name)\
-static void predict_8x8c_p_##name( uint8_t *src )\
+static void x264_predict_8x8c_p_##name( uint8_t *src )\
  {\
      int a, b, c;\
      int H = 0;\
@@ -162,7 +162,7 @@ static void predict_8x8c_p_##name( uint8_t *src )\
      b = ( 17 * H + 16 ) >> 5;\
      c = ( 17 * V + 16 ) >> 5;\
      i00 = a -3*b -3*c + 16;\
-    predict_8x8c_p_core_##name( src, i00, b, c );\
+    x264_predict_8x8c_p_core_##name( src, i00, b, c );\
  }
  
  #ifndef ARCH_X86_64
@@ -171,7 +171,7 @@ PREDICT_8x8_P( mmxext )
  PREDICT_8x8_P( sse2   )
  
  #ifdef __GNUC__
-static void predict_8x8c_p_ssse3( uint8_t *src )
+static void x264_predict_8x8c_p_ssse3( uint8_t *src )
  {
      int a, b, c, i00;
      int H, V;
@@ -196,12 +196,12 @@ static void predict_8x8c_p_ssse3( uint8_t *src )
      b = ( 17 * H + 16 ) >> 5;
      c = ( 17 * V + 16 ) >> 5;
      i00 = a -3*b -3*c + 16;
-    predict_8x8c_p_core_sse2( src, i00, b, c );
+    x264_predict_8x8c_p_core_sse2( src, i00, b, c );
  }
  #endif
  
  #define PREDICT_16x16_DC(name)\
-static void predict_16x16_dc_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_##name( uint8_t *src )\
  {\
      uint32_t dc=16;\
      int i;\
@@ -210,14 +210,14 @@ static void predict_16x16_dc_##name( uint8_t *src )\
          dc += src[-1 + i * FDEC_STRIDE];\
          dc += src[-1 + (i+1) * FDEC_STRIDE];\
      }\
-    predict_16x16_dc_core_##name( src, dc );\
+    x264_predict_16x16_dc_core_##name( src, dc );\
  }
  
  PREDICT_16x16_DC( mmxext )
  PREDICT_16x16_DC( sse2 )
  
  #define PREDICT_16x16_DC_LEFT(name)\
-static void predict_16x16_dc_left_##name( uint8_t *src )\
+static void x264_predict_16x16_dc_left_##name( uint8_t *src )\
  {\
      uint32_t dc=8;\
      int i;\
@@ -226,13 +226,13 @@ static void predict_16x16_dc_left_##name( uint8_t *src )\
          dc += src[-1 + i * FDEC_STRIDE];\
          dc += src[-1 + (i+1) * FDEC_STRIDE];\
      }\
-    predict_16x16_dc_left_core_##name( src, dc>>4 );\
+    x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
  }
  
  PREDICT_16x16_DC_LEFT( mmxext )
  PREDICT_16x16_DC_LEFT( sse2 )
  
-static void predict_8x8c_dc_mmxext( uint8_t *src )
+static void x264_predict_8x8c_dc_mmxext( uint8_t *src )
  {
      int s2 = 4
         + src[-1 + 0*FDEC_STRIDE]
@@ -246,11 +246,11 @@ static void predict_8x8c_dc_mmxext( uint8_t *src )
         + src[-1 + 6*FDEC_STRIDE]
         + src[-1 + 7*FDEC_STRIDE];
  
-    predict_8x8c_dc_core_mmxext( src, s2, s3 );
+    x264_predict_8x8c_dc_core_mmxext( src, s2, s3 );
  }
  
  #ifdef ARCH_X86_64
-static void predict_8x8c_dc_left( uint8_t *src )
+static void x264_predict_8x8c_dc_left( uint8_t *src )
  {
      int y;
      uint32_t s0 = 0, s1 = 0;
@@ -304,9 +304,9 @@ static void predict_8x8c_dc_left( uint8_t *src )
  #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
  
  #ifndef ARCH_X86_64
-static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
+static void x264_predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
  {
-    predict_8x8_vr_core_mmxext( src, edge );
+    x264_predict_8x8_vr_core_mmxext( src, edge );
      {
          PREDICT_8x8_LOAD_TOPLEFT
          PREDICT_8x8_LOAD_LEFT
@@ -372,30 +372,30 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
  {
      if( !(cpu&X264_CPU_MMX) )
          return;
-    pf[I_PRED_16x16_V]       = predict_16x16_v_mmx;
+    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_mmx;
      if( !(cpu&X264_CPU_MMXEXT) )
          return;
-    pf[I_PRED_16x16_DC]      = predict_16x16_dc_mmxext;
-    pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top_mmxext;
-    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_mmxext;
+    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_mmxext;
+    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_mmxext;
+    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmxext;
  #ifndef ARCH_X86_64
-    pf[I_PRED_16x16_P]       = predict_16x16_p_mmxext;
+    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_mmxext;
  #endif
-    pf[I_PRED_16x16_H]       = predict_16x16_h_mmxext;
+    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_mmxext;
      if( !(cpu&X264_CPU_SSE2) )
          return;
-    pf[I_PRED_16x16_DC]     = predict_16x16_dc_sse2;
-    pf[I_PRED_16x16_V]      = predict_16x16_v_sse2;
+    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;
+    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse2;
      if( cpu&X264_CPU_SSE2_IS_SLOW )
          return;
-    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
-    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_sse2;
-    pf[I_PRED_16x16_P]      = predict_16x16_p_sse2;
+    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;
+    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
+    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_sse2;
      if( !(cpu&X264_CPU_SSSE3) )
          return;
-    pf[I_PRED_16x16_H]      = predict_16x16_h_ssse3;
+    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_ssse3;
  #ifdef __GNUC__
-    pf[I_PRED_16x16_P]      = predict_16x16_p_ssse3;
+    pf[I_PRED_16x16_P]       = x264_predict_16x16_p_ssse3;
  #endif
  }
  
@@ -404,25 +404,25 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
      if( !(cpu&X264_CPU_MMX) )
          return;
  #ifdef ARCH_X86_64
-    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
+    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
  #endif
-    pf[I_PRED_CHROMA_V]       = predict_8x8c_v_mmx;
+    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;
      if( !(cpu&X264_CPU_MMXEXT) )
          return;
-    pf[I_PRED_CHROMA_DC_TOP]  = predict_8x8c_dc_top_mmxext;
-    pf[I_PRED_CHROMA_H]       = predict_8x8c_h_mmxext;
+    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_mmxext;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_mmxext;
  #ifndef ARCH_X86_64
-    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_mmxext;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_mmxext;
  #endif
-    pf[I_PRED_CHROMA_DC]      = predict_8x8c_dc_mmxext;
+    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmxext;
      if( !(cpu&X264_CPU_SSE2) )
          return;
-    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_sse2;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_sse2;
      if( !(cpu&X264_CPU_SSSE3) )
          return;
-    pf[I_PRED_CHROMA_H]       = predict_8x8c_h_ssse3;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_ssse3;
  #ifdef __GNUC__
-    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_ssse3;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_ssse3;
  #endif
  }
  
@@ -430,48 +430,48 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
  {
      if( !(cpu&X264_CPU_MMXEXT) )
          return;
-    pf[I_PRED_8x8_V]   = predict_8x8_v_mmxext;
-    pf[I_PRED_8x8_H]   = predict_8x8_h_mmxext;
-    pf[I_PRED_8x8_DC]  = predict_8x8_dc_mmxext;
-    pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
-    pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
-    pf[I_PRED_8x8_HD]   = predict_8x8_hd_mmxext;
-    *predict_8x8_filter = predict_8x8_filter_mmxext;
+    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmxext;
+    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmxext;
+    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmxext;
+    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmxext;
+    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmxext;
+    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_mmxext;
+    *predict_8x8_filter   = x264_predict_8x8_filter_mmxext;
  #ifdef ARCH_X86
-    pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
-    pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
-    pf[I_PRED_8x8_VR]  = predict_8x8_vr_mmxext;
-    pf[I_PRED_8x8_HU]   = predict_8x8_hu_mmxext;
+    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_mmxext;
+    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_mmxext;
+    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_mmxext;
+    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_mmxext;
  #endif
      if( !(cpu&X264_CPU_SSE2) )
          return;
-    pf[I_PRED_8x8_DDL] = predict_8x8_ddl_sse2;
-    pf[I_PRED_8x8_VL]  = predict_8x8_vl_sse2;
-    pf[I_PRED_8x8_VR]  = predict_8x8_vr_sse2;
-    pf[I_PRED_8x8_DDR] = predict_8x8_ddr_sse2;
-    pf[I_PRED_8x8_HD]   = predict_8x8_hd_sse2;
-    pf[I_PRED_8x8_HU]   = predict_8x8_hu_sse2;
+    pf[I_PRED_8x8_DDL]  = x264_predict_8x8_ddl_sse2;
+    pf[I_PRED_8x8_VL]   = x264_predict_8x8_vl_sse2;
+    pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_sse2;
+    pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_sse2;
+    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_sse2;
+    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_sse2;
      if( !(cpu&X264_CPU_SSSE3) )
          return;
-    pf[I_PRED_8x8_HD]   = predict_8x8_hd_ssse3;
-    pf[I_PRED_8x8_HU]   = predict_8x8_hu_ssse3;
-    *predict_8x8_filter = predict_8x8_filter_ssse3;
+    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_ssse3;
+    pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
+    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
  }
  
  void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  {
      if( !(cpu&X264_CPU_MMXEXT) )
          return;
-    pf[I_PRED_4x4_VR]  = predict_4x4_vr_mmxext;
-    pf[I_PRED_4x4_DDL] = predict_4x4_ddl_mmxext;
-    pf[I_PRED_4x4_VL]  = predict_4x4_vl_mmxext;
-    pf[I_PRED_4x4_DC]  = predict_4x4_dc_mmxext;
-    pf[I_PRED_4x4_DDR] = predict_4x4_ddr_mmxext;
-    pf[I_PRED_4x4_HD]  = predict_4x4_hd_mmxext;
-    pf[I_PRED_4x4_HU]  = predict_4x4_hu_mmxext;
+    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
+    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
+    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmxext;
+    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_mmxext;
+    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
+    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_mmxext;
+    pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_mmxext;
      if( !(cpu&X264_CPU_SSSE3) )
          return;
-    pf[I_PRED_4x4_DDR] = predict_4x4_ddr_ssse3;
-    pf[I_PRED_4x4_VR]  = predict_4x4_vr_ssse3;
-    pf[I_PRED_4x4_HD]  = predict_4x4_hd_ssse3;
+    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
+    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
+    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
  }
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm

index 85c632a96e965c9fd1272603c97628103b0a5f7d..f9f78978530773dcd8ce9269828fa5516abb483e 100644 (file)
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -26,10 +26,6 @@
  %include "x86util.asm"
  
  SECTION_RODATA
-pb_1:     times 16 db 1
-pw_1:     times 8 dw 1
-pd_1:     times 4 dd 1
-pb_01:    times 8 db 0, 1
  
  %macro DQM4 3
      dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -71,6 +67,11 @@ decimate_mask_table4:
  
  SECTION .text
  
+cextern pb_1
+cextern pw_1
+cextern pd_1
+cextern pb_01
+
  %macro QUANT_DC_START_MMX 0
      movd       m6, r1m     ; mf
      movd       m7, r2m     ; bias
@@ -183,7 +184,7 @@ SECTION .text
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
+; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
  ;-----------------------------------------------------------------------------
  %macro QUANT_DC 2-3 0
  cglobal %1, 1,1,%3
@@ -202,7 +203,7 @@ cglobal %1, 1,1,%3
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
  ;-----------------------------------------------------------------------------
  %macro QUANT_AC 2
  cglobal %1, 3,3
@@ -220,33 +221,33 @@ INIT_MMX
  %define PABSW PABSW_MMX
  %define PSIGNW PSIGNW_MMX
  %define QUANT_DC_START QUANT_DC_START_MMX
-QUANT_DC x264_quant_2x2_dc_mmxext, 1
+QUANT_DC quant_2x2_dc_mmxext, 1
  %ifndef ARCH_X86_64 ; not needed because sse2 is faster
-QUANT_DC x264_quant_4x4_dc_mmxext, 4
-QUANT_AC x264_quant_4x4_mmx, 4
-QUANT_AC x264_quant_8x8_mmx, 16
+QUANT_DC quant_4x4_dc_mmxext, 4
+QUANT_AC quant_4x4_mmx, 4
+QUANT_AC quant_8x8_mmx, 16
  %endif
  
  INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse2, 2, 8
-QUANT_AC x264_quant_4x4_sse2, 2
-QUANT_AC x264_quant_8x8_sse2, 8
+QUANT_DC quant_4x4_dc_sse2, 2, 8
+QUANT_AC quant_4x4_sse2, 2
+QUANT_AC quant_8x8_sse2, 8
  
  %define PABSW PABSW_SSSE3
  %define PSIGNW PSIGNW_SSSE3
-QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8
-QUANT_AC x264_quant_4x4_ssse3, 2
-QUANT_AC x264_quant_8x8_ssse3, 8
+QUANT_DC quant_4x4_dc_ssse3, 2, 8
+QUANT_AC quant_4x4_ssse3, 2
+QUANT_AC quant_8x8_ssse3, 8
  
  INIT_MMX
-QUANT_DC x264_quant_2x2_dc_ssse3, 1
+QUANT_DC quant_2x2_dc_ssse3, 1
  %define QUANT_END QUANT_END_SSE4
  ;Not faster on Conroe, so only used in SSE4 versions
  %define QUANT_DC_START QUANT_DC_START_SSSE3
  INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse4, 2, 8
-QUANT_AC x264_quant_4x4_sse4, 2
-QUANT_AC x264_quant_8x8_sse4, 8
+QUANT_DC quant_4x4_dc_sse4, 2, 8
+QUANT_AC quant_4x4_sse4, 2
+QUANT_AC quant_8x8_sse4, 8
  
  
  
@@ -347,10 +348,10 @@ QUANT_AC x264_quant_8x8_sse4, 8
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
  ;-----------------------------------------------------------------------------
  %macro DEQUANT 4
-cglobal x264_dequant_%2x%2_%1, 0,3
+cglobal dequant_%2x%2_%1, 0,3
  .skip_prologue:
      DEQUANT_START %3+2, %3
  
@@ -367,11 +368,11 @@ cglobal x264_dequant_%2x%2_%1, 0,3
      psrld m3, 1
      DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
  
-cglobal x264_dequant_%2x%2_flat16_%1, 0,3
+cglobal dequant_%2x%2_flat16_%1, 0,3
      movifnidn t2d, r2m
  %if %2 == 8
      cmp  t2d, 12
-    jl x264_dequant_%2x%2_%1.skip_prologue
+    jl dequant_%2x%2_%1.skip_prologue
      sub  t2d, 12
  %endif
      imul t0d, t2d, 0x2b
@@ -418,7 +419,7 @@ DEQUANT sse2, 4, 4, 2
  DEQUANT sse2, 8, 6, 2
  
  %macro DEQUANT_DC 1
-cglobal x264_dequant_4x4dc_%1, 0,3
+cglobal dequant_4x4dc_%1, 0,3
      DEQUANT_START 6, 6
  
  .lshift:
@@ -480,10 +481,10 @@ INIT_XMM
  DEQUANT_DC sse2
  
  ;-----------------------------------------------------------------------------
-; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
  ;-----------------------------------------------------------------------------
  %macro DENOISE_DCT 1-2 0
-cglobal x264_denoise_dct_%1, 4,5,%2
+cglobal denoise_dct_%1, 4,5,%2
      movzx     r4d, word [r0] ; backup DC coefficient
      pxor      m6, m6
  .loop:
@@ -534,7 +535,7 @@ DENOISE_DCT ssse3, 7
  
  
  ;-----------------------------------------------------------------------------
-; int x264_decimate_score( int16_t *dct )
+; int decimate_score( int16_t *dct )
  ;-----------------------------------------------------------------------------
  
  %macro DECIMATE_MASK_SSE2 6
@@ -579,21 +580,21 @@ DENOISE_DCT ssse3, 7
      or         %2, %6
  %endmacro
  
-cextern x264_decimate_table4
-cextern x264_decimate_table8
+cextern decimate_table4
+cextern decimate_table8
  
  %macro DECIMATE4x4 2
  
  ;A LUT is faster than bsf on AMD processors, and no slower on Intel
  ;This is not true for score64.
-cglobal x264_decimate_score%1_%2, 1,3
+cglobal decimate_score%1_%2, 1,3
  %ifdef PIC
-    lea r10, [x264_decimate_table4]
+    lea r10, [decimate_table4]
      lea r11, [decimate_mask_table4]
      %define table r10
      %define mask_table r11
  %else
-    %define table x264_decimate_table4
+    %define table decimate_table4
      %define mask_table decimate_mask_table4
  %endif
      DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
@@ -638,12 +639,12 @@ DECIMATE4x4 16, ssse3
  %macro DECIMATE8x8 1
  
  %ifdef ARCH_X86_64
-cglobal x264_decimate_score64_%1, 1,4
+cglobal decimate_score64_%1, 1,4
  %ifdef PIC
-    lea r10, [x264_decimate_table8]
+    lea r10, [decimate_table8]
      %define table r10
  %else
-    %define table x264_decimate_table8
+    %define table decimate_table8
  %endif
      mova  m5, [pb_1]
      DECIMATE_MASK r1d, eax, r0, m5, %1, null
@@ -677,9 +678,9 @@ cglobal x264_decimate_score64_%1, 1,4
  
  %else ; ARCH
  %ifidn %1, mmxext
-cglobal x264_decimate_score64_%1, 1,6
+cglobal decimate_score64_%1, 1,6
  %else
-cglobal x264_decimate_score64_%1, 1,5
+cglobal decimate_score64_%1, 1,5
  %endif
      mova  m7, [pb_1]
      DECIMATE_MASK r3, r2, r0, m7, %1, r5
@@ -705,7 +706,7 @@ cglobal x264_decimate_score64_%1, 1,5
      je   .largerun
      shrd  r3, r4, cl
      shr   r4, cl
-    add   r0b, byte [x264_decimate_table8 + ecx]
+    add   r0b, byte [decimate_table8 + ecx]
      shrd  r3, r4, 1
      shr   r4, 1
      cmp   r0, 6     ;score64's threshold is never higher than 6
@@ -746,7 +747,7 @@ DECIMATE8x8 sse2
  DECIMATE8x8 ssse3
  
  ;-----------------------------------------------------------------------------
-; int x264_coeff_last( int16_t *dct )
+; int coeff_last( int16_t *dct )
  ;-----------------------------------------------------------------------------
  
  %macro LAST_MASK_SSE2 2-3
@@ -780,12 +781,12 @@ DECIMATE8x8 ssse3
  
  %macro COEFF_LAST4 1
  %ifdef ARCH_X86_64
-cglobal x264_coeff_last4_%1, 1,1
+cglobal coeff_last4_%1, 1,1
      LAST rax, [r0], 0x3f
      shr eax, 4
      RET
  %else
-cglobal x264_coeff_last4_%1, 0,3
+cglobal coeff_last4_%1, 0,3
      mov   edx, r0mp
      mov   eax, [edx+4]
      xor   ecx, ecx
@@ -805,7 +806,7 @@ COEFF_LAST4 mmxext
  COEFF_LAST4 mmxext_lzcnt
  
  %macro COEFF_LAST 1
-cglobal x264_coeff_last15_%1, 1,3
+cglobal coeff_last15_%1, 1,3
      pxor m2, m2
      LAST_MASK r1d, r0-2, r2d
      xor r1d, 0xffff
@@ -813,7 +814,7 @@ cglobal x264_coeff_last15_%1, 1,3
      dec eax
      RET
  
-cglobal x264_coeff_last16_%1, 1,3
+cglobal coeff_last16_%1, 1,3
      pxor m2, m2
      LAST_MASK r1d, r0, r2d
      xor r1d, 0xffff
@@ -821,7 +822,7 @@ cglobal x264_coeff_last16_%1, 1,3
      RET
  
  %ifndef ARCH_X86_64
-cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
+cglobal coeff_last64_%1, 1, 5-mmsize/16
      pxor m2, m2
      LAST_MASK r2d, r0+64, r4d
      LAST_MASK r3d, r0+96, r4d
@@ -841,7 +842,7 @@ cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
      add eax, 32
      RET
  %else
-cglobal x264_coeff_last64_%1, 1,4
+cglobal coeff_last64_%1, 1,4
      pxor m2, m2
      LAST_MASK_SSE2 r1d, r0
      LAST_MASK_SSE2 r2d, r0+32
@@ -872,7 +873,7 @@ COEFF_LAST sse2
  COEFF_LAST sse2_lzcnt
  
  ;-----------------------------------------------------------------------------
-; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
+; int coeff_level_run( int16_t *dct, run_level_t *runlevel )
  ;-----------------------------------------------------------------------------
  
  %macro LAST_MASK4_MMX 2-3
@@ -901,7 +902,7 @@ COEFF_LAST sse2_lzcnt
  %endif
  
  %macro COEFF_LEVELRUN 2
-cglobal x264_coeff_level_run%2_%1,0,7
+cglobal coeff_level_run%2_%1,0,7
      movifnidn t0, r0mp
      movifnidn t1, r1mp
      pxor    m2, m2
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm

index 59592f9a1cde8476fbf331b471350cdb53ba4844..6d63e13ce5fdc810b5a623fecbf37c67e5cd90f2 100644 (file)
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -26,14 +26,14 @@
  %include "x86inc.asm"
  %include "x86util.asm"
  
-SECTION_RODATA
-pb_3: times 16 db 3
-pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
-pw_8: times 4 dw 8
-sw_64: dd 64
-
  SECTION .text
  
+cextern pb_3
+cextern pb_shuf8x8c
+cextern pw_8
+cextern sw_6
+cextern sw_64
+
  ;=============================================================================
  ; SAD MMX
  ;=============================================================================
@@ -78,10 +78,10 @@ SECTION .text
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
  %macro SAD 2
-cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
+cglobal pixel_sad_%1x%2_mmxext, 4,4
      pxor    mm0, mm0
  %rep %2/2
      SAD_INC_2x%1P
@@ -113,9 +113,9 @@ SAD  4,  4
  
  %macro SAD_W16 1
  ;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1, 4,4,8
+cglobal pixel_sad_16x16_%1, 4,4,8
      movdqu  m0, [r2]
      movdqu  m1, [r2+r3]
      lea     r2, [r2+2*r3]
@@ -180,9 +180,9 @@ cglobal x264_pixel_sad_16x16_%1, 4,4,8
      SAD_END_SSE2
  
  ;-----------------------------------------------------------------------------
-; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
  ;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_%1, 4,4
+cglobal pixel_sad_16x8_%1, 4,4
      movdqu  m0, [r2]
      movdqu  m2, [r2+r3]
      lea     r2, [r2+2*r3]
@@ -249,7 +249,7 @@ SAD_W16 sse2_aligned
  %endmacro
  
  ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
-cglobal x264_pixel_sad_8x16_sse2, 4,4
+cglobal pixel_sad_8x16_sse2, 4,4
      SAD_INC_4x8P_SSE 0
      SAD_INC_4x8P_SSE 1
      SAD_INC_4x8P_SSE 1
@@ -258,10 +258,10 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4
      RET
  
  ;-----------------------------------------------------------------------------
-; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
  ;-----------------------------------------------------------------------------
  
-cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
+cglobal intra_sad_x3_4x4_mmxext, 3,3
      pxor      mm7, mm7
      movd      mm0, [r1-FDEC_STRIDE]
      movd      mm1, [r0+FENC_STRIDE*0]
@@ -305,7 +305,7 @@ cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
      RET
  
  ;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8 ( uint8_t *fenc, uint8_t edge[33], int res[3]);
+; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
  ;-----------------------------------------------------------------------------
  
  ;m0 = DC
@@ -343,7 +343,7 @@ cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
  %endmacro
  
  INIT_MMX
-cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
+cglobal intra_sad_x3_8x8_mmxext, 3,3
      movq      m7, [r1+7]
      pxor      m0, m0
      movq      m6, [r1+16]  ;V prediction
@@ -372,7 +372,7 @@ cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
      RET
  
  ;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
  ;-----------------------------------------------------------------------------
  
  %macro INTRA_SAD_HV_ITER 2
@@ -407,7 +407,7 @@ cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
  %endmacro
  
  %macro INTRA_SAD_8x8C 1
-cglobal x264_intra_sad_x3_8x8c_%1, 3,3
+cglobal intra_sad_x3_8x8c_%1, 3,3
      movq        m6, [r1 - FDEC_STRIDE]
      add         r1, FDEC_STRIDE*4
  %ifidn %1,ssse3
@@ -508,13 +508,13 @@ INTRA_SAD_8x8C ssse3
  
  
  ;-----------------------------------------------------------------------------
-; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
  ;-----------------------------------------------------------------------------
  
  ;xmm7: DC prediction    xmm6: H prediction  xmm5: V prediction
  ;xmm4: DC pred score    xmm3: H pred score  xmm2: V pred score
  %macro INTRA_SAD16 1-2 0
-cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
+cglobal intra_sad_x3_16x16_%1,3,5,%2
      pxor    mm0, mm0
      pxor    mm1, mm1
      psadbw  mm0, [r1-FDEC_STRIDE+0]
@@ -817,11 +817,11 @@ INTRA_SAD16 ssse3, 8
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                                      uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+;                          uint8_t *pix2, int i_stride, int scores[3] )
  ;-----------------------------------------------------------------------------
  %macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
+cglobal pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
  %ifdef WIN64
      %assign i %1+1
      movsxd r %+ i, r %+ i %+ d
@@ -1166,11 +1166,11 @@ SAD_X 4,  4,  4
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                                    uint8_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+;                          uint8_t *pix2, int i_stride, int scores[3] )
  ;-----------------------------------------------------------------------------
  %macro SAD_X_SSE2 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
+cglobal pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
  %ifdef WIN64
      %assign i %1+1
      movsxd r %+ i, r %+ i %+ d
@@ -1183,7 +1183,7 @@ cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
  %endmacro
  
  %macro SAD_X_SSE2_MISALIGN 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
+cglobal pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
  %ifdef WIN64
      %assign i %1+1
      movsxd r %+ i, r %+ i %+ d
@@ -1289,11 +1289,11 @@ sad_w16_align%1_ssse3:
  %endmacro
  
  %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal x264_pixel_sad_16x%2_cache64_%1
+cglobal pixel_sad_16x%2_cache64_%1
      mov     eax, r2m
      and     eax, 0x37
      cmp     eax, 0x30
-    jle x264_pixel_sad_16x%2_sse2
+    jle pixel_sad_16x%2_sse2
      PROLOGUE 4,6
      mov     r4d, r2d
      and     r4d, 15
@@ -1324,7 +1324,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
      mov    eax, r2m
      and    eax, 0x17|%1|(%4>>1)
      cmp    eax, 0x10|%1|(%4>>1)
-    jle x264_pixel_sad_%1x%2_mmxext
+    jle pixel_sad_%1x%2_mmxext
      and    eax, 7
      shl    eax, 3
      movd   mm6, [sw_64]
@@ -1337,7 +1337,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
  %endmacro
  
  %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_16x%1_cache%2_mmxext
+cglobal pixel_sad_16x%1_cache%2_mmxext
      SAD_CACHELINE_START_MMX2 16, %1, %1, %2
  .loop:
      movq   mm1, [r2]
@@ -1363,7 +1363,7 @@ cglobal x264_pixel_sad_16x%1_cache%2_mmxext
  %endmacro
  
  %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_8x%1_cache%2_mmxext
+cglobal pixel_sad_8x%1_cache%2_mmxext
      SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
  .loop:
      movq   mm1, [r2+8]
@@ -1399,11 +1399,11 @@ cglobal x264_pixel_sad_8x%1_cache%2_mmxext
  %endmacro
  
  %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
+cglobal pixel_sad_x3_%1x%2_cache%3_%6
      CHECK_SPLIT r1m, %1, %3
      CHECK_SPLIT r2m, %1, %3
      CHECK_SPLIT r3m, %1, %3
-    jmp x264_pixel_sad_x3_%1x%2_%4
+    jmp pixel_sad_x3_%1x%2_%4
  .split:
  %ifdef ARCH_X86_64
      PROLOGUE 6,7
@@ -1418,7 +1418,7 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
      mov  r3, r4
      mov  r10, r0
      mov  r11, r5
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11], eax
  %ifdef WIN64
      mov  r2, [rsp]
@@ -1426,7 +1426,7 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
      pop  r2
  %endif
      mov  r0, r10
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11+4], eax
  %ifdef WIN64
      mov  r2, [rsp+8]
@@ -1434,7 +1434,7 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
      pop  r2
  %endif
      mov  r0, r10
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11+8], eax
  %ifdef WIN64
      add  rsp, 24
@@ -1447,15 +1447,15 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
      push dword [esp+16]
      push dword 16
      push dword [esp+20]
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  ecx, [esp+32]
      mov  [edi], eax
      mov  [esp+8], ecx
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  ecx, [esp+36]
      mov  [edi+4], eax
      mov  [esp+8], ecx
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [edi+8], eax
      add  esp, 16
      pop  edi
@@ -1464,12 +1464,12 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
  %endmacro
  
  %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
+cglobal pixel_sad_x4_%1x%2_cache%3_%6
      CHECK_SPLIT r1m, %1, %3
      CHECK_SPLIT r2m, %1, %3
      CHECK_SPLIT r3m, %1, %3
      CHECK_SPLIT r4m, %1, %3
-    jmp x264_pixel_sad_x4_%1x%2_%4
+    jmp pixel_sad_x4_%1x%2_%4
  .split:
  %ifdef ARCH_X86_64
      PROLOGUE 6,7
@@ -1484,7 +1484,7 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
      mov  r1, FENC_STRIDE
      mov  r3, r5
      mov  r10, r0
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11], eax
  %ifdef WIN64
      mov  r2, [rsp]
@@ -1492,7 +1492,7 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
      pop  r2
  %endif
      mov  r0, r10
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11+4], eax
  %ifdef WIN64
      mov  r2, [rsp+8]
@@ -1500,7 +1500,7 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
      pop  r2
  %endif
      mov  r0, r10
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11+8], eax
  %ifdef WIN64
      mov  r2, [rsp+16]
@@ -1508,7 +1508,7 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
      pop  r2
  %endif
      mov  r0, r10
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [r11+12], eax
  %ifdef WIN64
      add  rsp, 24
@@ -1521,19 +1521,19 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
      push dword [esp+16]
      push dword 16
      push dword [esp+20]
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  ecx, [esp+32]
      mov  [edi], eax
      mov  [esp+8], ecx
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  ecx, [esp+36]
      mov  [edi+4], eax
      mov  [esp+8], ecx
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  ecx, [esp+40]
      mov  [edi+8], eax
      mov  [esp+8], ecx
-    call x264_pixel_sad_%1x%2_cache%3_%5
+    call pixel_sad_%1x%2_cache%3_%5
      mov  [edi+12], eax
      add  esp, 16
      pop  edi
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm

index ee3eca9ccd74b6879351f73f7a5fb2685a3b260b..9d23640a931e5c853315c18ef6cdb4023bc1c3dc 100644 (file)
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -32,6 +32,8 @@
  ; as this feature might be useful for others as well.  Send patches or ideas
  ; to x264-devel@videolan.org .
  
+%define program_name x264
+
  %ifdef ARCH_X86_64
      %ifidn __OUTPUT_FORMAT__,win32
          %define WIN64
@@ -436,7 +438,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
  
  ; Symbol prefix for C linkage
  %macro cglobal 1-2+
-    %xdefine %1 mangle(%1)
+    %xdefine %1 mangle(program_name %+ _ %+ %1)
      %xdefine %1.skip_prologue %1 %+ .skip_prologue
      %ifidn __OUTPUT_FORMAT__,elf
          global %1:function hidden
@@ -453,10 +455,22 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
  %endmacro
  
  %macro cextern 1
+    %xdefine %1 mangle(program_name %+ _ %+ %1)
+    extern %1
+%endmacro
+
+;like cextern, but without the prefix
+%macro cextern_naked 1
      %xdefine %1 mangle(%1)
      extern %1
  %endmacro
  
+%macro const 2+
+    %xdefine %1 mangle(program_name %+ _ %+ %1)
+    global %1
+    %1: %2
+%endmacro
+
  ; This is needed for ELF, otherwise the GNU linker assumes the stack is
  ; executable by default.
  %ifidn __OUTPUT_FORMAT__,elf
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 75f8dfeb36df39aa22c647bbcb8b6426aaeb0b98..85a70448a3d3fa7b258cc56596d1343831e10ba8 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -27,7 +27,6 @@
  #include <unistd.h>
  
  #include "common/common.h"
-#include "common/cpu.h"
  #include "macroblock.h"
  #include "me.h"
  #include "ratecontrol.h"
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 79f655462786669db9ffec43e41b49a57225e0f2..87f41446595b4da9c5ef3945bd9444a09ab52d61 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -25,7 +25,6 @@
  #include <math.h>
  
  #include "common/common.h"
-#include "common/cpu.h"
  
  #include "set.h"
  #include "analyse.h"
diff --git a/encoder/lookahead.c b/encoder/lookahead.c

index 5e29fb5bf32a47f23eecd700b5a8c2b475db61b8..942e9526453650cd6f9e2d13a44d683340868732 100644 (file)
--- a/encoder/lookahead.c
+++ b/encoder/lookahead.c
@@ -35,7 +35,6 @@
   * # of bframes + # of threads.
   */
  #include "common/common.h"
-#include "common/cpu.h"
  #include "analyse.h"
  
  static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index 93e205abb077148a508506a1b40c9a5d821cf6b8..0ebde2631ab3187daafd3a563401bdeb9a60fccb 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -29,7 +29,6 @@
  #include <math.h>
  
  #include "common/common.h"
-#include "common/cpu.h"
  #include "ratecontrol.h"
  #include "me.h"
  
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index 2fee74aa87d62e6cd9951497e2a5b4d01e8c7fa6..559e3f4eafe3900e3d5232e309a32d7caa17db3a 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -25,7 +25,6 @@
  #include <math.h>
  
  #include "common/common.h"
-#include "common/cpu.h"
  #include "macroblock.h"
  #include "me.h"
  
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm

index 1970cb966fcbffb44fc32e8e555cee1e5d9b3102..51f8e78baa577467e874b2419a0becd123456198 100644 (file)
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -43,7 +43,7 @@ x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
  
  SECTION .text
  
-cextern puts
+cextern_naked puts
  
  ; max number of args used by any x264 asm function.
  ; (max_args % 4) must equal 3 for stack alignment
@@ -54,7 +54,7 @@ cextern puts
  ;-----------------------------------------------------------------------------
  ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
  ;-----------------------------------------------------------------------------
-cglobal x264_checkasm_call, 4,7,16
+cglobal checkasm_call, 4,7,16
      sub  rsp, max_args*8
      %assign stack_offset stack_offset+max_args*8
      mov  r6, r0
@@ -113,7 +113,7 @@ cglobal x264_checkasm_call, 4,7,16
  ;-----------------------------------------------------------------------------
  ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
  ;-----------------------------------------------------------------------------
-cglobal x264_checkasm_call, 1,7
+cglobal checkasm_call, 1,7
      mov  r3, n3
      mov  r4, n4
      mov  r5, n5
@@ -147,7 +147,7 @@ cglobal x264_checkasm_call, 1,7
  ;-----------------------------------------------------------------------------
  ; int x264_stack_pagealign( int (*func)(), int align )
  ;-----------------------------------------------------------------------------
-cglobal x264_stack_pagealign, 2,2
+cglobal stack_pagealign, 2,2
      push rbp
      mov  rbp, rsp
      and  rsp, ~0xfff
author	Fiona Glaser <fiona@x264.com>
	Fri, 30 Apr 2010 18:36:19 +0000 (11:36 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Thu, 6 May 2010 05:08:14 +0000 (22:08 -0700)
Makefile		patch \| blob \| history
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/cpu.c		patch \| blob \| history
common/cpu.h		patch \| blob \| history
common/x86/cabac-a.asm		patch \| blob \| history
common/x86/const-a.asm	[new file with mode: 0755]	patch \| blob
common/x86/cpu-a.asm		patch \| blob \| history
common/x86/dct-32.asm		patch \| blob \| history
common/x86/dct-64.asm		patch \| blob \| history
common/x86/dct-a.asm		patch \| blob \| history
common/x86/deblock-a.asm		patch \| blob \| history
common/x86/mc-a.asm		patch \| blob \| history
common/x86/mc-a2.asm		patch \| blob \| history
common/x86/mc-c.c		patch \| blob \| history
common/x86/pixel-32.asm		patch \| blob \| history
common/x86/pixel-a.asm		patch \| blob \| history
common/x86/predict-a.asm		patch \| blob \| history
common/x86/predict-c.c		patch \| blob \| history
common/x86/quant-a.asm		patch \| blob \| history
common/x86/sad-a.asm		patch \| blob \| history
common/x86/x86inc.asm		patch \| blob \| history
encoder/analyse.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/lookahead.c		patch \| blob \| history
encoder/ratecontrol.c		patch \| blob \| history
encoder/slicetype.c		patch \| blob \| history
tools/checkasm-a.asm		patch \| blob \| history