x86: AVX memzero_aligned

author Henrik Gramner <henrik@gramner.com>

Tue, 16 Apr 2013 21:27:29 +0000 (23:27 +0200)

committer Fiona Glaser <fiona@x264.com>

Tue, 23 Apr 2013 21:36:32 +0000 (14:36 -0700)
author Henrik Gramner <henrik@gramner.com>
Tue, 16 Apr 2013 21:27:29 +0000 (23:27 +0200)
committer Fiona Glaser <fiona@x264.com>
Tue, 23 Apr 2013 21:36:32 +0000 (14:36 -0700)
diff --git a/common/common.h b/common/common.h

index 53a6ff03c0b834f592867fbcb348e3a8e8a89649..1732d59b698a90509706412f9b86dd8f2d69ac16 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -770,8 +770,8 @@ struct x264_t
              ALIGNED_16( dctcoef fenc_dct4[16][16] );
  
              /* Psy RD SATD/SA8D scores cache */
-            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
-            ALIGNED_16( uint32_t fenc_satd_cache[32] );
+            ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
+            ALIGNED_N( uint32_t fenc_satd_cache[32] );
  
              /* pointer over mb of the frame to be compressed */
              pixel *p_fenc[3]; /* y,u,v */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index caa9396913686256242e1805f5d9e77087e338cb..27e66b869d9b7f7afcff53ecd97c5b7458bd5f76 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1223,7 +1223,7 @@ MEMCPY
  ;-----------------------------------------------------------------------------
  ; void *memzero_aligned( void *dst, size_t n );
  ;-----------------------------------------------------------------------------
-%macro MEMZERO 0
+%macro MEMZERO 1
  cglobal memzero_aligned, 2,2
      add  r0, r1
      neg  r1
@@ -1234,21 +1234,21 @@ cglobal memzero_aligned, 2,2
  %endif
  .loop:
  %assign i 0
-%rep 8
+%rep %1
      mova [r0 + r1 + i], m0
  %assign i i+mmsize
  %endrep
-    add r1, mmsize*8
+    add r1, mmsize*%1
      jl .loop
      RET
  %endmacro
  
  INIT_MMX mmx
-MEMZERO
+MEMZERO 8
  INIT_XMM sse
-MEMZERO
-
-
+MEMZERO 8
+INIT_YMM avx
+MEMZERO 4
  
  %if HIGH_BIT_DEPTH == 0
  ;-----------------------------------------------------------------------------
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 3bb38838e4cd21a1c42e10b6915f0c1715300f64..198d7e402bc812d79b080cc04304de33e6b92d2b 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -129,6 +129,7 @@ void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
  void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
  void x264_memzero_aligned_mmx( void *dst, size_t n );
  void x264_memzero_aligned_sse( void *dst, size_t n );
+void x264_memzero_aligned_avx( void *dst, size_t n );
  void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
  void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
  void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -798,6 +799,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  
      if( !(cpu&X264_CPU_AVX) )
          return;
+    pf->memzero_aligned = x264_memzero_aligned_avx;
      pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
  
      if( cpu&X264_CPU_FMA4 )
diff --git a/encoder/me.c b/encoder/me.c

index 55896025e59e7a67c17e758d0892238487fed81f..8238b96bb09145eb4f5ed85981c1a3bacb1a4967 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1058,7 +1058,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
      uint64_t bcostrd = COST_MAX64;
      uint16_t amvd;
      /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
+    ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
      /* all permutations of an offset in up to 2 of the dimensions */
      ALIGNED_4( static const int8_t dia4d[33][4] ) =
      {
author	Henrik Gramner <henrik@gramner.com>
	Tue, 16 Apr 2013 21:27:29 +0000 (23:27 +0200)
committer	Fiona Glaser <fiona@x264.com>
	Tue, 23 Apr 2013 21:36:32 +0000 (14:36 -0700)
common/common.h		patch | blob | history
common/x86/mc-a2.asm		patch | blob | history
common/x86/mc-c.c		patch | blob | history
encoder/me.c		patch | blob | history