INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
+INTRA_MBCMP( sad, 4, v, h, dc, , _ssse3 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
#endif
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
}
+ if( cpu&X264_CPU_SSE4 )
+ {
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _sse4 );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+ }
+ if( cpu&X264_CPU_AVX )
+ {
+ INIT_ADS( _avx );
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _avx );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
- /* Slower on Conroe, so only enable under SSE4 */
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
if( cpu&X264_CPU_AVX )
INIT7( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
- pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
- pixf->ads[PIXEL_16x8] = x264_pixel_ads2_avx;
+ INIT_ADS( _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->denoise_dct = x264_denoise_dct_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
;=============================================================================
; prefetch
;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+; pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
INIT_MMX
%ifdef ARCH_X86_64
; NOTE(review): this span is the interior of a version-control diff hunk —
; the leading '+'/'-' markers on changed lines are diff payload, not asm.
; Code kept byte-identical; comments only.
;
; prefetch_fenc_mmx2(pix_y, stride_y, pix_uv, stride_uv, mb_x):
; issues prefetcht0 hints for the encoder's current macroblock rows in the
; luma plane and the interleaved chroma plane.
cglobal prefetch_fenc_mmx2, 5,5
; added by the patch: FIX_STRIDES presumably scales the strides by
; SIZEOF_PIXEL for high-bit-depth builds — TODO confirm against x86util
+ FIX_STRIDES r1d, r3d
; r4d = mb_x & 3 picks the horizontal quarter of the row group to prefetch
and r4d, 3
mov eax, r4d
imul r4d, r1d
; removed line: +64 byte offset hard-coded for 8-bit pixels
- lea r0, [r0+r4*4+64]
; replacement: offset scaled so it stays one cacheline ahead at any depth
+ lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0+r1]
; same pattern for the chroma plane: base r2, stride r3 (rax*2 because the
; chroma rows cover half the luma width per plane pair)
imul eax, r3d
; removed/replaced pair, same SIZEOF_PIXEL scaling as above
- lea r2, [r2+rax*2+64]
+ lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
prefetcht0 [r2]
prefetcht0 [r2+r3]
RET
mov r2, r4m
mov r1, r1m
mov r0, r0m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*4+64]
+ lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
mov r2, r4m
mov r1, r3m
mov r0, r2m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*2+64]
+ lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
ret
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref_mmx2, 3,3
+ FIX_STRIDES r1d
dec r2d
and r2d, r1d
- lea r0, [r0+r2*8+64]
+ lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
-void x264_prefetch_ref_mmx2( uint8_t *, int, int );
+void x264_prefetch_fenc_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_ref_mmx2( pixel *, int, int );
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
};
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+#if HIGH_BIT_DEPTH
+MC_COPY_WTAB(sse2,mmx,sse2,sse2)
+#else
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#endif
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
if( !(cpu&X264_CPU_MMX2) )
return;
+ pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+ pf->prefetch_ref = x264_prefetch_ref_mmx2;
+
pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH
- pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
- pf->prefetch_ref = x264_prefetch_ref_mmx2;
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&X264_CPU_CACHELINE_32 )
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
#endif // HIGH_BIT_DEPTH
}