]> git.sesse.net Git - x264/blobdiff - common/pixel.c
Much faster weightp
[x264] / common / pixel.c
index 72b5e6fd3002dcedc171b93e314d92df58c963ec..7c6023711e94e18b37ff794df07addfee0de54af 100644 (file)
@@ -29,6 +29,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/pixel.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/pixel.h"
+#endif
 #ifdef ARCH_UltraSparc
 #   include "sparc/pixel.h"
 #endif
@@ -139,10 +142,10 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
 /****************************************************************************
  * pixel_var_wxh
  ****************************************************************************/
-#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride ) \
+#define PIXEL_VAR_C( name, w ) \
+static uint64_t name( uint8_t *pix, int i_stride ) \
 {                                             \
-    uint32_t var = 0, sum = 0, sqr = 0;       \
+    uint32_t sum = 0, sqr = 0;                \
     int x, y;                                 \
     for( y = 0; y < w; y++ )                  \
     {                                         \
@@ -153,12 +156,35 @@ static int name( uint8_t *pix, int i_stride ) \
         }                                     \
         pix += i_stride;                      \
     }                                         \
-    var = sqr - (sum * sum >> shift);         \
-    return var;                               \
+    return sum + ((uint64_t)sqr << 32);       \
 }
 
-PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
-PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8,    8 )
+
+/****************************************************************************
+ * pixel_var2_wxh
+ ****************************************************************************/
+static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
+{
+    uint32_t var = 0, sum = 0, sqr = 0;
+    int x, y;
+    for( y = 0; y < 8; y++ )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            int diff = pix1[x] - pix2[x];
+            sum += diff;
+            sqr += diff * diff;
+        }
+        pix1 += i_stride1;
+        pix2 += i_stride2;
+    }
+    sum = abs(sum);
+    var = sqr - (sum * sum >> 6);
+    *ssd = sqr;
+    return var;
+}
 
 
 #define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
@@ -429,6 +455,10 @@ SATD_X_DECL7( _ssse3 )
 SATD_X_DECL7( _sse4 )
 #endif
 
+#ifdef HAVE_ARMV6
+SATD_X_DECL7( _neon )
+#endif
+
 /****************************************************************************
  * structural similarity metric
  ****************************************************************************/
@@ -611,6 +641,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
+    pixf->var2_8x8 = pixel_var2_8x8;
 
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
@@ -636,6 +667,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
+        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
 
         if( cpu&X264_CPU_CACHELINE_32 )
         {
@@ -665,6 +697,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmxext;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
         pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmxext;
+        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmxext;
         pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
         pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmxext;
     }
@@ -681,6 +714,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
 #endif
+        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
     }
 
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -760,6 +794,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
 #endif
+        pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
@@ -786,6 +821,47 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     }
 #endif //HAVE_MMX
 
+#ifdef HAVE_ARMV6
+    if( cpu&X264_CPU_ARMV6 )
+    {
+        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
+        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
+        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
+        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
+    }
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT5( sad, _neon );
+        INIT5( sad_aligned, _neon );
+        INIT7( sad_x3, _neon );
+        INIT7( sad_x4, _neon );
+        INIT7( ssd, _neon );
+        INIT7( satd, _neon );
+        INIT7( satd_x3, _neon );
+        INIT7( satd_x4, _neon );
+        INIT4( hadamard_ac, _neon );
+        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
+        pixf->var2_8x8          = x264_pixel_var2_8x8_neon;
+
+        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
+        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
+
+        if( cpu&X264_CPU_FAST_NEON_MRC )
+        {
+            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
+            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
+            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
+            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
+        }
+        else    // really just scheduled for dual issue / A8
+        {
+            INIT5( sad_aligned, _neon_dual );
+        }
+    }
+#endif
 #ifdef ARCH_PPC
     if( cpu&X264_CPU_ALTIVEC )
     {