faster ESA init

author Loren Merritt <pengvado@akuvian.org>

Thu, 11 Dec 2008 19:47:17 +0000 (19:47 +0000)

committer Loren Merritt <pengvado@akuvian.org>

Mon, 22 Dec 2008 00:20:08 +0000 (00:20 +0000)
author Loren Merritt <pengvado@akuvian.org>
Thu, 11 Dec 2008 19:47:17 +0000 (19:47 +0000)
committer Loren Merritt <pengvado@akuvian.org>
Mon, 22 Dec 2008 00:20:08 +0000 (00:20 +0000)
diff --git a/common/common.h b/common/common.h

index f2a0c54a9bc5f2c6b8659b693fdfd4dd3cb01eea..1668a6300626e7c5cefe53093e4dd315ad9e5428 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -338,6 +338,7 @@ struct x264_t
          int i_max_ref1;
          int i_delay;    /* Number of frames buffered for B reordering */
          int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
+        int b_have_sub8x8_esa;
      } frames;
  
      /* current frame being encoded */
diff --git a/common/frame.c b/common/frame.c

index 482992d20dc4b59708c7241eba8c60cff6e500dd..021242f1446831b39b476e899602bc322bdd382c 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -99,7 +99,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
      if( h->param.analyse.i_me_method >= X264_ME_ESA )
      {
          CHECKED_MALLOC( frame->buffer[3],
-                        2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
          frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
      }
  
diff --git a/common/mc.c b/common/mc.c

index fe37c4704567c3b51721bf17a6650166fb9c8ca9..7422ba4e8930af8e10327307c3ef09f719d0622b 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -269,6 +269,42 @@ static void memzero_aligned( void * dst, int n )
      memset( dst, 0, n );
  }
  
+static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride ) /* C version of the integral_init4h hook: fills one row of `sum` from one row of `pix` */
+{
+    int x, v = pix[0]+pix[1]+pix[2]+pix[3]; /* v holds pix[x]+...+pix[x+3]; seeded here for x=0 */
+    for( x=0; x<stride-4; x++ ) /* entries stride-4..stride-1 of the row are not written */
+    {
+        sum[x] = v + sum[x-stride]; /* add the entry one row (stride elements) above, which the caller must already have filled; truncated to 16 bits on store */
+        v += pix[x+4] - pix[x]; /* slide the 4-pixel window one step to the right */
+    }
+}
+
+static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride ) /* C version of the integral_init8h hook: same as the 4-wide variant but with an 8-pixel window */
+{
+    int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7]; /* v holds pix[x]+...+pix[x+7]; seeded here for x=0 */
+    for( x=0; x<stride-8; x++ ) /* entries stride-8..stride-1 of the row are not written */
+    {
+        sum[x] = v + sum[x-stride]; /* add the entry one row (stride elements) above, which the caller must already have filled; truncated to 16 bits on store */
+        v += pix[x+8] - pix[x]; /* slide the 8-pixel window one step to the right */
+    }
+}
+
+static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride ) /* C version of the integral_init4v hook: vertical pass over rows written by integral_init4h, producing both a sum4 row and an in-place sum8 row */
+{
+    int x;
+    for( x=0; x<stride-8; x++ ) /* must run before the second loop, which overwrites sum8[x] */
+        sum4[x] = sum8[x+4*stride] - sum8[x]; /* entries 4 rows apart: with column-accumulated 4-wide inputs this is the sum of a 4x4 block */
+    for( x=0; x<stride-8; x++ ) /* ascending x, so sum8[x+4] is still the unmodified input when it is read */
+        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4]; /* two adjacent 4-wide columns over 8 rows: the sum of an 8x8 block, stored in place */
+}
+
+static void integral_init8v( uint16_t *sum8, int stride ) /* C version of the integral_init8v hook: vertical pass over rows written by integral_init8h */
+{
+    int x;
+    for( x=0; x<stride-8; x++ ) /* entries stride-8..stride-1 of the row are left untouched */
+        sum8[x] = sum8[x+8*stride] - sum8[x]; /* entries 8 rows apart: with column-accumulated 8-wide inputs this is the sum of an 8x8 block, stored in place */
+}
+
  void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
  {
      uint8_t *src = frame->plane[0];
@@ -353,6 +389,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
      pf->memzero_aligned = memzero_aligned;
      pf->frame_init_lowres_core = frame_init_lowres_core;
  
+    pf->integral_init4h = integral_init4h;
+    pf->integral_init8h = integral_init8h;
+    pf->integral_init4v = integral_init4v;
+    pf->integral_init8v = integral_init8v;
+
  #ifdef HAVE_MMX
      x264_mc_init_mmx( cpu, pf );
  #endif
@@ -370,7 +411,7 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
      int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
      int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
      int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
-    int x, y;
+    int y;
  
      if( mb_y & b_interlaced )
          return;
@@ -401,20 +442,22 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
              height += PADV-8;
          for( y = start; y < height; y++ )
          {
-            uint8_t  *ref  = frame->plane[0] + y * stride - PADH;
-            uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
-            uint16_t v = line[0] = 0;
-            for( x = 1; x < stride-1; x++ )
-                line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
-            line -= 8*stride;
-            if( y >= 9-PADV )
+            uint8_t  *pix  = frame->plane[0] + y * stride - PADH;
+            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+            uint16_t *sum4;
+            if( h->frames.b_have_sub8x8_esa )
+            {
+                h->mc.integral_init4h( sum8, pix, stride );
+                sum8 -= 8*stride;
+                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
+                if( y >= 8-PADV )
+                    h->mc.integral_init4v( sum8, sum4, stride );
+            }
+            else
              {
-                uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2);
-                for( x = 1; x < stride-8; x++, line++, sum4++ )
-                {
-                    sum4[0] =  line[4+4*stride] - line[4] - line[4*stride] + line[0];
-                    line[0] += line[8+8*stride] - line[8] - line[8*stride];
-                }
+                h->mc.integral_init8h( sum8, pix, stride );
+                if( y >= 8-PADV )
+                    h->mc.integral_init8v( sum8-8*stride, stride );
              }
          }
      }
diff --git a/common/mc.h b/common/mc.h

index 57c596cf522676e9bad7700960df1d5a468031d2..884d01659772d6494e5d57f94fedfeca484ab748 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -66,6 +66,12 @@ typedef struct
      void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
      void (*memzero_aligned)( void *dst, int n );
  
+    /* successive elimination prefilter */
+    void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
+    void (*integral_init8v)( uint16_t *sum8, int stride );
+
      void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                      int src_stride, int dst_stride, int width, int height );
  } x264_mc_functions_t;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index fa3e3bd9df1d8ba8e7db7a5253af935ace3996bc..82daf2ce03bfa01b776c215424b9fe95ffb529d2 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -694,6 +694,104 @@ MEMZERO sse2
  
  
  
+;-----------------------------------------------------------------------------
+; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4h_sse4, 3,4
+    lea     r3, [r0+r2*2]            ; r3 = sum + stride elements (2 bytes each); assumes r0/r1/r2 = sum/pix/stride as in the prototype above -- TODO confirm against cglobal's argument mapping
+    add     r1, r2                   ; r1 = pix + stride, so [r1+r2] addresses pix[i] once r2 is negated
+    neg     r2                       ; r2 = -stride: a negative index that counts up towards 0
+    pxor    m4, m4                   ; m4 = 0, used as the reference block for mpsadbw
+.loop:
+    movdqa  m0, [r1+r2]              ; 16 pixels starting at pix[i]; aligned load, so the pix row must be 16-byte aligned
+    movdqu  m1, [r1+r2+8]            ; 16 pixels starting at pix[i+8]; when stride is a multiple of 16 the last iteration reads 8 bytes past pix[stride-1]
+    mpsadbw m0, m4, 0                ; SAD against zero is a plain sum: word k = pix[i+k]+..+pix[i+k+3] for k=0..7 (per the SSE4.1 definition of mpsadbw)
+    mpsadbw m1, m4, 0                ; same for k=8..15
+    paddw   m0, [r0+r2*2]            ; add sum[i-stride .. i-stride+7], i.e. the row above
+    paddw   m1, [r0+r2*2+16]         ; add sum[i-stride+8 .. i-stride+15]
+    movdqa  [r3+r2*2   ], m0         ; store sum[i .. i+7]
+    movdqa  [r3+r2*2+16], m1         ; store sum[i+8 .. i+15]
+    add     r2, 16                   ; 16 pixels per iteration
+    jl .loop                         ; loop while the index is negative: the whole row is written, whereas the C version stops at stride-4
+    REP_RET
+
+cglobal x264_integral_init8h_sse4, 3,4
+    lea     r3, [r0+r2*2]            ; r3 = sum + stride elements (2 bytes each); assumes r0/r1/r2 = sum/pix/stride, matching the C integral_init8h signature -- TODO confirm against cglobal's argument mapping
+    add     r1, r2                   ; r1 = pix + stride, so [r1+r2] addresses pix[i] once r2 is negated
+    neg     r2                       ; r2 = -stride: a negative index that counts up towards 0
+    pxor    m4, m4                   ; m4 = 0, used as the reference block for mpsadbw
+.loop:
+    movdqa  m0, [r1+r2]              ; 16 pixels starting at pix[i]; aligned load, so the pix row must be 16-byte aligned
+    movdqu  m1, [r1+r2+8]            ; 16 pixels starting at pix[i+8]; when stride is a multiple of 16 the last iteration reads 8 bytes past pix[stride-1]
+    movdqa  m2, m0                   ; keep copies, because mpsadbw overwrites its first operand
+    movdqa  m3, m1
+    mpsadbw m0, m4, 0                ; word k = pix[i+k]+..+pix[i+k+3] for k=0..7 (SAD against zero; per the SSE4.1 definition of mpsadbw)
+    mpsadbw m1, m4, 0                ; same for k=8..15
+    mpsadbw m2, m4, 4                ; immediate 4 shifts the window start by 4 bytes: word k = pix[i+k+4]+..+pix[i+k+7]
+    mpsadbw m3, m4, 4                ; same for k=8..15
+    paddw   m0, [r0+r2*2]            ; add sum[i-stride .. i-stride+7], i.e. the row above
+    paddw   m1, [r0+r2*2+16]         ; add sum[i-stride+8 .. i-stride+15]
+    paddw   m0, m2                   ; low 4-sum + high 4-sum = 8-pixel window sum
+    paddw   m1, m3
+    movdqa  [r3+r2*2   ], m0         ; store sum[i .. i+7]
+    movdqa  [r3+r2*2+16], m1         ; store sum[i+8 .. i+15]
+    add     r2, 16                   ; 16 pixels per iteration
+    jl .loop                         ; loop while the index is negative: the whole row is written, whereas the C version stops at stride-8
+    REP_RET
+
+%macro INTEGRAL_INIT 1
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4v_%1, 3,5
+    shl   r2, 1                      ; r2 = stride in bytes (uint16_t elements); assumes r0/r1/r2 = sum8/sum4/stride as in the prototype above -- TODO confirm against cglobal's argument mapping
+    add   r0, r2                     ; r0 = end of sum8 row 0, indexed below with a negative offset
+    add   r1, r2                     ; r1 = end of the sum4 row
+    lea   r3, [r0+r2*4]              ; r3 = end of the sum8 row 4 rows down
+    lea   r4, [r0+r2*8]              ; r4 = end of the sum8 row 8 rows down
+    neg   r2                         ; r2 = -stride*2: negative byte index counting up to 0
+.loop:
+    movu  m0, [r0+r2+8]              ; sum8[x+4] (8 bytes = 4 elements ahead, hence the unaligned load)
+    mova  m2, [r0+r2]                ; sum8[x]
+    movu  m1, [r4+r2+8]              ; sum8[x+8*stride+4]
+    paddw m0, m2                     ; m0 = sum8[x] + sum8[x+4]
+    paddw m1, [r4+r2]                ; m1 = sum8[x+8*stride] + sum8[x+8*stride+4]
+    mova  m3, [r3+r2]                ; sum8[x+4*stride]
+    psubw m1, m0                     ; new sum8[x], same expression as the second loop of the C integral_init4v
+    psubw m3, m2                     ; sum4[x] = sum8[x+4*stride] - sum8[x]
+    mova  [r0+r2], m1                ; store the new sum8 value in place
+    mova  [r1+r2], m3                ; store the sum4 value
+    add   r2, mmsize                 ; advance by one vector register's worth of bytes
+    jl .loop                         ; loop while the index is negative: the whole row is processed, whereas the C version stops at stride-8
+    REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init8v_%1, 3,3
+    shl   r1, 1                      ; r1 = stride in bytes; assumes r0/r1 = sum8/stride as in the prototype above -- TODO confirm against cglobal's argument mapping
+    add   r0, r1                     ; r0 = end of sum8 row 0
+    lea   r2, [r0+r1*8]              ; r2 = end of the row 8 rows down; r2 is only a scratch pointer here (the prototype has two arguments)
+    neg   r1                         ; r1 = -stride*2: negative byte index counting up to 0
+.loop:
+    mova  m0, [r2+r1]                ; sum8[x+8*stride], first vector
+    mova  m1, [r2+r1+mmsize]         ; sum8[x+8*stride], second vector
+    psubw m0, [r0+r1]                ; minus sum8[x]
+    psubw m1, [r0+r1+mmsize]
+    mova  [r0+r1], m0                ; store in place
+    mova  [r0+r1+mmsize], m1
+    add   r1, 2*mmsize               ; two vectors per iteration
+    jl .loop                         ; loop while the index is negative
+    REP_RET
+%endmacro
+
+INIT_MMX
+INTEGRAL_INIT mmx
+INIT_XMM
+INTEGRAL_INIT sse2
+
+
+
  %macro FILT8x4 7
      mova      %3, [r0+%7]
      mova      %4, [r0+r5+%7]
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 0ec7adef074bfa25326d87b821e5c7605703478e..56ca4c4e3bd47a8043482612ec71118e8157eac4 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -64,6 +64,12 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
  extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
  extern void x264_memzero_aligned_mmx( void * dst, int n );
  extern void x264_memzero_aligned_sse2( void * dst, int n );
+extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
  #define LOWRES(cpu) \
  extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
                                                 int src_stride, int dst_stride, int width, int height );
@@ -242,6 +248,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
      pf->memcpy_aligned = x264_memcpy_aligned_mmx;
      pf->memzero_aligned = x264_memzero_aligned_mmx;
+    pf->integral_init4v = x264_integral_init4v_mmx;
+    pf->integral_init8v = x264_integral_init8v_mmx;
  
      if( !(cpu&X264_CPU_MMXEXT) )
          return;
@@ -286,6 +294,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  
      pf->memcpy_aligned = x264_memcpy_aligned_sse2;
      pf->memzero_aligned = x264_memzero_aligned_sse2;
+    pf->integral_init4v = x264_integral_init4v_sse2;
+    pf->integral_init8v = x264_integral_init8v_sse2;
      pf->hpel_filter = x264_hpel_filter_sse2_amd;
  
      if( cpu&X264_CPU_SSE2_IS_SLOW )
@@ -331,4 +341,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->hpel_filter = x264_hpel_filter_ssse3;
      pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
      pf->mc_chroma = x264_mc_chroma_ssse3;
+
+    if( !(cpu&X264_CPU_SSE4) )
+        return;
+
+    pf->integral_init4h = x264_integral_init4h_sse4;
+    pf->integral_init8h = x264_integral_init8h_sse4;
  }
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 73d33f4a61c7de2ffdf0fc7788a2e9187795dba7..0a91134ec65610fec48caa2f59911f55092c4558 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -713,6 +713,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
            || h->param.i_bframe_adaptive
            || h->param.b_pre_scenecut );
      h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+    h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
  
      h->frames.i_last_idr = - h->param.i_keyint_max;
      h->frames.i_input    = 0;
@@ -839,6 +840,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
      // can only twiddle these if they were enabled to begin with:
      if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
          COPY( analyse.i_me_method );
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
+        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
      if( h->pps->b_transform_8x8_mode )
          COPY( analyse.b_transform_8x8 );
      if( h->frames.i_max_ref1 > 1 )
diff --git a/tools/checkasm.c b/tools/checkasm.c

index e810cdcca4ffb2fbc59e7671290a9e7a28cb88e0..1c1731997cb0bfa16269135d067615ff00002a9b 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -822,33 +822,57 @@ static int check_mc( int cpu_ref, int cpu_new )
          uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
          uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
          set_func_name( "lowres_init" );
+        ok = 1; used_asm = 1;
          for( w=40; w<=48; w+=8 )
-            if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+        {
+            int stride = (w+8)&~15;
+            call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+            call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+            for( i=0; i<16; i++)
              {
-                int stride = (w+8)&~15;
-                used_asm = 1;
-                call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
-                call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
-                for( i=0; i<16; i++)
-                {
-                    for( j=0; j<4; j++)
-                        if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
-                        {
-                            ok = 0;
-                            fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
-                            for( k=0; k<w; k++ )
-                                printf( "%d ", dstc[j][k+i*stride] );
-                            printf("\n");
-                            for( k=0; k<w; k++ )
-                                printf( "%d ", dsta[j][k+i*stride] );
-                            printf("\n");
-                            break;
-                        }
-                }
+                for( j=0; j<4; j++)
+                    if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+                    {
+                        ok = 0;
+                        fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
+                        for( k=0; k<w; k++ )
+                            printf( "%d ", dstc[j][k+i*stride] );
+                        printf("\n");
+                        for( k=0; k<w; k++ )
+                            printf( "%d ", dsta[j][k+i*stride] );
+                        printf("\n");
+                        break;
+                    }
              }
+        }
          report( "lowres init :" );
      }
  
+#define INTEGRAL_INIT( name, size, ... )\
+    if( mc_a.name != mc_ref.name )\
+    {\
+        int stride = 80;\
+        set_func_name( #name );\
+        used_asm = 1;\
+        memcpy( buf3, buf1, size*2*stride );\
+        memcpy( buf4, buf1, size*2*stride );\
+        uint16_t *sum = (uint16_t*)buf3;\
+        call_c1( mc_c.name, __VA_ARGS__ );\
+        sum = (uint16_t*)buf4;\
+        call_a1( mc_a.name, __VA_ARGS__ );\
+        if( memcmp( buf3, buf4, (stride-8)*2 )\
+            || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
+            ok = 0;\
+        call_c2( mc_c.name, __VA_ARGS__ );\
+        call_a2( mc_a.name, __VA_ARGS__ );\
+    }
+    ok = 1; used_asm = 0;
+    INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
+    INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
+    INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
+    INTEGRAL_INIT( integral_init8v, 9, sum, stride );
+    report( "integral init :" );
+
      return ret;
  }
author	Loren Merritt <pengvado@akuvian.org>
	Thu, 11 Dec 2008 19:47:17 +0000 (19:47 +0000)
committer	Loren Merritt <pengvado@akuvian.org>
	Mon, 22 Dec 2008 00:20:08 +0000 (00:20 +0000)
common/common.h		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/mc.c		patch \| blob \| history
common/mc.h		patch \| blob \| history
common/x86/mc-a2.asm		patch \| blob \| history
common/x86/mc-c.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history