pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_mbcmp_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
SAD_END_SSE2
RET
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+; Computes the SAD of the source 4x4 block (fenc) against the three cheap
+; intra predictions — V (vertical), H (horizontal), DC — in one pass.
+; Results: res[0]=V, res[1]=H, res[2]=DC.
+; r0 = fenc (stride FENC_STRIDE), r1 = fdec (stride FDEC_STRIDE), r2 = res.
+
+cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
+    pxor      mm7, mm7                   ; zero, used as psadbw operand for summing
+    movd      mm0, [r1-FDEC_STRIDE]      ; 4 top neighbor pixels
+    movd      mm1, [r0+FENC_STRIDE*0]    ; fenc row 0
+    movd      mm2, [r0+FENC_STRIDE*2]    ; fenc row 2
+    punpckldq mm0, mm0                   ; duplicate top row -> V pred for 2 rows at once
+    punpckldq mm1, [r0+FENC_STRIDE*1]    ; pack fenc rows 0,1 into one qword
+    punpckldq mm2, [r0+FENC_STRIDE*3]    ; pack fenc rows 2,3 into one qword
+    movq      mm6, mm0                   ; save top pixels (both dwords) for DC sum
+    movq      mm3, mm1                   ; copy of fenc rows 0,1 (mm1/mm2 reused later)
+    psadbw    mm3, mm0                   ; SAD(V pred, rows 0,1)
+    psadbw    mm0, mm2                   ; SAD(V pred, rows 2,3)
+    paddw     mm0, mm3
+    movd      [r2], mm0 ;V prediction cost
+    movd      mm3, [r1+FDEC_STRIDE*0-4]  ; left neighbor of row 0 lands in byte 3
+    movd      mm0, [r1+FDEC_STRIDE*1-4]  ; left neighbor of row 1
+    movd      mm4, [r1+FDEC_STRIDE*2-4]  ; left neighbor of row 2
+    movd      mm5, [r1+FDEC_STRIDE*3-4]  ; left neighbor of row 3
+    punpcklbw mm3, mm0                   ; interleave rows 0,1: left pixels now in bytes 6,7
+    punpcklbw mm4, mm5                   ; interleave rows 2,3: left pixels now in bytes 6,7
+    movq      mm5, mm3
+    punpckhwd mm5, mm4                   ; gather the 4 left pixels into the high dword
+    punpckhdq mm5, mm6                   ; mm5 = 4 left pixels + 4 top pixels
+    psadbw    mm5, mm7                   ; sum of all 8 neighbors (for DC)
+    punpckhbw mm3, mm3                   ; broadcast left pixels across rows for H pred:
+    punpckhbw mm4, mm4
+    punpckhwd mm3, mm3                   ; mm3 = H pred for rows 0,1 (each pixel x4)
+    punpckhwd mm4, mm4                   ; mm4 = H pred for rows 2,3
+    psraw     mm5, 2                     ; sum>>2 ...
+    pavgw     mm5, mm7                   ; ... then (x+1)>>1: together (sum+4)>>3 = DC value
+    punpcklbw mm5, mm5                   ; broadcast DC byte ...
+    pshufw    mm5, mm5, 0x0 ;DC prediction (... to all 8 lanes)
+    movq      mm6, mm5
+    psadbw    mm5, mm1                   ; SAD(DC, rows 0,1)
+    psadbw    mm6, mm2                   ; SAD(DC, rows 2,3)
+    psadbw    mm1, mm3                   ; SAD(H, rows 0,1)
+    psadbw    mm2, mm4                   ; SAD(H, rows 2,3)
+    paddw     mm5, mm6
+    paddw     mm1, mm2
+    movd      [r2+8], mm5 ;DC prediction cost
+    movd      [r2+4], mm1 ;H prediction cost
+    RET
+
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
int i_cost;
int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
h->mb.i_cbp_luma = 0;
- b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+ b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
if( a->i_mbrd )
i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
+ h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
report( "intra satd_x3 :" );
TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
+ TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 );
report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||