Some initial 4:2:2 x86 asm

author Fiona Glaser <fiona@x264.com>

Sun, 4 Sep 2011 09:31:29 +0000 (11:31 +0200)

committer Fiona Glaser <fiona@x264.com>

Wed, 21 Sep 2011 16:54:47 +0000 (09:54 -0700)
author Fiona Glaser <fiona@x264.com>
Sun, 4 Sep 2011 09:31:29 +0000 (11:31 +0200)
committer Fiona Glaser <fiona@x264.com>
Wed, 21 Sep 2011 16:54:47 +0000 (09:54 -0700)
diff --git a/common/pixel.c b/common/pixel.c

index b346681b4bd36a7d8e22acca4211457a85f96e06..065582688bc6bb372485672a480225996da704f4 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -931,13 +931,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  #if HAVE_MMX
      if( cpu&X264_CPU_MMX )
      {
-        INIT7( ssd, _mmx );
+        INIT8( ssd, _mmx );
      }
  
      if( cpu&X264_CPU_MMX2 )
      {
-        INIT7( sad, _mmx2 );
-        INIT7_NAME( sad_aligned, sad, _mmx2 );
+        INIT8( sad, _mmx2 );
+        INIT8_NAME( sad_aligned, sad, _mmx2 );
          INIT7( sad_x3, _mmx2 );
          INIT7( sad_x4, _mmx2 );
          INIT7( satd, _mmx2 );
@@ -946,6 +946,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          INIT4( hadamard_ac, _mmx2 );
          INIT_ADS( _mmx2 );
          pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
          pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
          pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_mmx2;
  #if ARCH_X86
@@ -1020,6 +1021,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          }
          INIT_ADS( _sse2 );
          pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
+        pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
          pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
          if( cpu&X264_CPU_CACHELINE_64 )
          {
@@ -1073,7 +1075,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          INIT_ADS( _ssse3 );
          if( !(cpu&X264_CPU_SLOW_ATOM) )
          {
-            INIT7( ssd, _ssse3 );
+            INIT8( ssd, _ssse3 );
              pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
              pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
              INIT7( satd, _ssse3 );
@@ -1139,6 +1141,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  #endif
          pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
          pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx;
          pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
          pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
          pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
diff --git a/common/predict.c b/common/predict.c

index f5ed64260cd639201a876dc526eed3303fb8405b..8b8a6c5d2e280f11639c8ee415f9a78f5c97545a 100644 (file)
--- a/common/predict.c
+++ b/common/predict.c
@@ -933,6 +933,10 @@ void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
      pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c;
      pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c;
      pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c;
+
+#if HAVE_MMX
+    x264_predict_8x16c_init_mmx( cpu, pf );
+#endif
  }
  
  void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm

index 6043a89583dc321e1ee9031faffe13b36d92c04c..0a513e1158d7cee0e3168e6a893af7c3a88a7c5b 100644 (file)
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -617,6 +617,7 @@ cglobal pixel_avg_w%1
  
  INIT_MMX mmx2
  AVG_FUNC 4, movq, movq
+AVGH 4, 16
  AVGH 4, 8
  AVGH 4, 4
  AVGH 4, 2
@@ -632,6 +633,7 @@ AVGH 16,  8
  
  INIT_XMM sse2
  AVG_FUNC 4, movq, movq
+AVGH  4, 16
  AVGH  4, 8
  AVGH  4, 4
  AVGH  4, 2
@@ -649,6 +651,7 @@ AVGH  16,  8
  
  INIT_MMX mmx2
  AVG_FUNC 4, movd, movd
+AVGH 4, 16
  AVGH 4, 8
  AVGH 4, 4
  AVGH 4, 2
@@ -676,6 +679,7 @@ AVGH  8, 16
  AVGH  8,  8
  AVGH  8,  4
  INIT_MMX ssse3
+AVGH  4, 16
  AVGH  4,  8
  AVGH  4,  4
  AVGH  4,  2
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 6a730475771be461de4fd741da8707f58620d392..1a26e44fc81dd9d44f3df70670acef5664487620 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -42,6 +42,7 @@ DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, int, pixel *, int, pixel *, int, int
  DECL_SUF( x264_pixel_avg_8x16,  ( pixel *, int, pixel *, int, pixel *, int, int ))
  DECL_SUF( x264_pixel_avg_8x8,   ( pixel *, int, pixel *, int, pixel *, int, int ))
  DECL_SUF( x264_pixel_avg_8x4,   ( pixel *, int, pixel *, int, pixel *, int, int ))
+DECL_SUF( x264_pixel_avg_4x16,  ( pixel *, int, pixel *, int, pixel *, int, int ))
  DECL_SUF( x264_pixel_avg_4x8,   ( pixel *, int, pixel *, int, pixel *, int, int ))
  DECL_SUF( x264_pixel_avg_4x4,   ( pixel *, int, pixel *, int, pixel *, int, int ))
  DECL_SUF( x264_pixel_avg_4x2,   ( pixel *, int, pixel *, int, pixel *, int, int ))
@@ -526,6 +527,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmx2;
      pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmx2;
      pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmx2;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_mmx2;
      pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmx2;
      pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmx2;
      pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmx2;
@@ -582,6 +584,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
      pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
      pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sse2;
      pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sse2;
      pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sse2;
      pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sse2;
@@ -691,6 +694,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_ssse3;
      pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_ssse3;
      pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_ssse3;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_ssse3;
      pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
      pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
      pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm

index 21170cdb688aa1483261c63141e0cbbccf322d41..b094c9c247b199fb9b8f69c46de9f1e318533c75 100644 (file)
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -384,6 +384,7 @@ SSD  8, 16
  SSD  4,  4
  SSD  8,  4
  SSD  4,  8
+SSD  4, 16
  INIT_XMM sse2slow
  SSD 16, 16
  SSD  8,  8
@@ -415,6 +416,7 @@ SSD  8,  4
  INIT_MMX ssse3
  SSD  4,  4
  SSD  4,  8
+SSD  4, 16
  %assign function_align 16
  %endif ; !HIGH_BIT_DEPTH
  
@@ -659,6 +661,12 @@ cglobal pixel_var_16x16_mmx2, 2,3
      VAR_2ROW 8*SIZEOF_PIXEL, 16
      VAR_END 16, 16
  
+cglobal pixel_var_8x16_mmx2, 2,3
+    FIX_STRIDES r1
+    VAR_START 0
+    VAR_2ROW r1, 8
+    VAR_END 8, 16
+
  cglobal pixel_var_8x8_mmx2, 2,3
      FIX_STRIDES r1
      VAR_START 0
@@ -726,6 +734,22 @@ cglobal pixel_var_8x8, 2,4,8
      dec r2d
      jg .loop
      VAR_END 8, 8
+
+cglobal pixel_var_8x16, 2,4,8 ; variance sums over 16 rows starting at r0; r1 is presumably the row stride (TODO confirm against VAR_START)
+    VAR_START 1
+    mov      r2d, 4 ; loop counter: 4 iterations x 4 rows each = 16 rows
+    lea       r3, [r1*3] ; r3 = 3*r1, offset of the fourth row of each group
+.loop:
+    movh      m0, [r0] ; row 0 (movh presumably fills the low half of m0)
+    movh      m3, [r0+r1] ; row 1 into m3
+    movhps    m0, [r0+r1*2] ; row 2 into the high 64 bits of m0
+    movhps    m3, [r0+r3] ; row 3 into the high 64 bits of m3
+    DEINTB    1, 0, 4, 3, 7 ; NOTE(review): macro defined elsewhere; presumably splits bytes into words for VAR_CORE
+    lea       r0, [r0+r1*4] ; advance the source pointer by four rows
+    VAR_CORE
+    dec r2d
+    jg .loop ; repeat until all four row groups are consumed
+    VAR_END 8, 16
  %endmacro ; VAR
  
  INIT_XMM sse2
diff --git a/common/x86/pixel.h b/common/x86/pixel.h

index d5cdb4a442c256746f153a03962ee4da39594bdc..c56240185a2c90ea5fdf14ab913e2a5a46da8aa5 100644 (file)
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -34,6 +34,7 @@
      ret x264_pixel_##name##_8x16_##suffix args;\
      ret x264_pixel_##name##_8x8_##suffix args;\
      ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x16_##suffix args;\
      ret x264_pixel_##name##_4x8_##suffix args;\
      ret x264_pixel_##name##_4x4_##suffix args;\
  
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm

index c505573e5bd631519b39cf3ed8a12b7c08b5d0bb..f676c05a41fc32985ccf24cf5867f207d28505db 100644 (file)
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -56,7 +56,7 @@ cextern pw_16
  cextern pw_00ff
  cextern pw_pixel_max
  
-%macro STORE8x8 2
+%macro STORE8x8 2-4
      add r0, 4*FDEC_STRIDEB
      mova        [r0 + -4*FDEC_STRIDEB], %1
      mova        [r0 + -3*FDEC_STRIDEB], %1
@@ -68,6 +68,28 @@ cextern pw_pixel_max
      mova        [r0 +  3*FDEC_STRIDEB], %2
  %endmacro
  
+%macro STORE8x16 4 ; store 16 rows at r0: each of the four arguments fills four consecutive rows
+    add r0, 4*FDEC_STRIDEB ; bias r0 by four rows so the group below is addressed at offsets -4..-1
+    mova        [r0 + -4*FDEC_STRIDEB], %1
+    mova        [r0 + -3*FDEC_STRIDEB], %1
+    mova        [r0 + -2*FDEC_STRIDEB], %1
+    mova        [r0 + -1*FDEC_STRIDEB], %1
+    add r0, 4*FDEC_STRIDEB ; next group: rows 4-7 take the second argument
+    mova        [r0 + -4*FDEC_STRIDEB], %2
+    mova        [r0 + -3*FDEC_STRIDEB], %2
+    mova        [r0 + -2*FDEC_STRIDEB], %2
+    mova        [r0 + -1*FDEC_STRIDEB], %2
+    add r0, 4*FDEC_STRIDEB ; last bias: rows 8-11 at offsets -4..-1, rows 12-15 at offsets 0..3
+    mova        [r0 + -4*FDEC_STRIDEB], %3
+    mova        [r0 + -3*FDEC_STRIDEB], %3
+    mova        [r0 + -2*FDEC_STRIDEB], %3
+    mova        [r0 + -1*FDEC_STRIDEB], %3
+    mova        [r0 +  0*FDEC_STRIDEB], %4
+    mova        [r0 +  1*FDEC_STRIDEB], %4
+    mova        [r0 +  2*FDEC_STRIDEB], %4
+    mova        [r0 +  3*FDEC_STRIDEB], %4
+%endmacro ; on exit r0 has been advanced by 12*FDEC_STRIDEB in total
+
  %macro STORE16x16 2-4
  %ifidn %0, 4
      mov         r1d, 8
@@ -1569,44 +1591,77 @@ cglobal predict_8x8c_v_mmx, 1,1
  
  %endif
  
+%macro PREDICT_8x16C_V 0
+cglobal predict_8x16c_v, 1,1
+    mova        m0, [r0 - FDEC_STRIDEB]
+    STORE8x16    m0, m0, m0, m0
+    RET
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_8x16C_V
+%else
+INIT_MMX mmx
+PREDICT_8x16C_V
+%endif
+
  ;-----------------------------------------------------------------------------
  ; void predict_8x8c_h( uint8_t *src )
  ;-----------------------------------------------------------------------------
  %ifdef HIGH_BIT_DEPTH
  
  INIT_XMM sse2
-cglobal predict_8x8c_h, 1,1
+%macro PREDICT_C_H 1
+cglobal predict_8x%1c_h, 1,1
      add        r0, FDEC_STRIDEB*4
  %assign Y -4
-%rep 8
+%rep %1
      movd       m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
      SPLATW     m0, m0, 1
      mova [r0+FDEC_STRIDEB*Y], m0
  %assign Y Y+1
  %endrep
      RET
+%endmacro
+
+PREDICT_C_H 8
+PREDICT_C_H 16
  
  %else ; !HIGH_BIT_DEPTH
  
-%macro PREDICT_8x8C_H 0
-cglobal predict_8x8c_h, 1,1
-%if cpuflag(ssse3)
-    mova   m1, [pb_3]
-%endif
-    add    r0, FDEC_STRIDE*4
-%assign Y -4
-%rep 8
+%macro PREDICT_C_H_CORE 1
+%assign Y %1
+%rep 4
      SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
      mova [r0+FDEC_STRIDE*Y], m0
  %assign Y Y+1
  %endrep
+%endmacro
+
+%macro PREDICT_C_H 1
+cglobal predict_8x%1c_h, 1,1
+%if cpuflag(ssse3)
+    mova   m1, [pb_3]
+%endif
+%if %1==16
+    add    r0, FDEC_STRIDE*4
+    PREDICT_C_H_CORE -4
+    add    r0, FDEC_STRIDE*4
+    PREDICT_C_H_CORE -4
+%endif
+    add    r0, FDEC_STRIDE*4
+    PREDICT_C_H_CORE -4
+    PREDICT_C_H_CORE 0
      RET
  %endmacro
  
  INIT_MMX mmx2
-PREDICT_8x8C_H
+PREDICT_C_H 8
+PREDICT_C_H 16
  INIT_MMX ssse3
-PREDICT_8x8C_H
+PREDICT_C_H 8
+PREDICT_C_H 16
  
  %endif
  ;-----------------------------------------------------------------------------
@@ -1704,9 +1759,10 @@ INIT_MMX sse2
  PREDICT_8x8C_DC
  %endif
  
+%macro PREDICT_C_DC_TOP 1
  %ifdef HIGH_BIT_DEPTH
  INIT_XMM
-cglobal predict_8x8c_dc_top_sse2, 1,1
+cglobal predict_8x%1c_dc_top_sse2, 1,1
      pxor        m2, m2
      mova        m0, [r0 - FDEC_STRIDEB]
      pshufd      m1, m0, q2301
@@ -1716,11 +1772,11 @@ cglobal predict_8x8c_dc_top_sse2, 1,1
      paddw       m0, m1
      psrlw       m0, 1
      pavgw       m0, m2
-    STORE8x8    m0, m0
+    STORE8x%1   m0, m0, m0, m0
      RET
  %else ; !HIGH_BIT_DEPTH
  INIT_MMX
-cglobal predict_8x8c_dc_top_mmx2, 1,1
+cglobal predict_8x%1c_dc_top_mmx2, 1,1
      movq        mm0, [r0 - FDEC_STRIDE]
      pxor        mm1, mm1
      pxor        mm2, mm2
@@ -1735,9 +1791,13 @@ cglobal predict_8x8c_dc_top_mmx2, 1,1
      pshufw      mm1, mm1, 0
      pshufw      mm0, mm0, 0     ; dc0 (w)
      packuswb    mm0, mm1        ; dc0,dc1 (b)
-    STORE8x8    mm0, mm0
+    STORE8x%1   mm0, mm0, mm0, mm0
      RET
  %endif
+%endmacro
+
+PREDICT_C_DC_TOP 8
+PREDICT_C_DC_TOP 16
  
  ;-----------------------------------------------------------------------------
  ; void predict_16x16_v( pixel *src )
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c

index d0b7d3954a5283c8e2bca87f2de99c104fad9a07..38f7081044902e592e47eaf326156627a5ea9724 100644 (file)
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -362,6 +362,28 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
  #endif // HIGH_BIT_DEPTH
  }
  
+void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] ) // install the x86 asm 8x16 chroma predictors permitted by the cpu flags
+{
+    if( !(cpu&X264_CPU_MMX) )
+        return;
+#if HIGH_BIT_DEPTH
+    if( !(cpu&X264_CPU_SSE2) ) // every high-bit-depth routine below is SSE2 code, so gate all of them
+        return;
+    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_sse2;
+    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_sse2;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_sse2;
+#else
+    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_mmx;
+    if( !(cpu&X264_CPU_MMX2) ) // the remaining 8-bit routines need at least MMX2
+        return;
+    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_mmx2;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_mmx2;
+    if( !(cpu&X264_CPU_SSSE3) )
+        return;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_ssse3; // replaces the mmx2 horizontal predictor
+#endif // HIGH_BIT_DEPTH
+}
+
  void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
  {
      if( !(cpu&X264_CPU_MMX2) )
diff --git a/common/x86/predict.h b/common/x86/predict.h

index fba2f75fc40bd529a5846ca7d1cd73c2c1285fd5..4bf639d222b9e0e46c64da36e633c387f4f9c84e 100644 (file)
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -28,6 +28,7 @@
  #define X264_I386_PREDICT_H
  
  void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x16c_init_mmx  ( int cpu, x264_predict_t pf[7] );
  void x264_predict_8x8c_init_mmx  ( int cpu, x264_predict_t pf[7] );
  void x264_predict_4x4_init_mmx   ( int cpu, x264_predict_t pf[12] );
  void x264_predict_8x8_init_mmx   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
@@ -49,6 +50,13 @@ void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
  void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
  void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
  void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
+void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
+void x264_predict_8x16c_v_mmx( uint8_t *src );
+void x264_predict_8x16c_v_sse2( uint16_t *src );
+void x264_predict_8x16c_h_mmx2( uint8_t *src );
+void x264_predict_8x16c_h_sse2( pixel *src );
+void x264_predict_8x16c_h_ssse3( uint8_t *src );
  void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
  void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
  void x264_predict_8x8c_dc_mmx2( pixel *src );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm

index 743afcda460b1ff8e1ba5c64d6a8e4f22f5daedf..2453e4c3bd7d655c52719c78a2104d94bdb54b01 100644 (file)
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -110,6 +110,7 @@ SAD 16,  8
  SAD  8, 16
  SAD  8,  8
  SAD  8,  4
+SAD  4, 16
  SAD  4,  8
  SAD  4,  4
author	Fiona Glaser <fiona@x264.com>
	Sun, 4 Sep 2011 09:31:29 +0000 (11:31 +0200)
committer	Fiona Glaser <fiona@x264.com>
	Wed, 21 Sep 2011 16:54:47 +0000 (09:54 -0700)
common/pixel.c		patch \| blob \| history
common/predict.c		patch \| blob \| history
common/x86/mc-a.asm		patch \| blob \| history
common/x86/mc-c.c		patch \| blob \| history
common/x86/pixel-a.asm		patch \| blob \| history
common/x86/pixel.h		patch \| blob \| history
common/x86/predict-a.asm		patch \| blob \| history
common/x86/predict-c.c		patch \| blob \| history
common/x86/predict.h		patch \| blob \| history
common/x86/sad-a.asm		patch \| blob \| history