#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
;-----------------------------------------------------------------------------
; NOTE(review): this macro contains unresolved diff/merge markers ('-' and '+'
; line prefixes). It will not assemble as-is: the patch must be applied first
; (keep '+' lines, drop '-' lines, strip the markers). Comments below annotate
; intent only; no instruction bytes were changed.
;
; predict_16x16_h: horizontal intra prediction for a 16x16 block.
; Each row of the block is filled with a copy of the pixel immediately to
; its left (column -1 of fdec). r0 = dst/src pointer into the fdec buffer.
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
; --- old implementation (the '-' lines): per-bit-depth 4-row unrolled loop,
;     r1 walks row offsets from 12*FDEC_STRIDEB down to 0 in steps of 4 rows.
- mov r1, 12*FDEC_STRIDEB
-%if HIGH_BIT_DEPTH
-.vloop:
-%assign Y 0
-%rep 4
; load the left-neighbour pixel (16-bit), broadcast it across the register
- movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL]
- SPLATW m0, m0, 1
; store the broadcast value across the full 16-pixel (32-byte) row
- mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0
- mova [r0+r1+Y*FDEC_STRIDEB+16], m0
%if mmsize==8
; MMX registers are 8 bytes, so two extra stores are needed to cover the row
- mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0
- mova [r0+r1+Y*FDEC_STRIDEB+24], m0
-%endif
-%assign Y Y+1
-%endrep
-
-%else ; !HIGH_BIT_DEPTH
%if cpuflag(ssse3)
; pb_3 is the shuffle-control constant consumed by SPLATB_LOAD's pshufb path
- mova m1, [pb_3]
-%endif
-.vloop:
-%assign Y 0
-%rep 4
; broadcast the byte at column -1 of row Y across the whole register
- SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1
- mova [r0+r1+FDEC_STRIDE*Y], m0
%if mmsize==8
- mova [r0+r1+FDEC_STRIDE*Y+8], m0
; --- new implementation (the '+' lines): delegate the per-row work to the
;     shared PRED_H_4ROWS helper and run it 4 times (4 rows each = 16 rows).
;     presumably PRED_H_4ROWS expects the pb_3 constant in m2 on the ssse3
;     (non-avx2) path — TODO confirm against its definition.
+%if cpuflag(ssse3) && notcpuflag(avx2)
+ mova m2, [pb_3]
%endif
-%assign Y Y+1
-%endrep
-%endif ; HIGH_BIT_DEPTH
- sub r1, 4*FDEC_STRIDEB
- jge .vloop
; r1d counts the remaining 4-row groups in the new loop shape
+ mov r1d, 4
+.loop:
+ PRED_H_4ROWS 16, 1
+ dec r1d
+ jg .loop
RET
%endmacro
INIT_MMX mmx2
PREDICT_16x16_H
-INIT_XMM sse2
%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_16x16_H
+INIT_YMM avx2
PREDICT_16x16_H
%else
;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
#endif
+ if( !(cpu&X264_CPU_AVX2) )
+ return;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2;
#else
#if !ARCH_X86_64
pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2;
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
+void x264_predict_16x16_h_avx2( uint16_t *src );
void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
int ret = 0, ok = 1, used_asm = 0;
ALIGNED_ARRAY_32( pixel, edge,[36] );
ALIGNED_ARRAY_32( pixel, edge2,[36] );
- ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
+ ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] );
struct
{
x264_predict_t predict_16x16[4+3];