avutil/mips: optimize UNPCK&SAD macros with MSA2.0 instruction.

author Shiyou Yin <yinshiyou-hf@loongson.cn>

Tue, 9 Jul 2019 12:43:37 +0000 (20:43 +0800)

committer Michael Niedermayer <michael@niedermayer.cc>

Wed, 10 Jul 2019 10:54:57 +0000 (12:54 +0200)
author Shiyou Yin <yinshiyou-hf@loongson.cn>
Tue, 9 Jul 2019 12:43:37 +0000 (20:43 +0800)
committer Michael Niedermayer <michael@niedermayer.cc>
Wed, 10 Jul 2019 10:54:57 +0000 (12:54 +0200)
diff --git a/configure b/configure

index 4005987409e20d54a5bf1bb67f56c07ee4ccd2e9..5a4f507246b8881fa349ff96c34f566b84eab0ae 100755 (executable)
--- a/configure
+++ b/configure
@@ -441,6 +441,7 @@ Optimization options (experts only):
    --disable-mipsdsp        disable MIPS DSP ASE R1 optimizations
    --disable-mipsdspr2      disable MIPS DSP ASE R2 optimizations
    --disable-msa            disable MSA optimizations
+  --disable-msa2           disable MSA2 optimizations
    --disable-mipsfpu        disable floating point MIPS optimizations
    --disable-mmi            disable Loongson SIMD optimizations
    --disable-fast-unaligned consider unaligned accesses slow
@@ -1999,6 +2000,7 @@ ARCH_EXT_LIST_MIPS="
      mipsdsp
      mipsdspr2
      msa
+    msa2
  "
  
  ARCH_EXT_LIST_LOONGSON="
@@ -2527,6 +2529,7 @@ mipsdsp_deps="mips"
  mipsdspr2_deps="mips"
  mmi_deps="mips"
  msa_deps="mipsfpu"
+msa2_deps="msa"
  
  cpunop_deps="i686"
  x86_64_select="i686"
@@ -5753,6 +5756,7 @@ elif enabled mips; then
      enabled mipsfpu && enabled msa && check_inline_asm_flags msa '"addvi.b $w0, $w1, 1"' '-mmsa' && check_headers msa.h || disable msa
      enabled mipsdsp && check_inline_asm_flags mipsdsp '"addu.qb $t0, $t1, $t2"' '-mdsp'
      enabled mipsdspr2 && check_inline_asm_flags mipsdspr2 '"absq_s.qb $t0, $t1"' '-mdspr2'
+    enabled msa && enabled msa2 && check_inline_asm_flags msa2 '"nxbits.any.b $w0, $w0"' '-mmsa2' && check_headers msa2.h || disable msa2
  
      if enabled bigendian && enabled msa; then
          disable msa
@@ -7128,6 +7132,7 @@ if enabled mips; then
      echo "MIPS DSP R1 enabled       ${mipsdsp-no}"
      echo "MIPS DSP R2 enabled       ${mipsdspr2-no}"
      echo "MIPS MSA enabled          ${msa-no}"
+    echo "MIPS MSA2 enabled         ${msa2-no}"
      echo "LOONGSON MMI enabled      ${mmi-no}"
  fi
  if enabled ppc; then
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h

index 6a467046631f4b98e4b1ae22f1834881b75b87f9..a3774281f9e41eb893c667244f93c2c3850c505b 100644 (file)
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -23,6 +23,11 @@
  
  #include <stdint.h>
  #include <msa.h>
+#include <config.h>
+
+#if HAVE_MSA2
+#include <msa2.h>
+#endif
  
  #define ALIGNMENT           16
  #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
@@ -1234,6 +1239,15 @@
                   unsigned absolute diff values, even-odd pairs are added
                   together to generate 8 halfword results.
  */
+#if HAVE_MSA2
+#define SAD_UB2_UH(in0, in1, ref0, ref1)                                 \
+( {  /* MSA2 path: builtin presumably yields the pair sums - confirm */  \
+    v8u16 sad_m = { 0 };  /* accumulator starts at zero */               \
+    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); \
+    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); \
+    sad_m;  /* result of the statement expression */                     \
+} )
+#else
  #define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
  ( {                                                             \
      v16u8 diff0_m, diff1_m;                                     \
@@ -1247,6 +1261,7 @@
                                                                  \
      sad_m;                                                      \
  } )
+#endif // #if HAVE_MSA2
  
  /* Description : Insert specified word elements from input vectors to 1
                   destination vector
@@ -2287,6 +2302,12 @@
                   extracted and interleaved with same vector 'in0' to generate
                   4 word elements keeping sign intact
  */
+#if HAVE_MSA2
+#define UNPCK_R_SH_SW(in, out)                           \
+{  /* MSA2 path: one builtin, no sign-mask temporary */  \
+    out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
+}
+#else
  #define UNPCK_R_SH_SW(in, out)                       \
  {                                                    \
      v8i16 sign_m;                                    \
@@ -2294,6 +2315,7 @@
      sign_m = __msa_clti_s_h((v8i16) in, 0);          \
      out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
  }
+#endif // #if HAVE_MSA2
  
  /* Description : Sign extend byte elements from input vector and return
                   halfword results in pair of vectors
@@ -2306,6 +2328,13 @@
                   Then interleaved left with same vector 'in0' to
                   generate 8 signed halfword elements in 'out1'
  */
+#if HAVE_MSA2
+#define UNPCK_SB_SH(in, out0, out1)                       \
+{  /* bytes widen to halfwords, hence the v8i16 casts */  \
+    out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
+    out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
+}
+#else
  #define UNPCK_SB_SH(in, out0, out1)                  \
  {                                                    \
      v16i8 tmp_m;                                     \
@@ -2313,6 +2342,7 @@
      tmp_m = __msa_clti_s_b((v16i8) in, 0);           \
      ILVRL_B2_SH(tmp_m, in, out0, out1);              \
  }
+#endif // #if HAVE_MSA2
  
  /* Description : Zero extend unsigned byte elements to halfword elements
     Arguments   : Inputs  - in           (1 input unsigned byte vector)
@@ -2339,6 +2369,13 @@
                   Then interleaved left with same vector 'in0' to
                   generate 4 signed word elements in 'out1'
  */
+#if HAVE_MSA2
+#define UNPCK_SH_SW(in, out0, out1)                       \
+{  /* MSA2: lo builtin -> out0, hi builtin -> out1 */     \
+    out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
+    out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
+}
+#else
  #define UNPCK_SH_SW(in, out0, out1)                  \
  {                                                    \
      v8i16 tmp_m;                                     \
@@ -2346,6 +2383,7 @@
      tmp_m = __msa_clti_s_h((v8i16) in, 0);           \
      ILVRL_H2_SW(tmp_m, in, out0, out1);              \
  }
+#endif // #if HAVE_MSA2
  
  /* Description : Swap two variables
     Arguments   : Inputs  - in0, in1
@@ -2850,13 +2888,11 @@
  */
  #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
  ( {                                                                 \
-    v8i16 tmp1_m;                                                   \
      v8i16 out0_m;                                                   \
                                                                      \
      out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
      out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
-    tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2);           \
-    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                        \
+    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
      /* NOTE(review): was dotp_s_h + adds_s_h; check overflow */     \
      out0_m;                                                         \
  } )
author	Shiyou Yin <yinshiyou-hf@loongson.cn>
	Tue, 9 Jul 2019 12:43:37 +0000 (20:43 +0800)
committer	Michael Niedermayer <michael@niedermayer.cc>
	Wed, 10 Jul 2019 10:54:57 +0000 (12:54 +0200)
configure		patch \| blob \| history
libavutil/mips/generic_macros_msa.h		patch \| blob \| history