This requires spilling some registers to the stack,
contrary to the aarch64 version.
checkasm timing Cortex-A7 A8 A9
sa8d_satd_16x16_neon 12936 6365 7492
sa8d_satd_16x16_separate_neon 14841 6605 8324
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm
-function x264_sa8d_8x8_neon, export=0
+@ integrated_satd dst, s0, s1, s2, s3
+@ Computes the SATD contribution of four rows that have already been
+@ through the vertical Hadamard stage (\s0-\s3) and widen-accumulates it
+@ into \dst (vpadal: u16 -> u32).  The inputs are copied to scratch
+@ registers first, so \s0-\s3 survive for the following sa8d stages.
+@ Clobbers: q0-q3, q6, q7.
+.macro integrated_satd dst, s0, s1, s2, s3
+ vmov q0, \s0
+ vmov q1, \s1
+ vmov q2, \s2
+ vmov q3, \s3
+
+@ Horizontal transform: interleave neighbouring 16-bit coefficients, then
+@ take sums/differences (SUMSUB_AB) for the remaining butterfly stages.
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+
+ SUMSUB_AB q6, q7, q0, q1
+ SUMSUB_AB q0, q1, q2, q3
+
+ vtrn.32 q6, q0
+ vtrn.32 q7, q1
+
+ vabs.s16 q6, q6
+ vabs.s16 q0, q0
+ vabs.s16 q7, q7
+ vabs.s16 q1, q1
+
+@ max(|a+b|, |a-b|) == |a| + |b|: a single vmax on the absolute values
+@ replaces the last butterfly stage plus an extra add.
+ vmax.u16 q6, q6, q0
+ vmax.u16 q7, q7, q1
+
+ vadd.i16 q6, q6, q7
+ vpadal.u16 \dst, q6
+.endm
+
+@ sa8d_satd_8x8 [satd=satd_]
+@ Emits x264_sa8d_8x8_neon (plain SA8D) or, when invoked with satd=satd_,
+@ x264_sa8d_satd_8x8_neon which additionally accumulates an 8x8 SATD into
+@ q4 (via integrated_satd) and the sa8d partial sums into q5.
+@ NOTE(review): this span appears to be a unified diff with context lines
+@ elided and duplicated; the unchanged body of the original 8x8 routine is
+@ not fully visible here — do not treat the instruction sequence below as
+@ the complete function.
+.macro sa8d_satd_8x8 satd=
+function x264_sa8d_\satd\()8x8_neon, export=0
LOAD_DIFF_8x4 q8, q9, q10, q11
vld1.64 {d7}, [r2], r3
SUMSUB_AB q0, q1, q8, q9
LOAD_DIFF_8x4 q8, q9, q10, q11
vld1.64 {d7}, [r2], r3
SUMSUB_AB q0, q1, q8, q9
vsubl.u8 q15, d0, d1
HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3
vsubl.u8 q15, d0, d1
HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3
+
+@ After the vertical Hadamard, q8-q15 hold transformed rows: fold their
+@ SATD into the q4 accumulator now, before the sa8d-specific horizontal
+@ stages below overwrite them.
+.ifc \satd, satd_
+ integrated_satd q4, q8, q9, q10, q11
+ integrated_satd q4, q12, q13, q14, q15
+.endif
+
SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13
SUMSUB_AB q2, q10, q10, q14
vtrn.16 q8, q9
SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13
SUMSUB_AB q2, q10, q10, q14
vtrn.16 q8, q9
vmax.s16 q11, q3, q15
vadd.i16 q8, q8, q9
vadd.i16 q9, q10, q11
vmax.s16 q11, q3, q15
vadd.i16 q8, q8, q9
vadd.i16 q9, q10, q11
+@ Combined variant: widen-accumulate the sa8d partial sums into q5
+@ (u16 -> u32) so the 16x16 wrapper can reduce both metrics at the end.
+.ifc \satd, satd_
+ vpadal.u16 q5, q8
+ vpadal.u16 q5, q9
+.endif
+.endm
+
+@ Emit both flavours of the 8x8 subroutine: the plain sa8d helper and the
+@ fused sa8d+satd helper.
+sa8d_satd_8x8
+sa8d_satd_8x8 satd_
+
+@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *pix1, intptr_t i_stride1,
+@                                           uint8_t *pix2, intptr_t i_stride2 )
+@ Processes the 16x16 block as four 8x8 calls (one 8-wide column top and
+@ bottom, then the other column).  Per AAPCS the uint64_t result is
+@ returned in r0:r1 — r0 = low word = rounded-halved sa8d, r1 = high
+@ word = satd.  q4/q5 serve as the satd/sa8d accumulators across the
+@ calls; d8-d15 (q4-q7) are callee-saved under AAPCS and q6/q7 are
+@ clobbered by the subroutine, hence the vpush/vpop of q4-q7.
+function x264_pixel_sa8d_satd_16x16_neon
+ push {lr}
+ vpush {q4-q7}
+ vmov.u32 q4, #0 @ satd accumulator
+ vmov.u32 q5, #0 @ sa8d accumulator
+@ The 8x8 subroutine leaves r0/r2 advanced by 8 rows (post-incremented
+@ loads), so two back-to-back calls cover the top and bottom halves of
+@ one column — presumably; confirm against LOAD_DIFF_8x4.
+ bl x264_sa8d_satd_8x8_neon
+ bl x264_sa8d_satd_8x8_neon
+@ Rewind 16 rows and step 8 pixels right for the second column.
+ sub r0, r0, r1, lsl #4
+ sub r2, r2, r3, lsl #4
+ add r0, r0, #8
+ add r2, r2, #8
+ bl x264_sa8d_satd_8x8_neon
+ bl x264_sa8d_satd_8x8_neon
+@ Horizontal reduction of both u32 accumulators down to scalars.
+ vadd.u32 d1, d10, d11 @ d1 = two halves of q5 (sa8d)
+ vadd.u32 d0, d8, d9 @ d0 = two halves of q4 (satd)
+ vpadd.u32 d1, d1, d1
+ vpadd.u32 d0, d0, d0
+ vrshr.u32 d1, d1, #1 @ rounding halve — sa8d normalization step
+ vmov.32 r1, d0[0] @ high 32 bits of return value: satd
+ vmov.32 r0, d1[0] @ low 32 bits of return value: sa8d
+ vpop {q4-q7}
+ pop {pc} @ return (lr pushed on entry)
+endfunc
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
INIT4( hadamard_ac, _neon );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
INIT4( hadamard_ac, _neon );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;