aarch64: NEON asm for 4x16 sad, satd and ssd

author Janne Grunau <janne-x264@jannau.net>
Thu, 7 Aug 2014 14:49:12 +0000 (16:49 +0200)

committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:39:55 +0000 (20:39 +0300)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S

index d2c3de65c6e6681755bd473efb485a14d3425605..92912edd35a8bfe5a2c5d38c4fb05ee33b652812 100644 (file)
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -114,6 +114,7 @@ endfunc
  
  SAD_FUNC  4,  4
  SAD_FUNC  4,  8
+SAD_FUNC  4,  16
  SAD_FUNC  8,  4
  SAD_FUNC  8,  8
  SAD_FUNC  8,  16
@@ -367,6 +368,7 @@ endfunc
  
  SSD_FUNC   4, 4
  SSD_FUNC   4, 8
+SSD_FUNC   4, 16
  SSD_FUNC   8, 4
  SSD_FUNC   8, 8
  SSD_FUNC   8, 16
@@ -895,6 +897,61 @@ function x264_satd_16x4_neon
      b           x264_satd_8x4v_8x8h_neon
  endfunc
  
+function x264_pixel_satd_4x16_neon, export=1   // 4x16 SATD entry point: reads 16 rows of 4 bytes via x0 (step x1) and via x2 (step x3), result in w0
+    mov         x4,  x30                        // keep the return address; the bl below overwrites x30
+    ld1        {v1.s}[0],  [x2], x3             // rows 0-3: one 4-byte row into lane 0, then x2 += x3
+    ld1        {v0.s}[0],  [x0], x1             // matching row of the other block, then x0 += x1
+    ld1        {v3.s}[0],  [x2], x3
+    ld1        {v2.s}[0],  [x0], x1
+    ld1        {v5.s}[0],  [x2], x3
+    ld1        {v4.s}[0],  [x0], x1
+    ld1        {v7.s}[0],  [x2], x3
+    ld1        {v6.s}[0],  [x0], x1
+    ld1        {v1.s}[1],  [x2], x3             // rows 4-7 fill lane 1 of the same registers
+    ld1        {v0.s}[1],  [x0], x1
+    ld1        {v3.s}[1],  [x2], x3
+    ld1        {v2.s}[1],  [x0], x1
+    ld1        {v5.s}[1],  [x2], x3
+    ld1        {v4.s}[1],  [x0], x1
+    ld1        {v7.s}[1],  [x2], x3
+    ld1        {v6.s}[1],  [x0], x1
+    usubl       v16.8h, v0.8b,  v1.8b           // widening subtract (x0 data - x2 data): rows 0 and 4
+    usubl       v17.8h, v2.8b,  v3.8b           // rows 1 and 5
+    usubl       v18.8h, v4.8b,  v5.8b           // rows 2 and 6
+    usubl       v19.8h, v6.8b,  v7.8b           // rows 3 and 7
+    ld1        {v1.s}[0],  [x2], x3             // rows 8-11 into lane 0; v0-v7 are reused now that v16-v19 hold the differences
+    ld1        {v0.s}[0],  [x0], x1
+    ld1        {v3.s}[0],  [x2], x3
+    ld1        {v2.s}[0],  [x0], x1
+    ld1        {v5.s}[0],  [x2], x3
+    ld1        {v4.s}[0],  [x0], x1
+    ld1        {v7.s}[0],  [x2], x3
+    ld1        {v6.s}[0],  [x0], x1
+    ld1        {v1.s}[1],  [x2], x3             // rows 12-15 into lane 1
+    ld1        {v0.s}[1],  [x0], x1
+    ld1        {v3.s}[1],  [x2], x3
+    ld1        {v2.s}[1],  [x0], x1
+    ld1        {v5.s}[1],  [x2], x3
+    ld1        {v4.s}[1],  [x0], x1
+    ld1        {v7.s}[1],  [x2], x3
+    ld1        {v6.s}[1],  [x0], x1
+    usubl       v20.8h, v0.8b,  v1.8b           // differences for rows 8 and 12
+    usubl       v21.8h, v2.8b,  v3.8b           // rows 9 and 13
+    usubl       v22.8h, v4.8b,  v5.8b           // rows 10 and 14
+    usubl       v23.8h, v6.8b,  v7.8b           // rows 11 and 15
+    // NOTE(review): SUMSUB_AB is defined outside this hunk; presumably it writes a+b to its first operand and a-b to its second -- confirm
+    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
+    // NOTE(review): the helper is not visible here; presumably it consumes v0-v3 and v20-v23 and leaves 16-bit partial sums in v0-v3 -- confirm
+    bl          x264_satd_8x4v_8x8h_neon
+    // fold the four vectors left in v0-v3 into one, then sum its eight lanes
+    add         v30.8h, v0.8h,  v1.8h
+    add         v31.8h, v2.8h,  v3.8h
+    add         v0.8h,  v30.8h, v31.8h
+    uaddlv      s0,  v0.8h                      // add the eight 16-bit lanes into one 32-bit scalar
+    mov         w0,  v0.s[0]                    // move the total into w0
+    ret         x4                              // return through the address saved on entry
+endfunc
  
  function x264_pixel_sa8d_8x8_neon, export=1
      mov         x4,  x30
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h

index c7c386ae6ea56535acee5c94aa17eb67af252570..7d519644883fe4a02a26224dae0ec5d3b713543c 100644 (file)
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -33,6 +33,7 @@
      ret x264_pixel_##name##_8x16_##suffix args;\
      ret x264_pixel_##name##_8x8_##suffix args;\
      ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x16_##suffix args;\
      ret x264_pixel_##name##_4x8_##suffix args;\
      ret x264_pixel_##name##_4x4_##suffix args;\
  
diff --git a/common/pixel.c b/common/pixel.c

index bb1894a05b5284d3e15f5ce9d970a8698d6e443d..6bdbbca3122a7d7fc299aaa2a6a3afb1a829be0d 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1409,13 +1409,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  #if ARCH_AARCH64
      if( cpu&X264_CPU_NEON )
      {
-        INIT7( sad, _neon );
+        INIT8( sad, _neon );
          // AArch64 has no distinct instructions for aligned load/store
-        INIT7_NAME( sad_aligned, sad, _neon );
+        INIT8_NAME( sad_aligned, sad, _neon );
          INIT7( sad_x3, _neon );
          INIT7( sad_x4, _neon );
-        INIT7( ssd, _neon );
-        INIT7( satd, _neon );
+        INIT8( ssd, _neon );
+        INIT8( satd, _neon );
          INIT7( satd_x3, _neon );
          INIT7( satd_x4, _neon );
          INIT4( hadamard_ac, _neon );
author	Janne Grunau <janne-x264@jannau.net>
	Thu, 7 Aug 2014 14:49:12 +0000 (16:49 +0200)
committer	Anton Mitrofanov <BugMaster@narod.ru>
	Tue, 16 Dec 2014 17:39:55 +0000 (20:39 +0300)
common/aarch64/pixel-a.S		patch \| blob \| history
common/aarch64/pixel.h		patch \| blob \| history
common/pixel.c		patch \| blob \| history