aarch64: {plane_copy,memcpy_aligned,memzero_aligned}_neon

author Janne Grunau <janne-x264@jannau.net>

Fri, 31 Oct 2014 13:49:04 +0000 (14:49 +0100)

committer Anton Mitrofanov <BugMaster@narod.ru>

Tue, 16 Dec 2014 17:40:09 +0000 (20:40 +0300)
author Janne Grunau <janne-x264@jannau.net>
Fri, 31 Oct 2014 13:49:04 +0000 (14:49 +0100)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:40:09 +0000 (20:40 +0300)
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S

index 84074516832a59c5f9331a542cf0d530d0f7090d..324ef16939b8b05377f4e5c357f7440072011c8d 100644 (file)
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1253,6 +1253,34 @@ load_deinterleave_chroma:
      ret
  endfunc
  
+function x264_plane_copy_neon, export=1       // copy w5 rows from x2 (stride x3) to x0 (stride x1)
+    add         w8,  w4,  #15                 // w arrives as a 32-bit int: read only w4, x4[63:32] is unspecified
+    and         w4,  w8,  #0xfffffff0         // w4 = width rounded up to 16; the W write zero-extends into x4
+    sub         x1,  x1,  x4                  // strides become "end of row -> start of next row",
+    sub         x3,  x3,  x4                  // because the copy loops post-increment both pointers
+1:
+    mov         w8,  w4                       // w8 = bytes left in the current row
+16:
+    tst         w8,  #16                      // odd number of 16-byte units in the row?
+    b.eq        32f
+    subs        w8,  w8,  #16                 // yes: copy one 16-byte unit first
+    ldr         q0,  [x2], #16
+    str         q0,  [x0], #16
+    b.eq        0f                            // padded row was exactly 16 bytes (flags still from subs)
+32:
+    subs        w8,  w8,  #32                 // copy the rest of the row 32 bytes at a time
+    ldp         q0,  q1,  [x2], #32
+    stp         q0,  q1,  [x0], #32
+    b.gt        32b
+0:
+    subs        w5,  w5,  #1                  // one row done
+    add         x2,  x2,  x3                  // step both pointers to the start of the next row
+    add         x0,  x0,  x1
+    b.gt        1b
+
+    ret
+endfunc
+
  function x264_plane_copy_deinterleave_neon, export=1
      add         w9,  w6,  #15
      and         w9,  w9,  #0xfffffff0
@@ -1601,3 +1629,41 @@ function x264_mbtree_propagate_list_internal_neon, export=1
      b.ge        8b
      ret
  endfunc
+
+function x264_memcpy_aligned_neon, export=1   // copy x2 bytes from [x1] to [x0]
+    tst         x2,  #16                      // odd 16-byte unit in the count?
+    b.eq        32f
+    sub         x2,  x2,  #16                 // copy it first, clearing bit 4 of the count
+    ldr         q0,  [x1], #16
+    str         q0,  [x0], #16
+32:
+    tst         x2,  #32                      // odd 32-byte unit in the count?
+    b.eq        640f
+    sub         x2,  x2,  #32                 // copy it, clearing bit 5 of the count
+    ldp         q0,  q1,  [x1], #32
+    stp         q0,  q1,  [x0], #32
+640:
+    cbz         x2,  1f                       // nothing left for the 64-byte loop
+64:
+    subs        x2,  x2,  #64                 // count bits 0-3 are never tested: presumably a multiple of 16 - confirm with callers
+    ldp         q0,  q1,  [x1, #32]           // upper 32 bytes of the 64-byte chunk
+    ldp         q2,  q3,  [x1], #64           // lower 32 bytes, then x1 += 64
+    stp         q0,  q1,  [x0, #32]
+    stp         q2,  q3,  [x0], #64           // lower 32 bytes, then x0 += 64
+    b.gt        64b                           // flags still come from the subs above
+1:
+    ret                                       // note: x0 and x1 now point past the copied data
+endfunc
+
+function x264_memzero_aligned_neon, export=1  // zero x1 bytes at [x0], 128 bytes per pass
+    movi        v1.16b,  #0
+    movi        v0.16b,  #0                   // q0:q1 = 32 bytes of zeros
+0:
+    subs        x1,  x1,  #128                // count down first; the stores below leave the flags alone
+    stp         q0,  q1,  [x0, #32]           // bytes 32..63
+    stp         q0,  q1,  [x0, #64]           // bytes 64..95
+    stp         q0,  q1,  [x0, #96]           // bytes 96..127
+    stp         q0,  q1,  [x0], #128          // bytes 0..31, then x0 += 128
+    b.gt        0b                            // always runs at least one 128-byte pass
+    ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c

index 96582d455a6aaf62b4a453c0f501f75b9d4d37de..25ebea49e8f1b7d0e65c98e54ed74e09693a3d1c 100644 (file)
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -49,6 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
  void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
  void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
  
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+                           pixel *src, intptr_t i_src, int w, int h );
  void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                           pixel *dstv, intptr_t i_dstv,
                                           pixel *src,  intptr_t i_src, int w, int h );
@@ -304,6 +306,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
      pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
      pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
  
+    pf->plane_copy                  = x264_plane_copy_neon;
      pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
      pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
      pf->plane_copy_interleave       = x264_plane_copy_interleave_neon;
@@ -340,5 +343,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
  
      pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
      pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+
+    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+    pf->memzero_aligned = x264_memzero_aligned_neon;
  #endif // !HIGH_BIT_DEPTH
  }
author	Janne Grunau <janne-x264@jannau.net>
	Fri, 31 Oct 2014 13:49:04 +0000 (14:49 +0100)
committer	Anton Mitrofanov <BugMaster@narod.ru>
	Tue, 16 Dec 2014 17:40:09 +0000 (20:40 +0300)
common/aarch64/mc-a.S		patch \| blob \| history
common/aarch64/mc-c.c		patch \| blob \| history