git.sesse.net Git - x264/commitdiff
aarch64: implement x264_pixel_vsad_neon
author Janne Grunau <janne-x264@jannau.net>
Tue, 29 Jul 2014 17:26:11 +0000 (18:26 +0100)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:39:52 +0000 (20:39 +0300)
35 times faster than C.

common/aarch64/pixel-a.S
common/aarch64/pixel.h
common/pixel.c

index 8c7b9279dbc755e204f07ec53ae048487a3e09b5..efa708a7f1aae62aa0376680b96ab4526210f91e 100644 (file)
@@ -148,7 +148,7 @@ SAD_FUNC  16, 16
     \first      v17.8h,  v2.8b,  v0.8b
     ld1        {v3.8b}, [x3], x5
     ld1        {v1.8b}, [x1], x5
-   \first       v18.8h,  v3.8b,  v0.8b
+    \first      v18.8h,  v3.8b,  v0.8b
     uabal       v16.8h,  v1.8b,  v5.8b
     ld1        {v2.8b}, [x2], x5
     ld1        {v3.8b}, [x3], x5
@@ -248,6 +248,30 @@ SAD_X_FUNC  4, 16, 8
 SAD_X_FUNC  4, 16, 16
 
 
+function x264_pixel_vsad_neon, export=1 // sum of abs differences between consecutive 16-byte rows; x0 = row pointer, x1 = post-increment step, w2 = counter (presumably stride and row count -- confirm against caller)
+    subs        w2,  w2,  #2             // account for the two rows loaded below; flags feed the b.le further down
+    ld1        {v0.16b},  [x0],  x1      // v0 = 16 bytes at x0, then x0 += x1
+    ld1        {v1.16b},  [x0],  x1      // v1 = next 16 bytes, then x0 += x1
+    uabdl       v6.8h,  v0.8b,  v1.8b    // v6 = |v0 - v1| on bytes 0-7, widened to 16 bit (initialises the accumulator)
+    uabdl2      v7.8h,  v0.16b, v1.16b   // v7 = |v0 - v1| on bytes 8-15, widened to 16 bit
+    b.le        2f                       // counter was <= 2: only one row pair, skip the loop
+1:                                       // loop: up to two further row pairs per pass, v0/v1 alternate as the newest row
+    subs        w2,  w2,  #2             // two more rows consumed; flags feed both b.lt and b.gt below
+    ld1        {v0.16b},  [x0],  x1      // v0 = next row
+    uabal       v6.8h,  v1.8b,  v0.8b    // v6 += |v1 - v0| on bytes 0-7
+    uabal2      v7.8h,  v1.16b, v0.16b   // v7 += |v1 - v0| on bytes 8-15
+    ld1        {v1.16b},  [x0],  x1      // v1 = next row; loaded before the exit test, so it goes unused when b.lt is taken
+    b.lt        2f                       // counter went negative: leave without accumulating the second pair
+    uabal       v6.8h,  v0.8b,  v1.8b    // v6 += |v0 - v1| on bytes 0-7
+    uabal2      v7.8h,  v0.16b, v1.16b   // v7 += |v0 - v1| on bytes 8-15
+    b.gt        1b                       // repeat while the counter is still positive
+2:                                       // reduction of the two 8x16-bit accumulators
+    add         v5.8h,  v6.8h,  v7.8h    // v5 = lane-wise v6 + v7 (still 16-bit lanes)
+    uaddlv      s0,  v5.8h               // s0 = widening sum of the eight 16-bit lanes of v5
+    fmov        w0,  s0                  // move the 32-bit total into w0, the integer return register
+    ret
+endfunc
+
 .macro SSD_START_4
     ld1        {v16.s}[0], [x0], x1
     ld1        {v17.s}[0], [x2], x3
index d4097eddf88b767e6d221252ab6945a99a7d1d17..c7cc6c9830381eeb214d0056901347818b96358e 100644 (file)
@@ -48,6 +48,8 @@ DECL_X4( sad, neon )
 DECL_X1( satd, neon )
 DECL_X1( ssd, neon )
 
+int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
+
 int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
index 421a67c9902a5a6aaf863070e26b46fec4a9e194..d467151e2ef900b320ac01b968ae344ac4da9499 100644 (file)
@@ -1429,6 +1429,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
         pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
         pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
+        pixf->vsad = x264_pixel_vsad_neon;
 
         pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
         pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;