From: Janne Grunau <janne-x264@jannau.net>
Date: Tue, 18 Aug 2015 08:25:10 +0000 (+0200)
Subject: aarch64: Faster intra_predict_4x4_h
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;ds=sidebyside;h=b16268ac0826d78455d0d704ea0fc8b1edc6b6bf;p=x264

aarch64: Faster intra_predict_4x4_h

Use multiplication with 0x01010101 for splats.

On a cortex-a53:
                     gcc 4.9.2   llvm 3.6   neon (before)   neon (after)
intra_predict_4x4_h: 162         147        160/155         139/135
---

diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index a01016ae..a7dd2d1c 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -63,22 +63,19 @@ endconst
 
 
 function x264_predict_4x4_h_aarch64, export=1
-    ldrb    w1, [x0, #0*FDEC_STRIDE-1]
-    ldrb    w2, [x0, #1*FDEC_STRIDE-1]
-    ldrb    w3, [x0, #2*FDEC_STRIDE-1]
-    ldrb    w4, [x0, #3*FDEC_STRIDE-1]
-    add     w1, w1, w1, lsl #8
-    add     w2, w2, w2, lsl #8
-    add     w3, w3, w3, lsl #8
-    add     w4, w4, w4, lsl #8
-    add     w1, w1, w1, lsl #16
-    str     w1, [x0, #0*FDEC_STRIDE]
-    add     w2, w2, w2, lsl #16
-    str     w2, [x0, #1*FDEC_STRIDE]
-    add     w3, w3, w3, lsl #16
-    str     w3, [x0, #2*FDEC_STRIDE]
-    add     w4, w4, w4, lsl #16
-    str     w4, [x0, #3*FDEC_STRIDE]
+    ldrb    w1,  [x0, #0*FDEC_STRIDE-1]
+    mov     w5,  #0x01010101
+    ldrb    w2,  [x0, #1*FDEC_STRIDE-1]
+    ldrb    w3,  [x0, #2*FDEC_STRIDE-1]
+    mul     w1,  w1,  w5
+    ldrb    w4,  [x0, #3*FDEC_STRIDE-1]
+    mul     w2,  w2,  w5
+    str     w1,  [x0, #0*FDEC_STRIDE]
+    mul     w3,  w3,  w5
+    str     w2,  [x0, #1*FDEC_STRIDE]
+    mul     w4,  w4,  w5
+    str     w3,  [x0, #2*FDEC_STRIDE]
+    str     w4,  [x0, #3*FDEC_STRIDE]
     ret
 endfunc