]> git.sesse.net Git - ffmpeg/blobdiff - libavcodec/aarch64/vp8dsp_neon.S
avcodec/videotoolboxenc: fix align issue
[ffmpeg] / libavcodec / aarch64 / vp8dsp_neon.S
index c19ab0de0fd4ad2225c37bd1bc81a364a609befd..4bbf16d1a4c7f1db75bec8fa8785e03510b8d1a2 100644 (file)
@@ -4,27 +4,84 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/aarch64/asm.S"
 #include "neon.S"
 
+// Inverse 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients.
+// In: x0 = 16-bit coefficient blocks (one DC slot written every 32 bytes),
+//     x1 = the 16 input DC coefficients (consumed and zeroed in place).
+function ff_vp8_luma_dc_wht_neon, export=1
+        ld1             {v0.4h - v3.4h}, [x1]
+        movi            v30.8h, #0
+
+        // Pass 1: butterflies across the four rows; the dc[] buffer is
+        // cleared with the two interleaved stores of v30.
+        add             v4.4h,  v0.4h,  v3.4h
+        add             v6.4h,  v1.4h,  v2.4h
+        st1             {v30.8h}, [x1], #16
+        sub             v7.4h,  v1.4h,  v2.4h
+        sub             v5.4h,  v0.4h,  v3.4h
+        st1             {v30.8h}, [x1]
+        add             v0.4h,  v4.4h,  v6.4h
+        add             v1.4h,  v5.4h,  v7.4h
+        sub             v2.4h,  v4.4h,  v6.4h
+        sub             v3.4h,  v5.4h,  v7.4h
+
+        movi            v16.4h, #3
+
+        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
+
+        // Rounding bias: +3 on the first column before pass 2 makes the
+        // final arithmetic shift by 3 a rounded division.
+        add             v0.4h,  v0.4h,  v16.4h
+
+        // Pass 2: identical butterflies on the transposed data.
+        add             v4.4h,  v0.4h,  v3.4h
+        add             v6.4h,  v1.4h,  v2.4h
+        sub             v7.4h,  v1.4h,  v2.4h
+        sub             v5.4h,  v0.4h,  v3.4h
+        add             v0.4h,  v4.4h,  v6.4h
+        add             v1.4h,  v5.4h,  v7.4h
+        sub             v2.4h,  v4.4h,  v6.4h
+        sub             v3.4h,  v5.4h,  v7.4h
+
+        sshr            v0.4h,  v0.4h,  #3
+        sshr            v1.4h,  v1.4h,  #3
+        sshr            v2.4h,  v2.4h,  #3
+        sshr            v3.4h,  v3.4h,  #3
+
+        // Scatter one result per sub-block: 32-byte (16-halfword) stride.
+        mov             x3,  #32
+        st1             {v0.h}[0],  [x0], x3
+        st1             {v1.h}[0],  [x0], x3
+        st1             {v2.h}[0],  [x0], x3
+        st1             {v3.h}[0],  [x0], x3
+        st1             {v0.h}[1],  [x0], x3
+        st1             {v1.h}[1],  [x0], x3
+        st1             {v2.h}[1],  [x0], x3
+        st1             {v3.h}[1],  [x0], x3
+        st1             {v0.h}[2],  [x0], x3
+        st1             {v1.h}[2],  [x0], x3
+        st1             {v2.h}[2],  [x0], x3
+        st1             {v3.h}[2],  [x0], x3
+        st1             {v0.h}[3],  [x0], x3
+        st1             {v1.h}[3],  [x0], x3
+        st1             {v2.h}[3],  [x0], x3
+        st1             {v3.h}[3],  [x0], x3
+
+        ret
+endfunc
+
 function ff_vp8_idct_add_neon, export=1
         ld1             {v0.8b - v3.8b},  [x1]
         mov             w4,  #20091
@@ -35,8 +92,8 @@ function ff_vp8_idct_add_neon, export=1
         smull           v27.4s, v3.4h,  v4.h[0]
         sqdmulh         v20.4h, v1.4h,  v4.h[1]
         sqdmulh         v23.4h, v3.4h,  v4.h[1]
-        sqshrn          v21.4h, v26.4s, #16
-        sqshrn          v22.4h, v27.4s, #16
+        shrn            v21.4h, v26.4s, #16
+        shrn            v22.4h, v27.4s, #16
         add             v21.4h, v21.4h, v1.4h
         add             v22.4h, v22.4h, v3.4h
 
@@ -60,44 +117,97 @@ function ff_vp8_idct_add_neon, export=1
         st1             {v29.16b},  [x1]
         sqdmulh         v21.4h,     v1.4h,  v4.h[1]
         sqdmulh         v23.4h,     v3.4h,  v4.h[1]
-        sqshrn          v20.4h,     v26.4s, #16
-        sqshrn          v22.4h,     v27.4s, #16
+        shrn            v20.4h,     v26.4s, #16
+        shrn            v22.4h,     v27.4s, #16
         add             v20.4h,     v20.4h, v1.4h
         add             v22.4h,     v22.4h, v3.4h
         add             v16.4h,     v0.4h,  v2.4h
         sub             v17.4h,     v0.4h,  v2.4h
 
         add             v18.4h,     v20.4h, v23.4h
-        ld1             {v24.d}[0], [x0],   x2
-        zip1            v16.2d,     v16.2d, v17.2d
-        sub             v19.4h,     v21.4h, v22.4h
-        ld1             {v25.d}[0], [x0],   x2
-        zip1            v18.2d,     v18.2d, v19.2d
-        add             v0.8h,      v16.8h, v18.8h
-        ld1             {v25.d}[1], [x0],   x2
-        sub             v1.8h,      v16.8h, v18.8h
-        ld1             {v24.d}[1], [x0],   x2
-        srshr           v0.8h,      v0.8h,  #3
-        trn1            v24.4s,     v24.4s, v25.4s
-        srshr           v1.8h,      v1.8h,  #3
+        ld1             {v24.s}[0], [x0],   x2
+        sub             v19.4h, v21.4h, v22.4h
+        ld1             {v25.s}[0], [x0],   x2
+        add             v0.4h,      v16.4h, v18.4h
+        add             v1.4h,      v17.4h, v19.4h
+        ld1             {v26.s}[0], [x0],   x2
+        sub             v3.4h,      v16.4h, v18.4h
+        sub             v2.4h,      v17.4h, v19.4h
+        ld1             {v27.s}[0], [x0],   x2
+        srshr           v0.4h,      v0.4h,  #3
+        srshr           v1.4h,      v1.4h,  #3
+        srshr           v2.4h,      v2.4h,  #3
+        srshr           v3.4h,      v3.4h,  #3
+
         sub             x0,  x0,  x2,  lsl #2
 
-        ext             v1.16b, v1.16b, v1.16b, #8
-        trn1            v3.2d,  v0.2d,  v1.2d
-        trn2            v0.2d,  v0.2d,  v1.2d
-        trn1            v1.8h,  v3.8h,  v0.8h
-        trn2            v3.8h,  v3.8h,  v0.8h
-        uzp1            v0.4s,  v1.4s,  v3.4s
-        uzp2            v1.4s,  v3.4s,  v1.4s
+        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16
 
         uaddw           v0.8h,  v0.8h, v24.8b
-        uaddw2          v1.8h,  v1.8h, v24.16b
+        uaddw           v1.8h,  v1.8h, v25.8b
+        uaddw           v2.8h,  v2.8h, v26.8b
+        uaddw           v3.8h,  v3.8h, v27.8b
         sqxtun          v0.8b,  v0.8h
-        sqxtun2         v0.16b, v1.8h
+        sqxtun          v1.8b,  v1.8h
+        sqxtun          v2.8b,  v2.8h
+        sqxtun          v3.8b,  v3.8h
+
         st1             {v0.s}[0],  [x0], x2
-        st1             {v0.s}[1],  [x0], x2
-        st1             {v0.s}[3],  [x0], x2
-        st1             {v0.s}[2],  [x0], x2
+        st1             {v1.s}[0],  [x0], x2
+        st1             {v2.s}[0],  [x0], x2
+        st1             {v3.s}[0],  [x0], x2
+
+        ret
+endfunc
+
+// DC-only IDCT+add for four 4x4 chroma blocks laid out as an 8x8 region.
+// In: x0 = dst (8 rows of 8 bytes, stride x2), x1 = coefficient blocks
+//     (one DC read and cleared every 32 bytes).
+function ff_vp8_idct_dc_add4uv_neon, export=1
+        movi            v0.4h,  #0
+        mov             x3,     #32
+        // Broadcast each block's DC across 4 lanes, zeroing it in memory.
+        ld1r            {v16.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v17.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v18.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v19.4h},  [x1]
+        st1             {v0.h}[0], [x1], x3
+        // v16 = {dc0 x4 | dc1 x4} for the top 4 rows,
+        // v18 = {dc2 x4 | dc3 x4} for the bottom 4 rows.
+        ins             v16.d[1],  v17.d[0]
+        ins             v18.d[1],  v19.d[0]
+        mov             x3,  x0
+        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
+        ld1             {v0.8b},   [x0], x2
+        srshr           v18.8h,    v18.8h,  #3
+        ld1             {v1.8b},   [x0], x2
+        uaddw           v20.8h,    v16.8h, v0.8b
+        ld1             {v2.8b},   [x0], x2
+        uaddw           v0.8h,     v16.8h, v1.8b
+        ld1             {v3.8b},   [x0], x2
+        uaddw           v22.8h,    v16.8h, v2.8b
+        ld1             {v4.8b},   [x0], x2
+        uaddw           v2.8h,     v16.8h, v3.8b
+        ld1             {v5.8b},   [x0], x2
+        uaddw           v24.8h,    v18.8h, v4.8b
+        ld1             {v6.8b},   [x0], x2
+        uaddw           v4.8h,     v18.8h, v5.8b
+        ld1             {v7.8b},   [x0], x2
+        uaddw           v26.8h,    v18.8h, v6.8b
+        // Saturate back to u8 and write the 8 rows via the saved dst in x3.
+        sqxtun          v20.8b,    v20.8h
+        uaddw           v6.8h,     v18.8h, v7.8b
+        sqxtun          v21.8b,    v0.8h
+        sqxtun          v22.8b,    v22.8h
+        st1             {v20.8b},  [x3], x2
+        sqxtun          v23.8b,    v2.8h
+        st1             {v21.8b},  [x3], x2
+        sqxtun          v24.8b,    v24.8h
+        st1             {v22.8b},  [x3], x2
+        sqxtun          v25.8b,    v4.8h
+        st1             {v23.8b},  [x3], x2
+        sqxtun          v26.8b,    v26.8h
+        st1             {v24.8b},  [x3], x2
+        sqxtun          v27.8b,    v6.8h
+        st1             {v25.8b},  [x3], x2
+        st1             {v26.8b},  [x3], x2
+        st1             {v27.8b},  [x3], x2

         ret
 endfunc
@@ -660,23 +770,6 @@ endfunc
         sqrshrun2       \d0\().16b, v22.8h, #7
 .endm
 
-.macro  vp8_epel8_v6    d0,  s0,  s1,  s2, s3, s4, s5
-        uxtl            \s2\().8h, \s2\().8b
-        uxtl            \s3\().8h, \s3\().8b
-        uxtl            \s1\().8h, \s1\().8b
-        uxtl            \s4\().8h, \s4\().8b
-        uxtl            \s0\().8h, \s0\().8b
-        uxtl            \s5\().8h, \s5\().8b
-        mul             \s2\().8h, \s2\().8h, v0.h[2]
-        mul             \s3\().8h, \s3\().8h, v0.h[3]
-        mls             \s2\().8h, \s1\().8h, v0.h[1]
-        mls             \s3\().8h, \s4\().8h, v0.h[4]
-        mla             \s2\().8h, \s0\().8h, v0.h[0]
-        mla             \s3\().8h, \s5\().8h, v0.h[5]
-        sqadd           \s3\().8h, \s2\().8h, \s3\().8h
-        sqrshrun        \d0\().8b, \s3\().8h, #7
-.endm
-
 .macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
         uxtl            \s0\().8h, \s0\().8b
         uxtl            \s3\().8h, \s3\().8b
@@ -743,7 +836,7 @@ endfunc
 
 
 // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
-// arithmatic can be used to apply filters
+// arithmetic can be used to apply filters
 const   subpel_filters, align=4
         .short     0,   6, 123,  12,   1,   0,   0,   0
         .short     2,  11, 108,  36,   8,   1,   0,   0
@@ -833,21 +926,69 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
 2:
         ld1             {v1.8b - v4.8b},    [x7], #32
         ld1             {v16.8b - v19.8b},  [x7], #32
-        ld1             {v20.8b - v23.8b},  [x7]
-        sub             x7,  x7,  #48
+        ld1             {v20.8b - v23.8b},  [x7], #32
+        ld1             {v24.8b - v25.8b},  [x7]
+        sub             x7,  x7,  #64
 
-        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22
-        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23
-        trn1            v2.2d, v5.2d, v2.2d
+        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
+        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
+        trn1            v1.2d, v1.2d, v2.2d
+        trn1            v3.2d, v3.2d, v4.2d
 
-        st1             {v2.16b}, [x0], x1
-        subs            x4, x4, #1
+        st1             {v1.16b}, [x0], x1
+        st1             {v3.16b}, [x0], x1
+        subs            x4, x4, #2
         b.ne            2b
 
         add             sp,  sp,  #336+16
         ret
 endfunc
 
+// 8-wide 6-tap vertical subpel MC.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = vertical filter index (entry w6-1 of subpel_filters, 16 bytes each).
+function ff_put_vp8_epel8_v6_neon, export=1
+        sub             x2,  x2,  x3,  lsl #1           // back up 2 rows for the 6-tap window
+
+        movrel          x7,  subpel_filters, -16
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},  [x6]
+1:
+        // Load the 7 rows needed to produce 2 output rows.
+        ld1             {v2.8b},  [x2], x3
+        ld1             {v3.8b},  [x2], x3
+        ld1             {v4.8b},  [x2], x3
+        ld1             {v5.8b},  [x2], x3
+        ld1             {v6.8b},  [x2], x3
+        ld1             {v7.8b},  [x2], x3
+        ld1             {v28.8b}, [x2]
+
+        sub             x2,  x2,  x3,  lsl #2           // rewind so rows overlap next pass
+
+        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+        st1             {v2.8b}, [x0], x1
+        st1             {v3.8b}, [x0], x1
+        subs            w4,  w4,  #2
+        b.ne            1b
+
+        ret
+endfunc
+
+// 8-wide 6-tap horizontal subpel MC.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = horizontal filter index into subpel_filters.
+function ff_put_vp8_epel8_h6_neon, export=1
+        sub             x2,  x2,  #2                    // back up 2 pixels for the 6-tap window
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},        [x5]
+1:
+        ld1             {v2.8b, v3.8b}, [x2], x3        // 16 source bytes cover 8 filtered pixels
+
+        vp8_epel8_h6    v2,  v2,  v3
+
+        st1             {v2.8b}, [x0], x1
+        subs            w4,  w4,  #1
+        b.ne            1b
+
+        ret
+endfunc
+
 function ff_put_vp8_epel8_h6v6_neon, export=1
         sub             x2,  x2,  x3,  lsl #1
         sub             x2,  x2,  #2
@@ -894,6 +1035,48 @@ function ff_put_vp8_epel8_h6v6_neon, export=1
         ret
 endfunc
 
+// 8-wide 4-tap vertical subpel MC (two output rows per iteration).
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = vertical filter index into subpel_filters.
+function ff_put_vp8_epel8_v4_neon, export=1
+        sub             x2,  x2,  x3                    // back up 1 row for the 4-tap window
+
+        movrel          x7,  subpel_filters, -16
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},     [x6]
+1:
+        // 5 rows in -> 2 rows out; rewind 2 rows so windows overlap.
+        ld1             {v2.8b},     [x2], x3
+        ld1             {v3.8b},     [x2], x3
+        ld1             {v4.8b},     [x2], x3
+        ld1             {v5.8b},     [x2], x3
+        ld1             {v6.8b},     [x2]
+        sub             x2,  x2,  x3,  lsl #1
+
+        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+        st1             {v2.d}[0], [x0], x1
+        st1             {v2.d}[1], [x0], x1
+        subs            w4,  w4,  #2
+        b.ne            1b
+
+        ret
+endfunc
+
+// 8-wide 4-tap horizontal subpel MC.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = horizontal filter index into subpel_filters.
+function ff_put_vp8_epel8_h4_neon, export=1
+        sub             x2,  x2,  #1                    // back up 1 pixel for the 4-tap window
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},       [x5]
+1:
+        ld1             {v2.8b,v3.8b}, [x2], x3
+
+        vp8_epel8_h4    v2,  v2,  v3
+
+        st1             {v2.8b}, [x0], x1
+        subs            w4,  w4,  #1
+        b.ne            1b
+
+        ret
+endfunc
+
 function ff_put_vp8_epel8_h4v6_neon, export=1
         sub             x2,  x2,  x3,  lsl #1
         sub             x2,  x2,  #1
@@ -1029,3 +1212,579 @@ function ff_put_vp8_epel8_h6v4_neon, export=1
         add             sp,  sp,  #168+16
         ret
 endfunc
+
+// 4-wide 6-tap vertical subpel MC, 4 output rows per iteration.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = vertical filter index into subpel_filters.
+function ff_put_vp8_epel4_v6_neon, export=1
+        sub             x2,  x2,  x3,  lsl #1
+
+        movrel          x7,  subpel_filters, -16
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},    [x6]
+1:
+        // Lane 0 of each register holds the window for rows n/n+1,
+        // lane 1 the window two rows further down, so one v6_y2 call
+        // produces 4 output rows.
+        ld1r            {v2.2s},    [x2], x3
+        ld1r            {v3.2s},    [x2], x3
+        ld1r            {v4.2s},    [x2], x3
+        ld1r            {v5.2s},    [x2], x3
+        ld1r            {v6.2s},    [x2], x3
+        ld1r            {v7.2s},    [x2], x3
+        ld1r            {v28.2s},   [x2]
+        sub             x2,  x2,  x3,  lsl #2
+        ld1             {v2.s}[1],  [x2], x3
+        ld1             {v3.s}[1],  [x2], x3
+        ld1             {v4.s}[1],  [x2], x3
+        ld1             {v5.s}[1],  [x2], x3
+        ld1             {v6.s}[1],  [x2], x3
+        ld1             {v7.s}[1],  [x2], x3
+        ld1             {v28.s}[1], [x2]
+        sub             x2,  x2,  x3,  lsl #2
+
+        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+        // Results are interleaved across lanes; store in row order.
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v3.s}[0],  [x0], x1
+        st1             {v2.s}[1],  [x0], x1
+        st1             {v3.s}[1],  [x0], x1
+        subs            w4,  w4,  #4
+        b.ne            1b
+
+        ret
+endfunc
+
+// 4-wide 6-tap horizontal subpel MC (one row per iteration; the 8-wide
+// filter macro is reused and only the low 4 results are stored).
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = horizontal filter index into subpel_filters.
+function ff_put_vp8_epel4_h6_neon, export=1
+        sub             x2,  x2,  #2
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},       [x5]
+1:
+        ld1             {v2.8b,v3.8b}, [x2], x3
+        vp8_epel8_h6    v2,  v2,  v3
+        st1             {v2.s}[0], [x0], x1
+        subs            w4,  w4,  #1
+        b.ne            1b
+
+        ret
+endfunc
+
+// 4-wide 6-tap horizontal + 6-tap vertical subpel MC, two passes via a
+// stack temp: pass 1 writes h+5 horizontally-filtered 4-byte rows to sp,
+// pass 2 runs the vertical filter over them, 4 output rows at a time.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5/w6 = horizontal/vertical filter index into subpel_filters.
+function ff_put_vp8_epel4_h6v6_neon, export=1
+        sub             x2,  x2,  x3,  lsl #1
+        sub             x2,  x2,  #2
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},       [x5]
+
+        sub             sp,  sp,  #52                   // (max h(8) + 5) * 4 bytes of temp
+        add             w8,  w4,  #5
+        mov             x9,  sp
+1:
+        ld1             {v2.8b,v3.8b}, [x2], x3
+        vp8_epel8_h6    v2,  v2,  v3
+        st1             {v2.s}[0],     [x9], #4
+        subs            w8,  w8,  #1
+        b.ne            1b
+
+        // Switch v0 to the vertical filter for pass 2.
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},       [x6]
+        mov             x9,  sp
+2:
+        // Read 11 temp rows twice (offset by one row the second time) and
+        // transpose-interleave them into the two lanes the y2 macro expects.
+        ld1             {v2.8b,v3.8b}, [x9], #16
+        ld1             {v6.8b},       [x9], #8
+        ld1r            {v28.2s},      [x9]
+        sub             x9,  x9,  #16
+        ld1             {v4.8b,v5.8b}, [x9], #16
+        ld1             {v7.8b},       [x9], #8
+        ld1             {v28.s}[1],    [x9]
+        sub             x9,  x9,  #16
+        trn1            v1.2s, v2.2s, v4.2s
+        trn2            v4.2s, v2.2s, v4.2s
+        trn1            v2.2s, v3.2s, v5.2s
+        trn2            v5.2s, v3.2s, v5.2s
+        trn1            v3.2s, v6.2s, v7.2s
+        trn2            v7.2s, v6.2s, v7.2s
+        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v3.s}[0],  [x0], x1
+        st1             {v2.s}[1],  [x0], x1
+        st1             {v3.s}[1],  [x0], x1
+        subs            w4,  w4,  #4
+        b.ne            2b
+
+        add             sp,  sp,  #52
+        ret
+endfunc
+
+// 4-wide 4-tap horizontal + 6-tap vertical subpel MC; same two-pass
+// stack-temp scheme as epel4_h6v6, but pass 1 uses the 4-tap h filter.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5/w6 = horizontal/vertical filter index into subpel_filters.
+function ff_put_vp8_epel4_h4v6_neon, export=1
+        sub             x2,  x2,  x3,  lsl #1
+        sub             x2,  x2,  #1                    // 4-tap window starts 1 pixel left
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},       [x5]
+
+        sub             sp,  sp,  #52                   // (max h(8) + 5) * 4 bytes of temp
+        add             w8,  w4,  #5
+        mov             x9,  sp
+1:
+        ld1             {v2.8b},       [x2], x3
+        vp8_epel8_h4    v2,  v2,  v2
+        st1             {v2.s}[0],     [x9], #4
+        subs            w8,  w8,  #1
+        b.ne            1b
+
+        // Pass 2: vertical 6-tap over the temp buffer, 4 rows at a time.
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},       [x6]
+        mov             x9,  sp
+2:
+        ld1             {v2.8b,v3.8b}, [x9], #16
+        ld1             {v6.8b},       [x9], #8
+        ld1r            {v28.2s},      [x9]
+        sub             x9,  x9,  #16
+        ld1             {v4.8b,v5.8b}, [x9], #16
+        ld1             {v7.8b},       [x9], #8
+        ld1             {v28.s}[1],    [x9]
+        sub             x9,  x9,  #16
+        trn1            v1.2s, v2.2s, v4.2s
+        trn2            v4.2s, v2.2s, v4.2s
+        trn1            v2.2s, v3.2s, v5.2s
+        trn2            v5.2s, v3.2s, v5.2s
+        trn1            v3.2s, v6.2s, v7.2s
+        trn2            v7.2s, v6.2s, v7.2s
+        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v3.s}[0],  [x0], x1
+        st1             {v2.s}[1],  [x0], x1
+        st1             {v3.s}[1],  [x0], x1
+        subs            w4,  w4,  #4
+        b.ne            2b
+
+        add             sp,  sp,  #52
+        ret
+endfunc
+
+// 4-wide 6-tap horizontal + 4-tap vertical subpel MC; two-pass via a
+// 44-byte stack temp holding h+3 horizontally-filtered rows.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5/w6 = horizontal/vertical filter index into subpel_filters.
+function ff_put_vp8_epel4_h6v4_neon, export=1
+        sub             x2,  x2,  x3                    // 4-tap vertical: back up 1 row
+        sub             x2,  x2,  #2
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},       [x5]
+
+        sub             sp,  sp,  #44                   // (max h(8) + 3) * 4 bytes of temp
+        add             w8,  w4,  #3
+        mov             x9,  sp
+1:
+        ld1             {v2.8b,v3.8b}, [x2], x3
+        vp8_epel8_h6    v2, v2, v3
+        st1             {v2.s}[0],     [x9], #4
+        subs            w8,  w8,  #1
+        b.ne            1b
+
+        // Pass 2: vertical 4-tap over the temp buffer, 4 rows at a time.
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},       [x6]
+        mov             x9,  sp
+2:
+        ld1             {v2.8b,v3.8b}, [x9], #16
+        ld1r            {v6.2s},       [x9]
+        sub             x9,  x9,  #8
+        ld1             {v4.8b,v5.8b}, [x9], #16
+        ld1             {v6.s}[1],     [x9]
+        sub             x9,  x9,  #8
+        trn1            v1.2s, v2.2s, v4.2s
+        trn2            v4.2s, v2.2s, v4.2s
+        trn1            v2.2s, v3.2s, v5.2s
+        trn2            v5.2s, v3.2s, v5.2s
+        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
+        // The y2 macro interleaves row pairs; store lanes 0,2,1,3 to
+        // restore row order.
+        st1             {v1.s}[0],  [x0], x1
+        st1             {v1.s}[2],  [x0], x1
+        st1             {v1.s}[1],  [x0], x1
+        st1             {v1.s}[3],  [x0], x1
+        subs            w4,  w4,  #4
+        b.ne            2b
+
+        add             sp,  sp,  #44
+        ret
+endfunc
+
+// 4-wide 4-tap horizontal subpel MC (one row per iteration).
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = horizontal filter index into subpel_filters.
+function ff_put_vp8_epel4_h4_neon, export=1
+        sub             x2,  x2,  #1
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},    [x5]
+1:
+        ld1             {v2.8b},    [x2], x3
+        vp8_epel8_h4    v2,  v2,  v2
+        st1             {v2.s}[0],  [x0], x1
+        subs            w4,  w4,  #1
+        b.ne            1b
+
+        ret
+endfunc
+
+// 4-wide 4-tap vertical subpel MC, 4 output rows per iteration.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = vertical filter index into subpel_filters.
+function ff_put_vp8_epel4_v4_neon, export=1
+        sub             x2,  x2,  x3
+
+        movrel          x7,  subpel_filters, -16
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},   [x6]
+1:
+        // Lane 0 carries the window for rows n/n+1, lane 1 the window two
+        // rows down, so one v4_y2 call yields 4 output rows.
+        ld1r            {v2.2s},   [x2], x3
+        ld1r            {v3.2s},   [x2], x3
+        ld1r            {v4.2s},   [x2], x3
+        ld1r            {v5.2s},   [x2], x3
+        ld1r            {v6.2s},   [x2]
+        sub             x2,  x2,  x3,  lsl #1
+        ld1             {v2.s}[1], [x2], x3
+        ld1             {v3.s}[1], [x2], x3
+        ld1             {v4.s}[1], [x2], x3
+        ld1             {v5.s}[1], [x2], x3
+        ld1             {v6.s}[1], [x2]
+        sub             x2,  x2,  x3,  lsl #1
+
+        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+        // Store lanes 0,2,1,3 to undo the macro's pairwise interleave.
+        st1             {v2.s}[0], [x0], x1
+        st1             {v2.s}[2], [x0], x1
+        st1             {v2.s}[1], [x0], x1
+        st1             {v2.s}[3], [x0], x1
+        subs            w4,  w4,  #4
+        b.ne            1b
+
+        ret
+endfunc
+
+// 4-wide 4-tap horizontal + 4-tap vertical subpel MC; two-pass via a
+// 44-byte stack temp holding h+3 horizontally-filtered rows.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5/w6 = horizontal/vertical filter index into subpel_filters.
+function ff_put_vp8_epel4_h4v4_neon, export=1
+        sub             x2,  x2,  x3
+        sub             x2,  x2,  #1
+
+        movrel          x7,  subpel_filters, -16
+        add             x5,  x7,  w5, uxtw #4
+        ld1             {v0.8h},       [x5]
+
+        sub             sp,  sp,  #44                   // (max h(8) + 3) * 4 bytes of temp
+        add             w8,  w4,  #3
+        mov             x9,  sp
+1:
+        ld1             {v2.8b},       [x2], x3
+        vp8_epel8_h4    v2,  v2,  v3
+        st1             {v2.s}[0],     [x9], #4
+        subs            w8,  w8,  #1
+        b.ne            1b
+
+        // Pass 2: vertical 4-tap over the temp buffer, 4 rows at a time.
+        add             x6,  x7,  w6, uxtw #4
+        ld1             {v0.8h},       [x6]
+        mov             x9,  sp
+2:
+        ld1             {v2.8b,v3.8b}, [x9], #16
+        ld1r            {v6.2s},       [x9]
+        sub             x9,  x9,  #8
+        ld1             {v4.8b,v5.8b}, [x9], #16
+        ld1             {v6.s}[1],     [x9]
+        sub             x9,  x9,  #8
+        trn1            v1.2s, v2.2s, v4.2s
+        trn2            v4.2s, v2.2s, v4.2s
+        trn1            v2.2s, v3.2s, v5.2s
+        trn2            v5.2s, v3.2s, v5.2s
+        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
+        // Store lanes 0,2,1,3 to undo the macro's pairwise interleave.
+        st1             {v1.s}[0], [x0], x1
+        st1             {v1.s}[2], [x0], x1
+        st1             {v1.s}[1], [x0], x1
+        st1             {v1.s}[3], [x0], x1
+        subs            w4,  w4,  #4
+        b.ne            2b
+
+        add             sp,  sp,  #44
+        ret
+endfunc
+
+/* Bilinear MC */
+
+// 16-wide horizontal bilinear MC: dst = (src*(8-mx) + src[+1]*mx + 4) >> 3.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = mx (0..7). Two rows per iteration.
+function ff_put_vp8_bilin16_h_neon, export=1
+        mov             w7,     #8
+        dup             v0.8b,  w5                      // weight of the right pixel (mx)
+        sub             w5,     w7,     w5
+        dup             v1.8b,  w5                      // weight of the left pixel (8 - mx)
+1:
+        subs            w4,     w4,     #2
+        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
+        ext             v5.8b,  v3.8b,  v4.8b,  #1      // src+1 for the high 8 pixels
+        ext             v4.8b,  v2.8b,  v3.8b,  #1      // src+1 for the low 8 pixels
+        umull           v16.8h, v2.8b,  v1.8b
+        umlal           v16.8h, v4.8b,  v0.8b
+        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3
+        umull           v6.8h,  v3.8b,  v1.8b
+        umlal           v6.8h,  v5.8b,  v0.8b
+        ext             v21.8b, v19.8b, v20.8b, #1
+        ext             v20.8b, v18.8b, v19.8b, #1
+        umull           v22.8h, v18.8b, v1.8b
+        umlal           v22.8h, v20.8b, v0.8b
+        umull           v24.8h, v19.8b, v1.8b
+        umlal           v24.8h, v21.8b, v0.8b
+        // Rounding narrow by 3 completes the (a*w0 + b*w1 + 4) >> 3 average.
+        rshrn           v4.8b,  v16.8h, #3
+        rshrn2          v4.16b, v6.8h,  #3
+        rshrn           v6.8b,  v22.8h, #3
+        rshrn2          v6.16b, v24.8h, #3
+        st1             {v4.16b}, [x0], x1
+        st1             {v6.16b}, [x0], x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 16-wide vertical bilinear MC: dst = (row*(8-my) + nextrow*my + 4) >> 3.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = my (0..7). Two rows per iteration; v2 carries the last row over.
+function ff_put_vp8_bilin16_v_neon, export=1
+        mov             w7,     #8
+        dup             v0.16b, w6                      // weight of the lower row (my)
+        sub             w6,     w7,     w6
+        dup             v1.16b, w6                      // weight of the upper row (8 - my)
+
+        ld1             {v2.16b}, [x2], x3
+1:
+        subs            w4,     w4,     #2
+        ld1             {v4.16b}, [x2], x3
+        umull           v6.8h,  v2.8b,  v1.8b
+        umlal           v6.8h,  v4.8b,  v0.8b
+        umull2          v16.8h, v2.16b, v1.16b
+        umlal2          v16.8h, v4.16b, v0.16b
+        ld1             {v2.16b}, [x2], x3              // also the carry row for next pass
+        umull           v18.8h, v4.8b,  v1.8b
+        umlal           v18.8h, v2.8b,  v0.8b
+        umull2          v20.8h, v4.16b, v1.16b
+        umlal2          v20.8h, v2.16b, v0.16b
+        rshrn           v4.8b,  v6.8h,  #3
+        rshrn2          v4.16b, v16.8h, #3
+        rshrn           v6.8b,  v18.8h, #3
+        rshrn2          v6.16b, v20.8h, #3
+        st1             {v4.16b}, [x0], x1
+        st1             {v6.16b}, [x0], x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 16-wide 2-D bilinear MC: horizontal blend by mx, then vertical blend of
+// consecutive horizontally-filtered rows by my, each with +4 >> 3 rounding.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = mx, w6 = my. v4 carries the previous h-filtered row across passes.
+function ff_put_vp8_bilin16_hv_neon, export=1
+        mov             w7,      #8
+        dup             v0.8b,   w5            // mx
+        sub             w5,      w7,     w5
+        dup             v1.8b,   w5
+        dup             v2.16b,  w6            // my
+        sub             w6,      w7,     w6
+        dup             v3.16b,  w6
+
+        // Prime the pipeline: h-filter row 0 into v4.
+        ld1             {v4.8b,v5.8b,v6.8b},    [x2], x3
+
+        ext             v7.8b,   v5.8b,  v6.8b, #1
+        ext             v6.8b,   v4.8b,  v5.8b, #1
+        umull           v16.8h,  v4.8b,  v1.8b
+        umlal           v16.8h,  v6.8b,  v0.8b
+        umull           v18.8h,  v5.8b,  v1.8b
+        umlal           v18.8h,  v7.8b,  v0.8b
+        rshrn           v4.8b,   v16.8h, #3
+        rshrn2          v4.16b,  v18.8h, #3
+1:
+        subs            w4,  w4,  #2
+        // H-filter the next two rows into v6 and v4.
+        ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
+        ext             v21.8b,  v19.8b, v20.8b, #1
+        ext             v20.8b,  v18.8b, v19.8b, #1
+        umull           v22.8h,  v18.8b, v1.8b
+        umlal           v22.8h,  v20.8b, v0.8b
+        ld1             {v26.8b,v27.8b,v28.8b},  [x2], x3
+        umull           v24.8h,  v19.8b, v1.8b
+        umlal           v24.8h,  v21.8b, v0.8b
+        ext             v29.8b,  v27.8b, v28.8b, #1
+        ext             v28.8b,  v26.8b, v27.8b, #1
+        umull           v16.8h,  v26.8b, v1.8b
+        umlal           v16.8h,  v28.8b, v0.8b
+        umull           v18.8h,  v27.8b, v1.8b
+        umlal           v18.8h,  v29.8b, v0.8b
+        rshrn           v6.8b,   v22.8h, #3
+        rshrn2          v6.16b,  v24.8h, #3
+        // V-filter: blend previous h row (v4) with v6, then v6 with new v4.
+        umull           v24.8h,  v4.8b,  v3.8b
+        umlal           v24.8h,  v6.8b,  v2.8b
+        umull2          v30.8h,  v4.16b, v3.16b
+        umlal2          v30.8h,  v6.16b, v2.16b
+        rshrn           v4.8b,   v16.8h, #3
+        rshrn2          v4.16b,  v18.8h, #3
+        umull           v20.8h,  v6.8b,  v3.8b
+        umlal           v20.8h,  v4.8b,  v2.8b
+        umull2          v22.8h,  v6.16b, v3.16b
+        umlal2          v22.8h,  v4.16b, v2.16b
+        rshrn           v24.8b,  v24.8h, #3
+        rshrn2          v24.16b, v30.8h, #3
+        st1             {v24.16b}, [x0], x1
+        rshrn           v20.8b,  v20.8h, #3
+        rshrn2          v20.16b, v22.8h, #3
+        st1             {v20.16b}, [x0], x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 8-wide horizontal bilinear MC: dst = (src*(8-mx) + src[+1]*mx + 4) >> 3.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = mx (0..7). Two rows per iteration.
+function ff_put_vp8_bilin8_h_neon, export=1
+        mov             w7,     #8
+        dup             v0.8b,  w5                      // mx weight
+        sub             w5,     w7,     w5
+        dup             v1.8b,  w5                      // 8 - mx weight
+1:
+        subs            w4,     w4,     #2
+        ld1             {v2.8b,v3.8b},  [x2],  x3
+        ext             v3.8b,  v2.8b,  v3.8b, #1       // src shifted by one pixel
+        umull           v4.8h,  v2.8b,  v1.8b
+        umlal           v4.8h,  v3.8b,  v0.8b
+        ld1             {v6.8b,v7.8b},  [x2],  x3
+        ext             v7.8b,  v6.8b,  v7.8b, #1
+        umull           v16.8h, v6.8b,  v1.8b
+        umlal           v16.8h, v7.8b,  v0.8b
+        rshrn           v4.8b,  v4.8h,  #3
+        rshrn           v16.8b, v16.8h, #3
+        st1             {v4.8b},  [x0], x1
+        st1             {v16.8b}, [x0], x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 8-wide vertical bilinear MC: dst = (row*(8-my) + nextrow*my + 4) >> 3.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = my (0..7). Two rows per iteration; v2 carries the last row over.
+function ff_put_vp8_bilin8_v_neon, export=1
+        mov             w7,      #8
+        dup             v0.8b,   w6                     // my weight
+        sub             w6,      w7,    w6
+        dup             v1.8b,   w6                     // 8 - my weight
+
+        ld1             {v2.8b}, [x2],  x3
+1:
+        subs            w4,      w4,    #2
+        ld1             {v3.8b}, [x2],  x3
+        umull           v4.8h,   v2.8b, v1.8b
+        umlal           v4.8h,   v3.8b, v0.8b
+        ld1             {v2.8b}, [x2],  x3              // also the carry row for next pass
+        umull           v6.8h,   v3.8b, v1.8b
+        umlal           v6.8h,   v2.8b, v0.8b
+        rshrn           v4.8b,   v4.8h, #3
+        rshrn           v6.8b,   v6.8h, #3
+        st1             {v4.8b}, [x0],  x1
+        st1             {v6.8b}, [x0],  x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 8-wide 2-D bilinear MC: horizontal blend by mx, then vertical blend by
+// my, each with +4 >> 3 rounding.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = mx, w6 = my. v22 carries the previous h-filtered row across passes.
+function ff_put_vp8_bilin8_hv_neon, export=1
+        mov             w7,     #8
+        dup             v0.8b,  w5             // mx
+        sub             w5,     w7,     w5
+        dup             v1.8b,  w5
+        dup             v2.8b,  w6             // my
+        sub             w6,     w7,     w6
+        dup             v3.8b,  w6
+
+        // Prime the pipeline: h-filter row 0 into v22.
+        ld1             {v4.8b,v5.8b},  [x2],  x3
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        umull           v18.8h, v4.8b,  v1.8b
+        umlal           v18.8h, v5.8b,  v0.8b
+        rshrn           v22.8b, v18.8h, #3
+1:
+        subs            w4,     w4,     #2
+        ld1             {v6.8b,v7.8b},  [x2],  x3
+        ext             v7.8b,  v6.8b,  v7.8b, #1
+        umull           v16.8h, v6.8b,  v1.8b
+        umlal           v16.8h, v7.8b,  v0.8b
+        ld1             {v4.8b,v5.8b},  [x2],  x3
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        umull           v18.8h, v4.8b,  v1.8b
+        umlal           v18.8h, v5.8b,  v0.8b
+        rshrn           v16.8b, v16.8h, #3
+        // V-filter: previous h row (v22) with v16, then v16 with new v22.
+        umull           v20.8h, v22.8b, v3.8b
+        umlal           v20.8h, v16.8b, v2.8b
+        rshrn           v22.8b, v18.8h, #3
+        umull           v24.8h, v16.8b, v3.8b
+        umlal           v24.8h, v22.8b, v2.8b
+        rshrn           v20.8b, v20.8h, #3
+        st1             {v20.8b}, [x0], x1
+        rshrn           v23.8b, v24.8h, #3
+        st1             {v23.8b}, [x0], x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 4-wide horizontal bilinear MC, two rows per iteration packed into the
+// two 32-bit lanes of one d-register.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = mx (0..7).
+function ff_put_vp8_bilin4_h_neon, export=1
+        mov             w7,      #8
+        dup             v0.8b,   w5                     // mx weight
+        sub             w5,      w7,     w5
+        dup             v1.8b,   w5                     // 8 - mx weight
+1:
+        subs            w4,      w4,     #2
+        ld1             {v2.8b}, [x2],   x3
+        // The ext shifts in a stale byte at the top, but only the low
+        // 4 results of each lane pair are stored, so it never escapes.
+        ext             v3.8b,   v2.8b,  v3.8b,  #1
+        ld1             {v6.8b}, [x2],   x3
+        ext             v7.8b,   v6.8b,  v7.8b,  #1
+        trn1            v2.2s,   v2.2s,  v6.2s          // pack both rows into one register
+        trn1            v3.2s,   v3.2s,  v7.2s
+        umull           v4.8h,   v2.8b,  v1.8b
+        umlal           v4.8h,   v3.8b,  v0.8b
+        rshrn           v4.8b,   v4.8h,  #3
+        st1             {v4.s}[0], [x0], x1
+        st1             {v4.s}[1], [x0], x1
+        b.gt            1b
+
+        ret
+endfunc
+
+// 4-wide vertical bilinear MC, two rows per iteration packed into the two
+// 32-bit lanes of one d-register.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w6 = my (0..7).
+function ff_put_vp8_bilin4_v_neon, export=1
+        mov             w7,     #8
+        dup             v0.8b,  w6                      // my weight
+        sub             w6,     w7,  w6
+        dup             v1.8b,  w6                      // 8 - my weight
+
+        ld1r            {v2.2s},    [x2], x3
+1:
+        // v2 = {prev row, cur row}, v3 = {cur row, next row}.
+        ld1r            {v3.2s},   [x2]
+        ld1             {v2.s}[1], [x2], x3
+        ld1             {v3.s}[1], [x2], x3
+        umull           v4.8h,  v2.8b,  v1.8b
+        umlal           v4.8h,  v3.8b,  v0.8b
+        trn2            v2.2s,  v3.2s,  v2.2s           // carry the last rows into next pass
+        rshrn           v4.8b,  v4.8h,  #3
+        st1             {v4.s}[0], [x0], x1
+        st1             {v4.s}[1], [x0], x1
+        subs            w4,     w4,     #2
+        b.gt            1b
+
+        ret
+endfunc
+
+// 4-wide 2-D bilinear MC: horizontal blend by mx, vertical blend by my,
+// two output rows per iteration in the two 32-bit lanes of one register.
+// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
+// w5 = mx, w6 = my. v22 carries the previous h-filtered row across passes.
+function ff_put_vp8_bilin4_hv_neon, export=1
+        mov             w7,      #8
+        dup             v0.8b,   w5             // mx
+        sub             w5,      w7,     w5
+        dup             v1.8b,   w5
+        dup             v2.8b,   w6             // my
+        sub             w6,      w7,     w6
+        dup             v3.8b,   w6
+
+        // Prime the pipeline: h-filter row 0 into v22.
+        ld1             {v4.8b}, [x2],   x3
+        ext             v5.8b,   v4.8b,  v4.8b,  #1
+        umull           v18.8h,  v4.8b,  v1.8b
+        umlal           v18.8h,  v5.8b,  v0.8b
+        rshrn           v22.8b,  v18.8h, #3
+1:
+        subs            w4,      w4,     #2
+        ld1             {v6.8b}, [x2],   x3
+        ext             v7.8b,   v6.8b,  v6.8b,  #1
+        ld1             {v4.8b}, [x2],   x3
+        ext             v5.8b,   v4.8b,  v4.8b,  #1
+        trn1            v6.2s,   v6.2s,  v4.2s          // pack both new rows into one register
+        trn1            v7.2s,   v7.2s,  v5.2s
+        umull           v16.8h,  v6.8b,  v1.8b
+        umlal           v16.8h,  v7.8b,  v0.8b
+        rshrn           v16.8b,  v16.8h, #3
+        // V-filter against {prev h row, first new h row} packed in v22.
+        umull           v20.8h,  v16.8b, v2.8b
+        trn1            v22.2s,  v22.2s, v16.2s
+        umlal           v20.8h,  v22.8b, v3.8b
+        rev64           v22.2s,  v16.2s                 // carry the last h row for next pass
+        rshrn           v20.8b,  v20.8h, #3
+        st1             {v20.s}[0], [x0], x1
+        st1             {v20.s}[1], [x0], x1
+        b.gt            1b
+
+        ret
+endfunc