git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9mc_neon.S

   1 /*
   2  * Copyright (c) 2016 Google Inc.
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/aarch64/asm.S"
  22
  23 // All public functions in this file have the following signature:
  24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
  25 //                            const uint8_t *ref, ptrdiff_t ref_stride,
  26 //                            int h, int mx, int my);
  27
  28 function ff_vp9_copy64_aarch64, export=1
     // Copy a 64 pixel wide block from ref to dst, one 64 byte row per
     // iteration, using only general purpose registers (x5-x12 as scratch).
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // The loop only exits when the counter reaches exactly zero, so h
     // must be nonzero.
  29 1:
  30         ldp             x5,  x6,  [x2]              // ref bytes 0-15
  31         ldp             x7,  x8,  [x2, #16]         // ref bytes 16-31
  32         stp             x5,  x6,  [x0]
  33         ldp             x9,  x10, [x2, #32]         // ref bytes 32-47
  34         stp             x7,  x8,  [x0, #16]
  35         subs            w4,  w4,  #1                // flags are consumed by the b.ne below
  36         ldp             x11, x12, [x2, #48]         // ref bytes 48-63
  37         stp             x9,  x10, [x0, #32]
  38         stp             x11, x12, [x0, #48]
  39         add             x2,  x2,  x3                // next ref row
  40         add             x0,  x0,  x1                // next dst row
  41         b.ne            1b
  42         ret
  43 endfunc
  44
  45 function ff_vp9_avg64_neon, export=1
     // Average a 64 pixel wide block of ref into dst with rounding
     // (urhadd = unsigned rounding halving add), two rows per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // The counter is decremented by 2 and the loop only exits on exactly
     // zero, so h must be a nonzero multiple of 2.
  46         mov             x5,  x0                     // x5 = dst store pointer; x0 is advanced by the dst loads
  47 1:
  48         ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
  49         ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
  50         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
  51         urhadd          v0.16b,  v0.16b,  v4.16b
  52         urhadd          v1.16b,  v1.16b,  v5.16b
  53         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
  54         urhadd          v2.16b,  v2.16b,  v6.16b
  55         urhadd          v3.16b,  v3.16b,  v7.16b
  56         subs            w4,  w4,  #2                // flags are consumed by the b.ne below
  57         urhadd          v16.16b, v16.16b, v20.16b
  58         urhadd          v17.16b, v17.16b, v21.16b
  59         st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
  60         urhadd          v18.16b, v18.16b, v22.16b
  61         urhadd          v19.16b, v19.16b, v23.16b
  62         st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
  63         b.ne            1b
  64         ret
  65 endfunc
  66
  67 function ff_vp9_copy32_aarch64, export=1
     // Copy a 32 pixel wide block from ref to dst, one 32 byte row per
     // iteration, using general purpose registers x5-x8 as scratch.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // h must be nonzero, since the loop only exits on exactly zero.
  68 1:
  69         ldp             x5,  x6,  [x2]              // ref bytes 0-15
  70         ldp             x7,  x8,  [x2, #16]         // ref bytes 16-31
  71         stp             x5,  x6,  [x0]
  72         subs            w4,  w4,  #1                // flags are consumed by the b.ne below
  73         stp             x7,  x8,  [x0, #16]
  74         add             x2,  x2,  x3                // next ref row
  75         add             x0,  x0,  x1                // next dst row
  76         b.ne            1b
  77         ret
  78 endfunc
  79
  80 function ff_vp9_avg32_neon, export=1
     // Average a 32 pixel wide block of ref into dst with rounding
     // (urhadd), one row per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // h must be nonzero, since the loop only exits on exactly zero.
  81 1:
  82         ld1             {v2.16b, v3.16b},  [x2], x3
  83         ld1             {v0.16b, v1.16b},  [x0]     // no postincrement: the same row is stored below
  84         urhadd          v0.16b,  v0.16b,  v2.16b
  85         urhadd          v1.16b,  v1.16b,  v3.16b
  86         subs            w4,  w4,  #1                // flags are consumed by the b.ne below
  87         st1             {v0.16b, v1.16b},  [x0], x1
  88         b.ne            1b
  89         ret
  90 endfunc
  91
  92 function ff_vp9_copy16_neon, export=1
     // Copy a 16 pixel wide block from ref to dst, four rows per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // x0/x2 walk rows 0, 2, ... and x5/x6 walk rows 1, 3, ..., each with
     // a doubled stride. The counter is decremented by 4 and the loop only
     // exits on exactly zero, so h must be a nonzero multiple of 4.
  93         add             x5,  x0,  x1                // x5 = second dst row
  94         lsl             x1,  x1,  #1                // dst stride * 2
  95         add             x6,  x2,  x3                // x6 = second ref row
  96         lsl             x3,  x3,  #1                // ref stride * 2
  97 1:
  98         ld1             {v0.16b},  [x2], x3
  99         ld1             {v1.16b},  [x6], x3
 100         ld1             {v2.16b},  [x2], x3
 101         ld1             {v3.16b},  [x6], x3
 102         subs            w4,  w4,  #4                // flags are consumed by the b.ne below
 103         st1             {v0.16b},  [x0], x1
 104         st1             {v1.16b},  [x5], x1
 105         st1             {v2.16b},  [x0], x1
 106         st1             {v3.16b},  [x5], x1
 107         b.ne            1b
 108         ret
 109 endfunc
 110
 111 function ff_vp9_avg16_neon, export=1
     // Average a 16 pixel wide block of ref into dst with rounding
     // (urhadd), two rows per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // h must be a nonzero multiple of 2 (counter decremented by 2, loop
     // exits only on exactly zero).
 112         mov             x5,  x0                     // x5 = dst store pointer; x0 is advanced by the dst loads
 113 1:
 114         ld1             {v2.16b},  [x2], x3
 115         ld1             {v0.16b},  [x0], x1
 116         ld1             {v3.16b},  [x2], x3
 117         urhadd          v0.16b,  v0.16b,  v2.16b
 118         ld1             {v1.16b},  [x0], x1
 119         urhadd          v1.16b,  v1.16b,  v3.16b
 120         subs            w4,  w4,  #2                // flags are consumed by the b.ne below
 121         st1             {v0.16b},  [x5], x1
 122         st1             {v1.16b},  [x5], x1
 123         b.ne            1b
 124         ret
 125 endfunc
 126
 127 function ff_vp9_copy8_neon, export=1
     // Copy an 8 pixel wide block from ref to dst, two rows per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // h must be a nonzero multiple of 2 (counter decremented by 2, loop
     // exits only on exactly zero).
 128 1:
 129         ld1             {v0.8b},  [x2], x3
 130         ld1             {v1.8b},  [x2], x3
 131         subs            w4,  w4,  #2                // flags are consumed by the b.ne below
 132         st1             {v0.8b},  [x0], x1
 133         st1             {v1.8b},  [x0], x1
 134         b.ne            1b
 135         ret
 136 endfunc
 137
 138 function ff_vp9_avg8_neon, export=1
     // Average an 8 pixel wide block of ref into dst with rounding
     // (urhadd), two rows per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // h must be a nonzero multiple of 2 (counter decremented by 2, loop
     // exits only on exactly zero).
 139         mov             x5,  x0                     // x5 = dst store pointer; x0 is advanced by the dst loads
 140 1:
 141         ld1             {v2.8b},  [x2], x3
 142         ld1             {v0.8b},  [x0], x1
 143         ld1             {v3.8b},  [x2], x3
 144         urhadd          v0.8b,  v0.8b,  v2.8b
 145         ld1             {v1.8b},  [x0], x1
 146         urhadd          v1.8b,  v1.8b,  v3.8b
 147         subs            w4,  w4,  #2                // flags are consumed by the b.ne below
 148         st1             {v0.8b},  [x5], x1
 149         st1             {v1.8b},  [x5], x1
 150         b.ne            1b
 151         ret
 152 endfunc
 153
 154 function ff_vp9_copy4_neon, export=1
     // Copy a 4 pixel wide block from ref to dst, four rows per iteration.
     // Each row is moved as one 32 bit lane (lane 0 of v0-v3).
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // h must be a nonzero multiple of 4 (counter decremented by 4, loop
     // exits only on exactly zero).
 155 1:
 156         ld1             {v0.s}[0], [x2], x3
 157         ld1             {v1.s}[0], [x2], x3
 158         st1             {v0.s}[0], [x0], x1
 159         ld1             {v2.s}[0], [x2], x3
 160         st1             {v1.s}[0], [x0], x1
 161         ld1             {v3.s}[0], [x2], x3
 162         subs            w4,  w4,  #4                // flags are consumed by the b.ne below
 163         st1             {v2.s}[0], [x0], x1
 164         st1             {v3.s}[0], [x0], x1
 165         b.ne            1b
 166         ret
 167 endfunc
 168
 169 function ff_vp9_avg4_neon, export=1
     // Average a 4 pixel wide block of ref into dst with rounding
     // (urhadd), four rows per iteration.
     // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
     // Two 4 byte rows are packed into the two 32 bit lanes of each 64 bit
     // vector: v2/v3 hold ref rows 0-1/2-3, v0/v1 hold dst rows 0-1/2-3.
     // h must be a nonzero multiple of 4 (counter decremented by 4, loop
     // exits only on exactly zero).
 170         mov             x5,  x0                     // x5 = dst store pointer; x0 is advanced by the dst loads
 171 1:
 172         ld1             {v2.s}[0], [x2], x3
 173         ld1             {v0.s}[0], [x0], x1
 174         ld1             {v2.s}[1], [x2], x3
 175         ld1             {v0.s}[1], [x0], x1
 176         ld1             {v3.s}[0], [x2], x3
 177         ld1             {v1.s}[0], [x0], x1
 178         ld1             {v3.s}[1], [x2], x3
 179         ld1             {v1.s}[1], [x0], x1
 180         subs            w4,  w4,  #4                // flags are consumed by the b.ne below
 181         urhadd          v0.8b,  v0.8b,  v2.8b
 182         urhadd          v1.8b,  v1.8b,  v3.8b
 183         st1             {v0.s}[0], [x5], x1
 184         st1             {v0.s}[1], [x5], x1
 185         st1             {v1.s}[0], [x5], x1
 186         st1             {v1.s}[1], [x5], x1
 187         b.ne            1b
 188         ret
 189 endfunc
 190
 191
 192 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
 193 // for size >= 16), and multiply-accumulate into dst1 and dst3 (or
 194 // dst1-dst2 and dst3-dst4 for size >= 16)
     // The sources hold 16 bit elements, so the byte offset passed to ext is
     // 2*offset. The multiplier is the filter coefficient in v0.h[offset].
     // Clobbers v20 and v22, plus v21 and v23 for size >= 16. dst2, dst4,
     // src3 and src6 are not referenced for size < 16.
 195 .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
 196         ext             v20.16b, \src1, \src2, #(2*\offset)
 197         ext             v22.16b, \src4, \src5, #(2*\offset)
 198 .if \size >= 16
 199         mla             \dst1, v20.8h, v0.h[\offset]
 200         ext             v21.16b, \src2, \src3, #(2*\offset)
 201         mla             \dst3, v22.8h, v0.h[\offset]
 202         ext             v23.16b, \src5, \src6, #(2*\offset)
 203         mla             \dst2, v21.8h, v0.h[\offset]
 204         mla             \dst4, v23.8h, v0.h[\offset]
 205 .else
 206         mla             \dst1, v20.8h, v0.h[\offset]
 207         mla             \dst3, v22.8h, v0.h[\offset]
 208 .endif
 209 .endm
 210 // The same as above, but don't accumulate straight into the
 211 // destination, but use a temp register and accumulate with saturation.
     // The products are formed in place in v20-v23 (v21/v23 only for
     // size >= 16) and then added to the destinations with sqadd, a signed
     // saturating add. Takes the same arguments as the extmla macro.
 212 .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
 213         ext             v20.16b, \src1, \src2, #(2*\offset)
 214         ext             v22.16b, \src4, \src5, #(2*\offset)
 215 .if \size >= 16
 216         mul             v20.8h, v20.8h, v0.h[\offset]
 217         ext             v21.16b, \src2, \src3, #(2*\offset)
 218         mul             v22.8h, v22.8h, v0.h[\offset]
 219         ext             v23.16b, \src5, \src6, #(2*\offset)
 220         mul             v21.8h, v21.8h, v0.h[\offset]
 221         mul             v23.8h, v23.8h, v0.h[\offset]
 222 .else
 223         mul             v20.8h, v20.8h, v0.h[\offset]
 224         mul             v22.8h, v22.8h, v0.h[\offset]
 225 .endif
 226         sqadd           \dst1, \dst1, v20.8h
 227         sqadd           \dst3, \dst3, v22.8h
 228 .if \size >= 16
 229         sqadd           \dst2, \dst2, v21.8h
 230         sqadd           \dst4, \dst4, v23.8h
 231 .endif
 232 .endm
 233
 234
 235 // Instantiate a horizontal filter function for the given size.
 236 // This can work on 4, 8 or 16 pixels in parallel; for larger
 237 // widths it will do 16 pixels at a time and loop horizontally.
 238 // The actual width is passed in x5, the height in w4 and the
 239 // filter coefficients in x9. idx2 is the index of the largest
 240 // filter coefficient (3 or 4) and idx1 is the other one of them.
     // Two rows are filtered per vertical iteration: x0/x2 address the
     // first dst/src row and x6/x7 the second one, so the height must be
     // a nonzero multiple of 2. For size >= 16, x9 is reused as the
     // horizontal counter once the filter has been loaded into v0.
     // Labels: 1 = next pair of rows, 2 = next 16 pixels within the rows,
     // 3 = advance the pointers to the next pair of rows.
 241 .macro do_8tap_h type, size, idx1, idx2
 242 function \type\()_8tap_\size\()h_\idx1\idx2
 243         sub             x2,  x2,  #3                // the 8 tap window starts 3 pixels to the left
 244         add             x6,  x0,  x1                // x6 = second dst row
 245         add             x7,  x2,  x3                // x7 = second src row
 246         add             x1,  x1,  x1                // dst stride * 2
 247         add             x3,  x3,  x3                // src stride * 2
 248         // Only size >= 16 loops horizontally and needs
 249         // reduced dst stride
 250 .if \size >= 16
 251         sub             x1,  x1,  x5
 252 .endif
 253         // size >= 16 postincrements x2/x7 while loading: 24 bytes up
 254         // front and 16 more for each further 16 pixel step, i.e.
 255         // width + 8 bytes per row, so the src stride is reduced by
             // that amount. For size 4/8 a single 16 byte load without
             // postincrement is enough.
 256 .if \size >= 16
 257         sub             x3,  x3,  x5
 258         sub             x3,  x3,  #8
 259 .endif
 260         // Load the filter vector
 261         ld1             {v0.8b},  [x9]
 262         sxtl            v0.8h,  v0.8b               // sign extend the 8 bit coefficients to 16 bit
 263 1:
 264 .if \size >= 16
 265         mov             x9,  x5                     // x9 = pixels left in this pair of rows
 266 .endif
 267         // Load src
 268 .if \size >= 16
 269         ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
 270         ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
 271 .else
 272         ld1             {v4.8b,  v5.8b},  [x2]
 273         ld1             {v16.8b, v17.8b}, [x7]
 274 .endif
 275         uxtl            v4.8h,  v4.8b
 276         uxtl            v5.8h,  v5.8b
 277         uxtl            v16.8h, v16.8b
 278         uxtl            v17.8h, v17.8b
 279 .if \size >= 16
 280         uxtl            v6.8h,  v6.8b
 281         uxtl            v18.8h, v18.8b
 282 .endif
 283 2:
 284
 285         // Accumulate, adding idx2 last with a separate
 286         // saturating add. The positive filter coefficients
 287         // for all indices except idx2 must add up to less
 288         // than 127 for this not to overflow.
 289         mul             v1.8h,  v4.8h,  v0.h[0]
 290         mul             v24.8h, v16.8h, v0.h[0]
 291 .if \size >= 16
 292         mul             v2.8h,  v5.8h,  v0.h[0]
 293         mul             v25.8h, v17.8h, v0.h[0]
 294 .endif
 295         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
 296         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
 297         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
 298         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
 299         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
 300         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
 301         extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
 302
 303         // Round, shift and saturate
 304         sqrshrun        v1.8b,   v1.8h,  #7
 305         sqrshrun        v24.8b,  v24.8h, #7
 306 .if \size >= 16
 307         sqrshrun2       v1.16b,  v2.8h,  #7
 308         sqrshrun2       v24.16b, v25.8h, #7
 309 .endif
 310         // Average
 311 .ifc \type,avg
 312 .if \size >= 16
 313         ld1             {v2.16b}, [x0]
 314         ld1             {v3.16b}, [x6]
 315         urhadd          v1.16b,  v1.16b,  v2.16b
 316         urhadd          v24.16b, v24.16b, v3.16b
 317 .elseif \size == 8
 318         ld1             {v2.8b},  [x0]
 319         ld1             {v3.8b},  [x6]
 320         urhadd          v1.8b,  v1.8b,  v2.8b
 321         urhadd          v24.8b, v24.8b, v3.8b
 322 .else
 323         ld1             {v2.s}[0], [x0]
 324         ld1             {v3.s}[0], [x6]
 325         urhadd          v1.8b,  v1.8b,  v2.8b
 326         urhadd          v24.8b, v24.8b, v3.8b
 327 .endif
 328 .endif
 329         // Store and loop horizontally (for size >= 16)
 330 .if \size >= 16
 331         subs            x9,  x9,  #16
 332         st1             {v1.16b},  [x0], #16
 333         st1             {v24.16b}, [x6], #16
 334         b.eq            3f
             // Slide the window 16 pixels to the right: the last 8
             // widened pixels become the first 8, and the next 16 source
             // bytes are loaded and widened into the other two registers.
 335         mov             v4.16b,  v6.16b
 336         mov             v16.16b, v18.16b
 337         ld1             {v6.16b},  [x2], #16
 338         ld1             {v18.16b}, [x7], #16
 339         uxtl            v5.8h,  v6.8b
 340         uxtl2           v6.8h,  v6.16b
 341         uxtl            v17.8h, v18.8b
 342         uxtl2           v18.8h, v18.16b
 343         b               2b
 344 .elseif \size == 8
 345         st1             {v1.8b},    [x0]
 346         st1             {v24.8b},   [x6]
 347 .else // \size == 4
 348         st1             {v1.s}[0],  [x0]
 349         st1             {v24.s}[0], [x6]
 350 .endif
 351 3:
 352         // Loop vertically
 353         add             x0,  x0,  x1
 354         add             x6,  x6,  x1
 355         add             x2,  x2,  x3
 356         add             x7,  x7,  x3
 357         subs            w4,  w4,  #2
 358         b.ne            1b
 359         ret
 360 endfunc
 361 .endm
 362
 363 .macro do_8tap_h_size size
     // Instantiate the four horizontal filter cores for one size: put and
     // avg, each with the largest coefficient at index 4 (idx1 = 3,
     // idx2 = 4) or at index 3 (idx1 = 4, idx2 = 3).
 364 do_8tap_h put, \size, 3, 4
 365 do_8tap_h avg, \size, 3, 4
 366 do_8tap_h put, \size, 4, 3
 367 do_8tap_h avg, \size, 4, 3
 368 .endm
 369
     // Only sizes 4, 8 and 16 get their own core; the exported functions
     // for widths 32 and 64 branch to the 16 pixel one.
 370 do_8tap_h_size 4
 371 do_8tap_h_size 8
 372 do_8tap_h_size 16
 373
 374 .macro do_8tap_h_func type, filter, offset, size
     // Exported entry point for one horizontal filter type and block width.
     // In (vp9_mc_func signature): x0 = dst, x1 = dst_stride, x2 = ref,
     // x3 = ref_stride, w4 = h, w5 = mx.
     // Computes x9 = ff_vp9_subpel_filters + 120*offset - 8 + 8*mx, sets
     // x5 to the block width and tail-branches to the matching core.
     // The 8 byte step per mx matches the 8 coefficients of 8 bits each
     // that the core loads. NOTE(review): the 120 byte bank size and the
     // -8 bias presumably mean 15 filters per bank with mx starting at 1;
     // confirm against the table definition.
 375 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
 376         movrel          x6,  X(ff_vp9_subpel_filters), 120*\offset - 8
 377         cmp             w5,  #8                     // mx >= 8 selects the _34 variant below
 378         add             x9,  x6,  w5, uxtw #3       // does not touch the flags from cmp
 379         mov             x5,  #\size                 // mx is no longer needed; x5 = width
 380 .if \size >= 16
 381         b.ge            \type\()_8tap_16h_34
 382         b               \type\()_8tap_16h_43
 383 .else
 384         b.ge            \type\()_8tap_\size\()h_34
 385         b               \type\()_8tap_\size\()h_43
 386 .endif
 387 endfunc
 388 .endm
 389
 390 .macro do_8tap_h_filters size
     // Instantiate the exported put/avg horizontal functions of one block
     // width for the three filter types. The third argument is the filter
     // bank index used in the table lookup: smooth = 0, regular = 1,
     // sharp = 2.
 391 do_8tap_h_func put, regular, 1, \size
 392 do_8tap_h_func avg, regular, 1, \size
 393 do_8tap_h_func put, sharp,   2, \size
 394 do_8tap_h_func avg, sharp,   2, \size
 395 do_8tap_h_func put, smooth,  0, \size
 396 do_8tap_h_func avg, smooth,  0, \size
 397 .endm
 398
 399 do_8tap_h_filters 64
 400 do_8tap_h_filters 32
 401 do_8tap_h_filters 16
 402 do_8tap_h_filters 8
 403 do_8tap_h_filters 4
 404
 405
 406 // Vertical filters
 407
 408 // Round, shift and saturate and store reg1-reg2 over 4 lines
     // Each register carries two 4 pixel rows: reg1 holds output rows 0 and
     // 2, reg2 holds rows 1 and 3 (lane 0 and lane 1 respectively).
     // For type avg, the existing dst rows are read through x7 into
     // tmp1-tmp2 in the same layout and averaged in with urhadd.
     // Rows are written through x0; x0 and x7 are postincremented by x1.
 409 .macro do_store4 reg1, reg2, tmp1, tmp2, type
 410         sqrshrun        \reg1\().8b,  \reg1\().8h, #7
 411         sqrshrun        \reg2\().8b,  \reg2\().8h, #7
 412 .ifc \type,avg
 413         ld1             {\tmp1\().s}[0],  [x7], x1
 414         ld1             {\tmp2\().s}[0],  [x7], x1
 415         ld1             {\tmp1\().s}[1],  [x7], x1
 416         ld1             {\tmp2\().s}[1],  [x7], x1
 417         urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
 418         urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
 419 .endif
 420         st1             {\reg1\().s}[0],  [x0], x1
 421         st1             {\reg2\().s}[0],  [x0], x1
 422         st1             {\reg1\().s}[1],  [x0], x1
 423         st1             {\reg2\().s}[1],  [x0], x1
 424 .endm
 425
 426 // Round, shift and saturate and store reg1-4
     // reg1-reg4 each hold one 8 pixel output row as 16 bit sums; they are
     // narrowed in place to 8 bit with a rounding right shift by 7 and
     // unsigned saturation (sqrshrun).
     // For type avg, the existing dst rows are read through x7 into
     // tmp1-tmp4 and averaged in with urhadd.
     // Rows are written through x0; x0 and x7 are postincremented by x1.
 427 .macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
 428         sqrshrun        \reg1\().8b,  \reg1\().8h, #7
 429         sqrshrun        \reg2\().8b,  \reg2\().8h, #7
 430         sqrshrun        \reg3\().8b,  \reg3\().8h, #7
 431         sqrshrun        \reg4\().8b,  \reg4\().8h, #7
 432 .ifc \type,avg
 433         ld1             {\tmp1\().8b},  [x7], x1
 434         ld1             {\tmp2\().8b},  [x7], x1
 435         ld1             {\tmp3\().8b},  [x7], x1
 436         ld1             {\tmp4\().8b},  [x7], x1
 437         urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
 438         urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
 439         urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
 440         urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
 441 .endif
 442         st1             {\reg1\().8b},  [x0], x1
 443         st1             {\reg2\().8b},  [x0], x1
 444         st1             {\reg3\().8b},  [x0], x1
 445         st1             {\reg4\().8b},  [x0], x1
 446 .endm
 447
 448 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
 449 // (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
 450 // at the end with saturation. Indices 0 and 7 always have negative or zero
 451 // coefficients, so they can be accumulated into tmp1-tmp2 together with the
 452 // largest coefficient.
     // The coefficients are read from v0.h[0] to v0.h[7]; every register
     // argument is given without an element suffix and used as .8h.
     // idx1 and idx2 must be 3 and 4 in either order: the .if blocks below
     // pick which of the two middle taps goes into dst1-dst2 (idx1) and
     // which goes into tmp1-tmp2 (idx2).
 453 .macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
 454         mul             \dst1\().8h, \src2\().8h, v0.h[1]
 455         mul             \dst2\().8h, \src3\().8h, v0.h[1]
 456         mul             \tmp1\().8h, \src1\().8h, v0.h[0]
 457         mul             \tmp2\().8h, \src2\().8h, v0.h[0]
 458         mla             \dst1\().8h, \src3\().8h, v0.h[2]
 459         mla             \dst2\().8h, \src4\().8h, v0.h[2]
 460 .if \idx1 == 3
 461         mla             \dst1\().8h, \src4\().8h, v0.h[3]
 462         mla             \dst2\().8h, \src5\().8h, v0.h[3]
 463 .else
 464         mla             \dst1\().8h, \src5\().8h, v0.h[4]
 465         mla             \dst2\().8h, \src6\().8h, v0.h[4]
 466 .endif
 467         mla             \dst1\().8h, \src6\().8h, v0.h[5]
 468         mla             \dst2\().8h, \src7\().8h, v0.h[5]
 469         mla             \tmp1\().8h, \src8\().8h, v0.h[7]
 470         mla             \tmp2\().8h, \src9\().8h, v0.h[7]
 471         mla             \dst1\().8h, \src7\().8h, v0.h[6]
 472         mla             \dst2\().8h, \src8\().8h, v0.h[6]
 473 .if \idx2 == 3
 474         mla             \tmp1\().8h, \src4\().8h, v0.h[3]
 475         mla             \tmp2\().8h, \src5\().8h, v0.h[3]
 476 .else
 477         mla             \tmp1\().8h, \src5\().8h, v0.h[4]
 478         mla             \tmp2\().8h, \src6\().8h, v0.h[4]
 479 .endif
 480         sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
 481         sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
 482 .endm
 483
 484 // Load pixels and extend them to 16 bit
     // Loads three rows of 8 pixels from x2 (four if dst4 is given),
     // postincrementing x2 by x3 after each row, and zero extends them
     // into dst1-dst4. v1-v4 are used as staging registers and clobbered
     // (v4 only when dst4 is given).
 485 .macro loadl dst1, dst2, dst3, dst4
 486         ld1             {v1.8b}, [x2], x3
 487         ld1             {v2.8b}, [x2], x3
 488         ld1             {v3.8b}, [x2], x3
 489 .ifnb \dst4
 490         ld1             {v4.8b}, [x2], x3
 491 .endif
 492         uxtl            \dst1\().8h, v1.8b
 493         uxtl            \dst2\().8h, v2.8b
 494         uxtl            \dst3\().8h, v3.8b
 495 .ifnb \dst4
 496         uxtl            \dst4\().8h, v4.8b
 497 .endif
 498 .endm
 499
 500 // Instantiate a vertical filter function for filtering 8 pixels at a time.
 501 // The height is passed in x4, the width in x5 and the filter coefficients
 502 // in x6. idx2 is the index of the largest filter coefficient (3 or 4)
 503 // and idx1 is the other one of them.
     // The block is processed in 8 pixel wide columns (label 1), and each
     // column in groups of 4 output rows (label 2). The row loop is
     // unrolled three times so that the input rows can rotate through
     // v16-v27 without register moves. The height must be a nonzero
     // multiple of 4 and the width a nonzero multiple of 8, since both
     // counters only terminate on exactly zero.
     // x6 is reused as the row counter once the filter has been loaded
     // into v0; x7 is the dst read pointer for type avg.
 504 .macro do_8tap_8v type, idx1, idx2
 505 function \type\()_8tap_8v_\idx1\idx2
 506         sub             x2,  x2,  x3, lsl #1
 507         sub             x2,  x2,  x3                // in total, start 3 rows above the first output row
 508         ld1             {v0.8b},  [x6]
 509         sxtl            v0.8h,  v0.8b               // sign extend the 8 bit coefficients to 16 bit
 510 1:
 511 .ifc \type,avg
 512         mov             x7,  x0
 513 .endif
 514         mov             x6,  x4                     // x6 = rows left in this column
 515
             // Preload the first 7 input rows
 516         loadl           v17, v18, v19
 517
 518         loadl           v20, v21, v22, v23
 519 2:
 520         loadl           v24, v25, v26, v27
 521         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
 522         convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
 523         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 524
 525         subs            x6,  x6,  #4
 526         b.eq            8f
 527
 528         loadl           v16, v17, v18, v19
 529         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
 530         convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
 531         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 532
 533         subs            x6,  x6,  #4
 534         b.eq            8f
 535
 536         loadl           v20, v21, v22, v23
 537         convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
 538         convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
 539         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 540
 541         subs            x6,  x6,  #4
 542         b.ne            2b
 543
 544 8:
             // Column done: stop if no width is left, otherwise rewind
             // both pointers to the top and step 8 pixels to the right.
             // The column consumed h + 7 source rows (7 preloaded plus
             // 4 per group of output rows).
 545         subs            x5,  x5,  #8
 546         b.eq            9f
 547         // x0 -= h * dst_stride
 548         msub            x0,  x1,  x4, x0
 549         // x2 -= h * src_stride
 550         msub            x2,  x3,  x4, x2
 551         // x2 -= 8 * src_stride
 552         sub             x2,  x2,  x3, lsl #3
 553         // x2 += 1 * src_stride
 554         add             x2,  x2,  x3
 555         add             x2,  x2,  #8
 556         add             x0,  x0,  #8
 557         b               1b
 558 9:
 559         ret
 560 endfunc
 561 .endm
 562
 563 do_8tap_8v put, 3, 4
 564 do_8tap_8v put, 4, 3
 565 do_8tap_8v avg, 3, 4
 566 do_8tap_8v avg, 4, 3
 567
 568
 569 // Instantiate a vertical filter function for filtering a 4 pixels wide
 570 // slice. The first half of the registers contain one row, while the second
 571 // half of a register contains the second-next row (also stored in the first
 572 // half of the register two steps ahead). The convolution does two outputs
 573 // at a time; the output of v17-v24 into one, and v18-v25 into another one.
 574 // The first half of first output is the first output row, the first half
 575 // of the other output is the second output row. The second halves of the
 576 // registers are rows 3 and 4.
 577 // This is only designed to work for 4 or 8 output lines.
     // The height is passed in x4 and the filter coefficients in x6; x7 is
     // the dst read pointer for type avg. There is no horizontal loop.
 578 .macro do_8tap_4v type, idx1, idx2
 579 function \type\()_8tap_4v_\idx1\idx2
 580         sub             x2,  x2,  x3, lsl #1
 581         sub             x2,  x2,  x3                // in total, start 3 rows above the first output row
 582         ld1             {v0.8b},  [x6]
 583         sxtl            v0.8h,  v0.8b               // sign extend the 8 bit coefficients to 16 bit
 584 .ifc \type,avg
 585         mov             x7,  x0
 586 .endif
 587
             // Load 11 input rows of 4 bytes into lane 0 of v1-v7 and
             // v26-v29. Each trn1 pairs a row with the second-next one,
             // and uxtl widens the pairs into v17-v25.
 588         ld1             {v1.s}[0],  [x2], x3
 589         ld1             {v2.s}[0],  [x2], x3
 590         ld1             {v3.s}[0],  [x2], x3
 591         ld1             {v4.s}[0],  [x2], x3
 592         ld1             {v5.s}[0],  [x2], x3
 593         ld1             {v6.s}[0],  [x2], x3
 594         trn1            v1.2s,  v1.2s,  v3.2s
 595         ld1             {v7.s}[0],  [x2], x3
 596         trn1            v2.2s,  v2.2s,  v4.2s
 597         ld1             {v26.s}[0], [x2], x3
 598         uxtl            v17.8h, v1.8b
 599         trn1            v3.2s,  v3.2s,  v5.2s
 600         ld1             {v27.s}[0], [x2], x3
 601         uxtl            v18.8h, v2.8b
 602         trn1            v4.2s,  v4.2s,  v6.2s
 603         ld1             {v28.s}[0], [x2], x3
 604         uxtl            v19.8h, v3.8b
 605         trn1            v5.2s,  v5.2s,  v7.2s
 606         ld1             {v29.s}[0], [x2], x3
 607         uxtl            v20.8h, v4.8b
 608         trn1            v6.2s,  v6.2s,  v26.2s
 609         uxtl            v21.8h, v5.8b
 610         trn1            v7.2s,  v7.2s,  v27.2s
 611         uxtl            v22.8h, v6.8b
 612         trn1            v26.2s, v26.2s, v28.2s
 613         uxtl            v23.8h, v7.8b
 614         trn1            v27.2s, v27.2s, v29.2s
 615         uxtl            v24.8h, v26.8b
 616         uxtl            v25.8h, v27.8b
 617
 618         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
 619         do_store4       v1,  v2,  v5,  v6,  \type
 620
 621         subs            x4,  x4,  #4
 622         b.eq            9f
 623
             // Second group of 4 output rows: load 4 more input rows and
             // pair them up with the rows still held in v28/v29.
 624         ld1             {v1.s}[0],  [x2], x3
 625         ld1             {v2.s}[0],  [x2], x3
 626         trn1            v28.2s, v28.2s, v1.2s
 627         trn1            v29.2s, v29.2s, v2.2s
 628         ld1             {v1.s}[1],  [x2], x3
 629         uxtl            v26.8h, v28.8b
 630         ld1             {v2.s}[1],  [x2], x3
 631         uxtl            v27.8h, v29.8b
 632         uxtl            v28.8h, v1.8b
 633         uxtl            v29.8h, v2.8b
 634
 635         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
 636         do_store4       v1,  v2,  v5,  v6,  \type
 637
 638 9:
 639         ret
 640 endfunc
 641 .endm
 642
 643 do_8tap_4v put, 3, 4
 644 do_8tap_4v put, 4, 3
 645 do_8tap_4v avg, 3, 4
 646 do_8tap_4v avg, 4, 3
 647
 648
 649 .macro do_8tap_v_func type, filter, offset, size
     // Exported entry point for one vertical filter type and block width.
     // In (vp9_mc_func signature): x0 = dst, x1 = dst_stride, x2 = ref,
     // x3 = ref_stride, w4 = h, w6 = my (mx in w5 is ignored).
     // Computes x6 = ff_vp9_subpel_filters + 120*offset - 8 + 8*my, sets
     // x5 to the block width and tail-branches to the 8 or 4 pixel wide
     // core; my >= 8 selects the _34 variant, otherwise _43.
 650 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
 651         uxtw            x4,  w4                     // the 8 pixel core uses the height as a 64 bit value
 652         movrel          x5,  X(ff_vp9_subpel_filters), 120*\offset - 8
 653         cmp             w6,  #8                     // flags survive the add and mov below
 654         add             x6,  x5,  w6, uxtw #3
 655         mov             x5,  #\size
 656 .if \size >= 8
 657         b.ge            \type\()_8tap_8v_34
 658         b               \type\()_8tap_8v_43
 659 .else
 660         b.ge            \type\()_8tap_4v_34
 661         b               \type\()_8tap_4v_43
 662 .endif
 663 endfunc
 664 .endm
 665
 666 .macro do_8tap_v_filters size
     // Instantiate the exported put/avg vertical functions of one block
     // width for the three filter types. The third argument is the filter
     // bank index used in the table lookup: smooth = 0, regular = 1,
     // sharp = 2.
 667 do_8tap_v_func put, regular, 1, \size
 668 do_8tap_v_func avg, regular, 1, \size
 669 do_8tap_v_func put, sharp,   2, \size
 670 do_8tap_v_func avg, sharp,   2, \size
 671 do_8tap_v_func put, smooth,  0, \size
 672 do_8tap_v_func avg, smooth,  0, \size
 673 .endm
 674
 675 do_8tap_v_filters 64
 676 do_8tap_v_filters 32
 677 do_8tap_v_filters 16
 678 do_8tap_v_filters 8
 679 do_8tap_v_filters 4