git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9mc_neon.S

   1 /*
   2  * Copyright (c) 2016 Google Inc.
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/aarch64/asm.S"
  22
  23 // All public functions in this file have the following signature:
  24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
  25 //                            const uint8_t *ref, ptrdiff_t ref_stride,
  26 //                            int h, int mx, int my);
  27
  28 function ff_vp9_copy64_aarch64, export=1
     // Copy a 64 byte wide block from ref to dst, one row per iteration,
     // using general purpose register pairs only (no NEON registers).
     // Arguments follow the vp9_mc_func signature documented at the top of
     // this file: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride,
     // w4 = h. mx/my are ignored; x5/x6 are reused as scratch.
     // Clobbers x5-x12 and the flags. The loop only terminates when w4
     // reaches exactly zero, so h must be > 0 on entry.
  29 1:
  30         ldp             x5,  x6,  [x2]
  31         ldp             x7,  x8,  [x2, #16]
  32         stp             x5,  x6,  [x0]
  33         ldp             x9,  x10, [x2, #32]
  34         stp             x7,  x8,  [x0, #16]
             // Row counter is decremented in the middle of the copy; none of
             // the following loads/stores/adds modify the flags before b.ne.
  35         subs            w4,  w4,  #1
  36         ldp             x11, x12, [x2, #48]
  37         stp             x9,  x10, [x0, #32]
  38         stp             x11, x12, [x0, #48]
             // Advance both pointers to the next row
  39         add             x2,  x2,  x3
  40         add             x0,  x0,  x1
  41         b.ne            1b
  42         ret
  43 endfunc
  44
  45 function ff_vp9_avg64_neon, export=1
     // Average a 64 byte wide block from ref (x2) into dst (x0) using the
     // unsigned rounding halving add (urhadd), i.e. (dst + ref + 1) >> 1
     // per byte. Two rows are processed per iteration.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 2.
     // Clobbers x5, v0-v7, v16-v23 and the flags.
             // x5 = dst store pointer; x0 is used as the dst load pointer
             // and runs one row ahead of the stores.
  46         mov             x5,  x0
  47 1:
  48         ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
  49         ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
  50         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
  51         urhadd          v0.16b,  v0.16b,  v4.16b
  52         urhadd          v1.16b,  v1.16b,  v5.16b
  53         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
  54         urhadd          v2.16b,  v2.16b,  v6.16b
  55         urhadd          v3.16b,  v3.16b,  v7.16b
             // Two rows consumed per iteration
  56         subs            w4,  w4,  #2
  57         urhadd          v16.16b, v16.16b, v20.16b
  58         urhadd          v17.16b, v17.16b, v21.16b
  59         st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
  60         urhadd          v18.16b, v18.16b, v22.16b
  61         urhadd          v19.16b, v19.16b, v23.16b
  62         st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
  63         b.ne            1b
  64         ret
  65 endfunc
  66
  67 function ff_vp9_copy32_aarch64, export=1
     // Copy a 32 byte wide block from ref (x2) to dst (x0), one row per
     // iteration, using general purpose register pairs only.
     // x1 = dst_stride, x3 = ref_stride, w4 = h (must be > 0, since the
     // loop exits only when w4 reaches exactly zero).
     // Clobbers x5-x8 and the flags.
  68 1:
  69         ldp             x5,  x6,  [x2]
  70         ldp             x7,  x8,  [x2, #16]
  71         stp             x5,  x6,  [x0]
             // The store and adds below leave the flags untouched for b.ne
  72         subs            w4,  w4,  #1
  73         stp             x7,  x8,  [x0, #16]
  74         add             x2,  x2,  x3
  75         add             x0,  x0,  x1
  76         b.ne            1b
  77         ret
  78 endfunc
  79
  80 function ff_vp9_avg32_neon, export=1
     // Average a 32 byte wide block from ref (x2) into dst (x0) with
     // urhadd, i.e. (dst + ref + 1) >> 1 per byte, one row per iteration.
     // x1 = dst_stride, x3 = ref_stride, w4 = h (must be > 0, since the
     // loop exits only when w4 reaches exactly zero).
     // Clobbers v0-v3 and the flags.
  81 1:
  82         ld1             {v2.16b, v3.16b},  [x2], x3
             // dst row is loaded without post-increment; x0 is advanced by
             // the store below, so no separate store pointer is needed.
  83         ld1             {v0.16b, v1.16b},  [x0]
  84         urhadd          v0.16b,  v0.16b,  v2.16b
  85         urhadd          v1.16b,  v1.16b,  v3.16b
  86         subs            w4,  w4,  #1
  87         st1             {v0.16b, v1.16b},  [x0], x1
  88         b.ne            1b
  89         ret
  90 endfunc
  91
  92 function ff_vp9_copy16_neon, export=1
     // Copy a 16 byte wide block from ref (x2) to dst (x0), four rows per
     // iteration. Even and odd rows are handled through separate pointers
     // so that consecutive loads/stores do not depend on the same
     // post-incremented register.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 4.
     // Clobbers x5, x6, v0-v3 and the flags; x1 and x3 are doubled.
             // x5 = dst + dst_stride (odd rows), x1 = 2 * dst_stride
  93         add             x5,  x0,  x1
  94         lsl             x1,  x1,  #1
             // x6 = ref + ref_stride (odd rows), x3 = 2 * ref_stride
  95         add             x6,  x2,  x3
  96         lsl             x3,  x3,  #1
  97 1:
  98         ld1             {v0.16b},  [x2], x3
  99         ld1             {v1.16b},  [x6], x3
 100         ld1             {v2.16b},  [x2], x3
 101         ld1             {v3.16b},  [x6], x3
 102         subs            w4,  w4,  #4
 103         st1             {v0.16b},  [x0], x1
 104         st1             {v1.16b},  [x5], x1
 105         st1             {v2.16b},  [x0], x1
 106         st1             {v3.16b},  [x5], x1
 107         b.ne            1b
 108         ret
 109 endfunc
 110
 111 function ff_vp9_avg16_neon, export=1
     // Average a 16 byte wide block from ref (x2) into dst (x0) with
     // urhadd, i.e. (dst + ref + 1) >> 1 per byte, two rows per iteration.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 2.
     // Clobbers x5, v0-v3 and the flags.
             // x5 = dst store pointer; x0 is used as the dst load pointer
 112         mov             x5,  x0
 113 1:
 114         ld1             {v2.16b},  [x2], x3
 115         ld1             {v0.16b},  [x0], x1
 116         ld1             {v3.16b},  [x2], x3
 117         urhadd          v0.16b,  v0.16b,  v2.16b
 118         ld1             {v1.16b},  [x0], x1
 119         urhadd          v1.16b,  v1.16b,  v3.16b
 120         subs            w4,  w4,  #2
 121         st1             {v0.16b},  [x5], x1
 122         st1             {v1.16b},  [x5], x1
 123         b.ne            1b
 124         ret
 125 endfunc
 126
 127 function ff_vp9_copy8_neon, export=1
     // Copy an 8 byte wide block from ref (x2) to dst (x0), two rows per
     // iteration.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 2.
     // Clobbers v0, v1 and the flags.
 128 1:
 129         ld1             {v0.8b},  [x2], x3
 130         ld1             {v1.8b},  [x2], x3
 131         subs            w4,  w4,  #2
 132         st1             {v0.8b},  [x0], x1
 133         st1             {v1.8b},  [x0], x1
 134         b.ne            1b
 135         ret
 136 endfunc
 137
 138 function ff_vp9_avg8_neon, export=1
     // Average an 8 byte wide block from ref (x2) into dst (x0) with
     // urhadd, i.e. (dst + ref + 1) >> 1 per byte, two rows per iteration.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 2.
     // Clobbers x5, v0-v3 and the flags.
             // x5 = dst store pointer; x0 is used as the dst load pointer
 139         mov             x5,  x0
 140 1:
 141         ld1             {v2.8b},  [x2], x3
 142         ld1             {v0.8b},  [x0], x1
 143         ld1             {v3.8b},  [x2], x3
 144         urhadd          v0.8b,  v0.8b,  v2.8b
 145         ld1             {v1.8b},  [x0], x1
 146         urhadd          v1.8b,  v1.8b,  v3.8b
 147         subs            w4,  w4,  #2
 148         st1             {v0.8b},  [x5], x1
 149         st1             {v1.8b},  [x5], x1
 150         b.ne            1b
 151         ret
 152 endfunc
 153
 154 function ff_vp9_copy4_neon, export=1
     // Copy a 4 byte wide block from ref (x2) to dst (x0), four rows per
     // iteration, moving each row through 32 bit lane 0 of v0-v3.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 4.
     // Clobbers v0-v3 and the flags.
 155 1:
 156         ld1             {v0.s}[0], [x2], x3
 157         ld1             {v1.s}[0], [x2], x3
 158         st1             {v0.s}[0], [x0], x1
 159         ld1             {v2.s}[0], [x2], x3
 160         st1             {v1.s}[0], [x0], x1
 161         ld1             {v3.s}[0], [x2], x3
 162         subs            w4,  w4,  #4
 163         st1             {v2.s}[0], [x0], x1
 164         st1             {v3.s}[0], [x0], x1
 165         b.ne            1b
 166         ret
 167 endfunc
 168
 169 function ff_vp9_avg4_neon, export=1
     // Average a 4 byte wide block from ref (x2) into dst (x0) with
     // urhadd, i.e. (dst + ref + 1) >> 1 per byte, four rows per iteration.
     // Two 4 byte rows are packed into the two 32 bit lanes of one 64 bit
     // register, so only two urhadd instructions are needed for four rows.
     // x1 = dst_stride, x3 = ref_stride, w4 = h. The loop only terminates
     // when w4 reaches exactly zero, so h must be a positive multiple of 4.
     // Clobbers x5, v0-v3 and the flags.
             // x5 = dst store pointer; x0 is used as the dst load pointer
 170         mov             x5,  x0
 171 1:
             // v2 = ref rows 0-1, v0 = dst rows 0-1,
             // v3 = ref rows 2-3, v1 = dst rows 2-3
 172         ld1             {v2.s}[0], [x2], x3
 173         ld1             {v0.s}[0], [x0], x1
 174         ld1             {v2.s}[1], [x2], x3
 175         ld1             {v0.s}[1], [x0], x1
 176         ld1             {v3.s}[0], [x2], x3
 177         ld1             {v1.s}[0], [x0], x1
 178         ld1             {v3.s}[1], [x2], x3
 179         ld1             {v1.s}[1], [x0], x1
 180         subs            w4,  w4,  #4
 181         urhadd          v0.8b,  v0.8b,  v2.8b
 182         urhadd          v1.8b,  v1.8b,  v3.8b
 183         st1             {v0.s}[0], [x5], x1
 184         st1             {v0.s}[1], [x5], x1
 185         st1             {v1.s}[0], [x5], x1
 186         st1             {v1.s}[1], [x5], x1
 187         b.ne            1b
 188         ret
 189 endfunc
 190
 191
 192 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
 193 // for size >= 16), and multiply-accumulate into dst1 and dst3 (or
 194 // dst1-dst2 and dst3-dst4 for size >= 16)
     // The sources hold pixels widened to 16 bit, so the ext byte offset is
     // 2*offset; the multiplier is filter coefficient v0.h[offset].
     // src1-src3/dst1-dst2 belong to one row, src4-src6/dst3-dst4 to the
     // other row handled in parallel.
     // Clobbers v20 and v22 (and v21 and v23 for size >= 16) as temporaries.
 195 .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
 196         ext             v20.16b, \src1, \src2, #(2*\offset)
 197         ext             v22.16b, \src4, \src5, #(2*\offset)
 198 .if \size >= 16
             // ext and mla are interleaved so each mla uses a vector
             // extracted one instruction earlier
 199         mla             \dst1, v20.8h, v0.h[\offset]
 200         ext             v21.16b, \src2, \src3, #(2*\offset)
 201         mla             \dst3, v22.8h, v0.h[\offset]
 202         ext             v23.16b, \src5, \src6, #(2*\offset)
 203         mla             \dst2, v21.8h, v0.h[\offset]
 204         mla             \dst4, v23.8h, v0.h[\offset]
 205 .else
 206         mla             \dst1, v20.8h, v0.h[\offset]
 207         mla             \dst3, v22.8h, v0.h[\offset]
 208 .endif
 209 .endm
 210 // The same as above, but don't accumulate straight into the
 211 // destination, but use a temp register and accumulate with saturation.
     // The product with v0.h[offset] is formed in v20/v22 (and v21/v23 for
     // size >= 16) and then added to the destinations with sqadd (signed
     // saturating add), so the final sum clamps instead of wrapping.
     // Clobbers v20 and v22 (and v21 and v23 for size >= 16).
 212 .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
 213         ext             v20.16b, \src1, \src2, #(2*\offset)
 214         ext             v22.16b, \src4, \src5, #(2*\offset)
 215 .if \size >= 16
 216         mul             v20.8h, v20.8h, v0.h[\offset]
 217         ext             v21.16b, \src2, \src3, #(2*\offset)
 218         mul             v22.8h, v22.8h, v0.h[\offset]
 219         ext             v23.16b, \src5, \src6, #(2*\offset)
 220         mul             v21.8h, v21.8h, v0.h[\offset]
 221         mul             v23.8h, v23.8h, v0.h[\offset]
 222 .else
 223         mul             v20.8h, v20.8h, v0.h[\offset]
 224         mul             v22.8h, v22.8h, v0.h[\offset]
 225 .endif
 226         sqadd           \dst1, \dst1, v20.8h
 227         sqadd           \dst3, \dst3, v22.8h
 228 .if \size >= 16
 229         sqadd           \dst2, \dst2, v21.8h
 230         sqadd           \dst4, \dst4, v23.8h
 231 .endif
 232 .endm
 233
 234
 235 // Instantiate a horizontal filter function for the given size.
 236 // This can work on 4, 8 or 16 pixels in parallel; for larger
 237 // widths it will do 16 pixels at a time and loop horizontally.
 238 // The actual width is passed in x5, the height in w4 and the
 239 // filter coefficients in x9. idx2 is the index of the largest
 240 // filter coefficient (3 or 4) and idx1 is the other one of them.
     // x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride as in the
     // public functions. Two rows are filtered per vertical iteration, so
     // the height must be a positive multiple of 2 for the loop to
     // terminate. Register use inside the function:
     //   x6/x7   dst/src pointers for the second row
     //   x9      filter pointer on entry, then (size >= 16) the horizontal
     //           pixel counter
     //   v0      the eight 16 bit filter coefficients
     //   v4-v6   first row source pixels widened to 16 bit
     //   v16-v18 second row source pixels widened to 16 bit
     //   v1/v2, v24/v25 accumulators for the first/second row
     //   v20-v23 temporaries of extmla/extmulqadd
 241 .macro do_8tap_h type, size, idx1, idx2
 242 function \type\()_8tap_\size\()h_\idx1\idx2
             // The 8 taps cover source pixels x-3 .. x+4
 243         sub             x2,  x2,  #3
             // Second row pointers, then double both strides
 244         add             x6,  x0,  x1
 245         add             x7,  x2,  x3
 246         add             x1,  x1,  x1
 247         add             x3,  x3,  x3
 248         // Only size >= 16 loops horizontally and needs
 249         // reduced dst stride
 250 .if \size >= 16
 251         sub             x1,  x1,  x5
 252 .endif
 253         // size >= 16 loads two qwords and increments x2,
 254         // for size 4/8 it's enough with one qword and no
 255         // postincrement
             // Over one row x2/x7 advance by 24 bytes for the first load plus
             // 16 bytes for every further 16 pixel chunk, i.e. by width + 8.
 256 .if \size >= 16
 257         sub             x3,  x3,  x5
 258         sub             x3,  x3,  #8
 259 .endif
 260         // Load the filter vector
 261         ld1             {v0.8h},  [x9]
 262 1:
             // x9 is free after the filter load; use it as the number of
             // pixels left in the current pair of rows
 263 .if \size >= 16
 264         mov             x9,  x5
 265 .endif
 266         // Load src
 267 .if \size >= 16
 268         ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
 269         ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
 270 .else
             // 16 source bytes per row are loaded even for size 4
 271         ld1             {v4.8b,  v5.8b},  [x2]
 272         ld1             {v16.8b, v17.8b}, [x7]
 273 .endif
 274         uxtl            v4.8h,  v4.8b
 275         uxtl            v5.8h,  v5.8b
 276         uxtl            v16.8h, v16.8b
 277         uxtl            v17.8h, v17.8b
 278 .if \size >= 16
 279         uxtl            v6.8h,  v6.8b
 280         uxtl            v18.8h, v18.8b
 281 .endif
 282 2:
 283
 284         // Accumulate, adding idx2 last with a separate
 285         // saturating add. The positive filter coefficients
 286         // for all indices except idx2 must add up to less
 287         // than 127 for this not to overflow.
 288         mul             v1.8h,  v4.8h,  v0.h[0]
 289         mul             v24.8h, v16.8h, v0.h[0]
 290 .if \size >= 16
 291         mul             v2.8h,  v5.8h,  v0.h[0]
 292         mul             v25.8h, v17.8h, v0.h[0]
 293 .endif
 294         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
 295         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
 296         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
 297         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
 298         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
 299         extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
 300         extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
 301
 302         // Round, shift and saturate
 303         sqrshrun        v1.8b,   v1.8h,  #7
 304         sqrshrun        v24.8b,  v24.8h, #7
 305 .if \size >= 16
 306         sqrshrun2       v1.16b,  v2.8h,  #7
 307         sqrshrun2       v24.16b, v25.8h, #7
 308 .endif
 309         // Average
             // (avg variants only: rounding average with the existing dst)
 310 .ifc \type,avg
 311 .if \size >= 16
 312         ld1             {v2.16b}, [x0]
 313         ld1             {v3.16b}, [x6]
 314         urhadd          v1.16b,  v1.16b,  v2.16b
 315         urhadd          v24.16b, v24.16b, v3.16b
 316 .elseif \size == 8
 317         ld1             {v2.8b},  [x0]
 318         ld1             {v3.8b},  [x6]
 319         urhadd          v1.8b,  v1.8b,  v2.8b
 320         urhadd          v24.8b, v24.8b, v3.8b
 321 .else
 322         ld1             {v2.s}[0], [x0]
 323         ld1             {v3.s}[0], [x6]
 324         urhadd          v1.8b,  v1.8b,  v2.8b
 325         urhadd          v24.8b, v24.8b, v3.8b
 326 .endif
 327 .endif
 328         // Store and loop horizontally (for size >= 16)
 329 .if \size >= 16
 330         subs            x9,  x9,  #16
 331         st1             {v1.16b},  [x0], #16
 332         st1             {v24.16b}, [x6], #16
             // Row pair finished once all pixels have been written; the
             // stores above do not modify the flags set by subs.
 333         b.eq            3f
             // Slide the window: the last 8 widened pixels become the first
             // 8, then load and widen the next 16 pixels of each row
 334         mov             v4.16b,  v6.16b
 335         mov             v16.16b, v18.16b
 336         ld1             {v6.16b},  [x2], #16
 337         ld1             {v18.16b}, [x7], #16
 338         uxtl            v5.8h,  v6.8b
 339         uxtl2           v6.8h,  v6.16b
 340         uxtl            v17.8h, v18.8b
 341         uxtl2           v18.8h, v18.16b
 342         b               2b
 343 .elseif \size == 8
 344         st1             {v1.8b},    [x0]
 345         st1             {v24.8b},   [x6]
 346 .else // \size == 4
 347         st1             {v1.s}[0],  [x0]
 348         st1             {v24.s}[0], [x6]
 349 .endif
 350 3:
 351         // Loop vertically
 352         add             x0,  x0,  x1
 353         add             x6,  x6,  x1
 354         add             x2,  x2,  x3
 355         add             x7,  x7,  x3
 356         subs            w4,  w4,  #2
 357         b.ne            1b
 358         ret
 359 endfunc
 360 .endm
 361
 362 .macro do_8tap_h_size size
     // Instantiate the put and avg horizontal filter workers for one size,
     // each in two variants: largest coefficient at index 4 (idx1 = 3,
     // idx2 = 4) and largest coefficient at index 3 (idx1 = 4, idx2 = 3).
 363 do_8tap_h put, \size, 3, 4
 364 do_8tap_h avg, \size, 3, 4
 365 do_8tap_h put, \size, 4, 3
 366 do_8tap_h avg, \size, 4, 3
 367 .endm
 368
     // Only sizes 4, 8 and 16 get their own worker; the 16 variant loops
     // horizontally and is also used for larger widths.
 369 do_8tap_h_size 4
 370 do_8tap_h_size 8
 371 do_8tap_h_size 16
 372
 373 .macro do_8tap_h_func type, filter, offset, size
     // Public entry point for one horizontal filter type/size combination
     // (vp9_mc_func signature, mx in w5). It sets up the registers that
     // the shared worker expects and tail-branches into it:
     //   x9 = ff_vp9_subpel_filters + 256*offset + 16*mx
     //        (address of the eight 16 bit coefficients for this mx)
     //   x5 = block width (\size), replacing mx
     // mx >= 8 selects the _34 worker (largest coefficient at index 4),
     // smaller mx the _43 worker (largest coefficient at index 3).
     // Clobbers x6 and the flags in addition to what the worker clobbers.
 374 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
 375         movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
             // The add and mov below do not modify the flags, so the result
             // of this comparison is still valid at the conditional branch.
 376         cmp             w5,  #8
 377         add             x9,  x6,  w5, uxtw #4
 378         mov             x5,  #\size
             // Widths of 16 and above all share the 16 pixel worker, which
             // loops horizontally using the width in x5.
 379 .if \size >= 16
 380         b.ge            \type\()_8tap_16h_34
 381         b               \type\()_8tap_16h_43
 382 .else
 383         b.ge            \type\()_8tap_\size\()h_34
 384         b               \type\()_8tap_\size\()h_43
 385 .endif
 386 endfunc
 387 .endm
 388
 389 .macro do_8tap_h_filters size
     // Emit the exported put/avg horizontal entry points for one block
     // width, for each of the three filter types. The numeric argument is
     // the filter's index into ff_vp9_subpel_filters (in units of 256
     // bytes): smooth = 0, regular = 1, sharp = 2.
 390 do_8tap_h_func put, regular, 1, \size
 391 do_8tap_h_func avg, regular, 1, \size
 392 do_8tap_h_func put, sharp,   2, \size
 393 do_8tap_h_func avg, sharp,   2, \size
 394 do_8tap_h_func put, smooth,  0, \size
 395 do_8tap_h_func avg, smooth,  0, \size
 396 .endm
 397
     // Exported horizontal functions for all supported block widths
 398 do_8tap_h_filters 64
 399 do_8tap_h_filters 32
 400 do_8tap_h_filters 16
 401 do_8tap_h_filters 8
 402 do_8tap_h_filters 4
 403
 404
 405 // Vertical filters
 406
 407 // Round, shift and saturate and store reg1-reg2 over 4 lines
     // reg1 holds output rows 0 and 2 in its low/high halves, reg2 holds
     // rows 1 and 3; after narrowing they sit in 32 bit lanes 0 and 1.
     // Rows are written through x0, post-incremented by the stride in x1.
     // For type == avg the existing dst rows are first read through x7
     // (also post-incremented by x1) into tmp1/tmp2 using the same lane
     // layout, and averaged in with urhadd.
 408 .macro do_store4 reg1, reg2, tmp1, tmp2, type
 409         sqrshrun        \reg1\().8b,  \reg1\().8h, #7
 410         sqrshrun        \reg2\().8b,  \reg2\().8h, #7
 411 .ifc \type,avg
 412         ld1             {\tmp1\().s}[0],  [x7], x1
 413         ld1             {\tmp2\().s}[0],  [x7], x1
 414         ld1             {\tmp1\().s}[1],  [x7], x1
 415         ld1             {\tmp2\().s}[1],  [x7], x1
 416         urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
 417         urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
 418 .endif
 419         st1             {\reg1\().s}[0],  [x0], x1
 420         st1             {\reg2\().s}[0],  [x0], x1
 421         st1             {\reg1\().s}[1],  [x0], x1
 422         st1             {\reg2\().s}[1],  [x0], x1
 423 .endm
 424
 425 // Round, shift and saturate and store reg1-4
     // Each of reg1-reg4 holds one 8 pixel wide output row as 16 bit sums.
     // Rows are written through x0, post-incremented by the stride in x1.
     // For type == avg the existing dst rows are first read through x7
     // (also post-incremented by x1) into tmp1-tmp4 and averaged in with
     // urhadd; tmp1-tmp4 are untouched for other types.
 426 .macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
 427         sqrshrun        \reg1\().8b,  \reg1\().8h, #7
 428         sqrshrun        \reg2\().8b,  \reg2\().8h, #7
 429         sqrshrun        \reg3\().8b,  \reg3\().8h, #7
 430         sqrshrun        \reg4\().8b,  \reg4\().8h, #7
 431 .ifc \type,avg
 432         ld1             {\tmp1\().8b},  [x7], x1
 433         ld1             {\tmp2\().8b},  [x7], x1
 434         ld1             {\tmp3\().8b},  [x7], x1
 435         ld1             {\tmp4\().8b},  [x7], x1
 436         urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
 437         urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
 438         urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
 439         urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
 440 .endif
 441         st1             {\reg1\().8b},  [x0], x1
 442         st1             {\reg2\().8b},  [x0], x1
 443         st1             {\reg3\().8b},  [x0], x1
 444         st1             {\reg4\().8b},  [x0], x1
 445 .endm
 446
 447 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
 448 // (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
 449 // at the end with saturation. Indices 0 and 7 always have negative or zero
 450 // coefficients, so they can be accumulated into tmp1-tmp2 together with the
 451 // largest coefficient.
     // All register arguments are given without element suffix and are
     // used as 8 x 16 bit vectors; the coefficients are read from v0.h[0-7].
     // dst1/dst2 collect taps 1, 2, idx1, 5 and 6; tmp1/tmp2 collect taps
     // 0, 7 and idx2. The two partial sums are combined with sqadd.
     // Clobbers tmp1 and tmp2; the src registers are only read.
 452 .macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
 453         mul             \dst1\().8h, \src2\().8h, v0.h[1]
 454         mul             \dst2\().8h, \src3\().8h, v0.h[1]
 455         mul             \tmp1\().8h, \src1\().8h, v0.h[0]
 456         mul             \tmp2\().8h, \src2\().8h, v0.h[0]
 457         mla             \dst1\().8h, \src3\().8h, v0.h[2]
 458         mla             \dst2\().8h, \src4\().8h, v0.h[2]
             // The smaller of the two centre taps goes into the main sum
 459 .if \idx1 == 3
 460         mla             \dst1\().8h, \src4\().8h, v0.h[3]
 461         mla             \dst2\().8h, \src5\().8h, v0.h[3]
 462 .else
 463         mla             \dst1\().8h, \src5\().8h, v0.h[4]
 464         mla             \dst2\().8h, \src6\().8h, v0.h[4]
 465 .endif
 466         mla             \dst1\().8h, \src6\().8h, v0.h[5]
 467         mla             \dst2\().8h, \src7\().8h, v0.h[5]
 468         mla             \tmp1\().8h, \src8\().8h, v0.h[7]
 469         mla             \tmp2\().8h, \src9\().8h, v0.h[7]
 470         mla             \dst1\().8h, \src7\().8h, v0.h[6]
 471         mla             \dst2\().8h, \src8\().8h, v0.h[6]
             // The largest centre tap goes into the temporary sum
 472 .if \idx2 == 3
 473         mla             \tmp1\().8h, \src4\().8h, v0.h[3]
 474         mla             \tmp2\().8h, \src5\().8h, v0.h[3]
 475 .else
 476         mla             \tmp1\().8h, \src5\().8h, v0.h[4]
 477         mla             \tmp2\().8h, \src6\().8h, v0.h[4]
 478 .endif
 479         sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
 480         sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
 481 .endm
 482
 483 // Load pixels and extend them to 16 bit
     // Reads three rows of 8 pixels (four if dst4 is given) through x2,
     // post-incrementing x2 by the stride in x3 after each row, and
     // zero-extends them into dst1-dst4 (given without element suffix).
     // Clobbers v1-v3 (and v4 if dst4 is given) as load temporaries.
 484 .macro loadl dst1, dst2, dst3, dst4
 485         ld1             {v1.8b}, [x2], x3
 486         ld1             {v2.8b}, [x2], x3
 487         ld1             {v3.8b}, [x2], x3
 488 .ifnb \dst4
 489         ld1             {v4.8b}, [x2], x3
 490 .endif
 491         uxtl            \dst1\().8h, v1.8b
 492         uxtl            \dst2\().8h, v2.8b
 493         uxtl            \dst3\().8h, v3.8b
 494 .ifnb \dst4
 495         uxtl            \dst4\().8h, v4.8b
 496 .endif
 497 .endm
 498
 499 // Instantiate a vertical filter function for filtering 8 pixels at a time.
 500 // The height is passed in x4, the width in x5 and the filter coefficients
 501 // in x6. idx2 is the index of the largest filter coefficient (3 or 4)
 502 // and idx1 is the other one of them.
     // x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride. The block is
     // processed in 8 pixel wide columns (outer loop at 1:), each column
     // four rows at a time (inner loop at 2:), so the width must be a
     // positive multiple of 8 and the height a positive multiple of 4.
     // The inner loop is unrolled three times so that the 12 row registers
     // v16-v27 can be rotated by renaming instead of moving data.
     // v1-v4 are the accumulators (and loadl temporaries), v5/v6 the
     // convolve temporaries; v5, v6, v7 and v28 receive the old dst rows
     // in the avg variant.
 503 .macro do_8tap_8v type, idx1, idx2
 504 function \type\()_8tap_8v_\idx1\idx2
             // Start 3 rows above the first output row
 505         sub             x2,  x2,  x3, lsl #1
 506         sub             x2,  x2,  x3
 507         ld1             {v0.8h},  [x6]
 508 1:
             // x7 = dst read pointer for averaging (avg only)
 509 .ifc \type,avg
 510         mov             x7,  x0
 511 .endif
             // x6 is free after the filter load; use it as the row counter
             // for this column
 512         mov             x6,  x4
 513
             // Preload the 7 rows of history needed before the first output
 514         loadl           v17, v18, v19
 515
 516         loadl           v20, v21, v22, v23
 517 2:
 518         loadl           v24, v25, v26, v27
 519         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
 520         convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
 521         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 522
 523         subs            x6,  x6,  #4
 524         b.eq            8f
 525
 526         loadl           v16, v17, v18, v19
 527         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
 528         convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
 529         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 530
 531         subs            x6,  x6,  #4
 532         b.eq            8f
 533
 534         loadl           v20, v21, v22, v23
 535         convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
 536         convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
 537         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 538
 539         subs            x6,  x6,  #4
 540         b.ne            2b
 541
 542 8:
             // Column done; stop if that was the last 8 pixel column
 543         subs            x5,  x5,  #8
 544         b.eq            9f
             // Rewind to the top of the block. h + 7 source rows were read
             // for this column, hence the net x2 -= (h + 7) * src_stride.
 545         // x0 -= h * dst_stride
 546         msub            x0,  x1,  x4, x0
 547         // x2 -= h * src_stride
 548         msub            x2,  x3,  x4, x2
 549         // x2 -= 8 * src_stride
 550         sub             x2,  x2,  x3, lsl #3
 551         // x2 += 1 * src_stride
 552         add             x2,  x2,  x3
             // Step 8 pixels to the right for the next column
 553         add             x2,  x2,  #8
 554         add             x0,  x0,  #8
 555         b               1b
 556 9:
 557         ret
 558 endfunc
 559 .endm
 560
     // put/avg workers for both positions of the largest coefficient
 561 do_8tap_8v put, 3, 4
 562 do_8tap_8v put, 4, 3
 563 do_8tap_8v avg, 3, 4
 564 do_8tap_8v avg, 4, 3
 565
 566
 567 // Instantiate a vertical filter function for filtering a 4 pixels wide
 568 // slice. The first half of the registers contain one row, while the second
 569 // half of a register contains the second-next row (also stored in the first
 570 // half of the register two steps ahead). The convolution does two outputs
 571 // at a time; the output of v17-v24 into one, and v18-v25 into another one.
 572 // The first half of first output is the first output row, the first half
 573 // of the other output is the second output row. The second halves of the
 574 // registers are rows 3 and 4.
 575 // This is only designed to work for 4 or 8 output lines.
     // x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride, x4 = height,
     // x6 = filter coefficients. There is no loop: the first four output
     // rows are always produced, and four more only if x4 - 4 is non-zero.
 576 .macro do_8tap_4v type, idx1, idx2
 577 function \type\()_8tap_4v_\idx1\idx2
             // Start 3 rows above the first output row
 578         sub             x2,  x2,  x3, lsl #1
 579         sub             x2,  x2,  x3
 580         ld1             {v0.8h},  [x6]
             // x7 = dst read pointer for averaging (avg only)
 581 .ifc \type,avg
 582         mov             x7,  x0
 583 .endif
 584
             // Load source rows 0-10 into lane 0 of v1-v7, v26-v29. Each
             // trn1 pairs row n (low half) with row n+2 (high half); it must
             // run before its second operand is itself overwritten by the
             // next trn1. The uxtl widens each pair into v17-v25.
 585         ld1             {v1.s}[0],  [x2], x3
 586         ld1             {v2.s}[0],  [x2], x3
 587         ld1             {v3.s}[0],  [x2], x3
 588         ld1             {v4.s}[0],  [x2], x3
 589         ld1             {v5.s}[0],  [x2], x3
 590         ld1             {v6.s}[0],  [x2], x3
 591         trn1            v1.2s,  v1.2s,  v3.2s
 592         ld1             {v7.s}[0],  [x2], x3
 593         trn1            v2.2s,  v2.2s,  v4.2s
 594         ld1             {v26.s}[0], [x2], x3
 595         uxtl            v17.8h, v1.8b
 596         trn1            v3.2s,  v3.2s,  v5.2s
 597         ld1             {v27.s}[0], [x2], x3
 598         uxtl            v18.8h, v2.8b
 599         trn1            v4.2s,  v4.2s,  v6.2s
 600         ld1             {v28.s}[0], [x2], x3
 601         uxtl            v19.8h, v3.8b
 602         trn1            v5.2s,  v5.2s,  v7.2s
 603         ld1             {v29.s}[0], [x2], x3
 604         uxtl            v20.8h, v4.8b
 605         trn1            v6.2s,  v6.2s,  v26.2s
 606         uxtl            v21.8h, v5.8b
 607         trn1            v7.2s,  v7.2s,  v27.2s
 608         uxtl            v22.8h, v6.8b
 609         trn1            v26.2s, v26.2s, v28.2s
 610         uxtl            v23.8h, v7.8b
 611         trn1            v27.2s, v27.2s, v29.2s
 612         uxtl            v24.8h, v26.8b
 613         uxtl            v25.8h, v27.8b
 614
             // v1 = output rows 0 and 2, v2 = output rows 1 and 3
 615         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
 616         do_store4       v1,  v2,  v5,  v6,  \type
 617
 618         subs            x4,  x4,  #4
 619         b.eq            9f
 620
             // Load source rows 11-14 and build the remaining row pairs:
             // v26 = rows 9/11, v27 = rows 10/12, v28 = rows 11/13,
             // v29 = rows 12/14. v21-v25 are reused from the first half.
 621         ld1             {v1.s}[0],  [x2], x3
 622         ld1             {v2.s}[0],  [x2], x3
 623         trn1            v28.2s, v28.2s, v1.2s
 624         trn1            v29.2s, v29.2s, v2.2s
 625         ld1             {v1.s}[1],  [x2], x3
 626         uxtl            v26.8h, v28.8b
 627         ld1             {v2.s}[1],  [x2], x3
 628         uxtl            v27.8h, v29.8b
 629         uxtl            v28.8h, v1.8b
 630         uxtl            v29.8h, v2.8b
 631
             // v1 = output rows 4 and 6, v2 = output rows 5 and 7
 632         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
 633         do_store4       v1,  v2,  v5,  v6,  \type
 634
 635 9:
 636         ret
 637 endfunc
 638 .endm
 639
     // put/avg workers for both positions of the largest coefficient
 640 do_8tap_4v put, 3, 4
 641 do_8tap_4v put, 4, 3
 642 do_8tap_4v avg, 3, 4
 643 do_8tap_4v avg, 4, 3
 644
 645
 646 .macro do_8tap_v_func type, filter, offset, size
     // Public entry point for one vertical filter type/size combination
     // (vp9_mc_func signature, my in w6). It sets up the registers that
     // the shared worker expects and tail-branches into it:
     //   x4 = h zero-extended to 64 bit
     //   x6 = ff_vp9_subpel_filters + 256*offset + 16*my
     //        (address of the eight 16 bit coefficients for this my)
     //   x5 = block width (\size), replacing mx
     // my >= 8 selects the _34 worker (largest coefficient at index 4),
     // smaller my the _43 worker (largest coefficient at index 3).
 647 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
             // The 8v worker uses x4 in 64 bit arithmetic (msub), so the
             // upper half of the register must be cleared
 648         uxtw            x4,  w4
 649         movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
             // The add and mov below do not modify the flags, so the result
             // of this comparison is still valid at the conditional branch.
 650         cmp             w6,  #8
 651         add             x6,  x5,  w6, uxtw #4
 652         mov             x5,  #\size
             // Widths of 8 and above share the 8 pixel worker, which loops
             // over 8 pixel columns using the width in x5.
 653 .if \size >= 8
 654         b.ge            \type\()_8tap_8v_34
 655         b               \type\()_8tap_8v_43
 656 .else
 657         b.ge            \type\()_8tap_4v_34
 658         b               \type\()_8tap_4v_43
 659 .endif
 660 endfunc
 661 .endm
 662
 663 .macro do_8tap_v_filters size
     // Emit the exported put/avg vertical entry points for one block
     // width, for each of the three filter types. The numeric argument is
     // the filter's index into ff_vp9_subpel_filters (in units of 256
     // bytes): smooth = 0, regular = 1, sharp = 2.
 664 do_8tap_v_func put, regular, 1, \size
 665 do_8tap_v_func avg, regular, 1, \size
 666 do_8tap_v_func put, sharp,   2, \size
 667 do_8tap_v_func avg, sharp,   2, \size
 668 do_8tap_v_func put, smooth,  0, \size
 669 do_8tap_v_func avg, smooth,  0, \size
 670 .endm
 671
     // Exported vertical functions for all supported block widths
 672 do_8tap_v_filters 64
 673 do_8tap_v_filters 32
 674 do_8tap_v_filters 16
 675 do_8tap_v_filters 8
 676 do_8tap_v_filters 4