git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9mc_neon.S

   1 /*
   2  * Copyright (c) 2016 Google Inc.
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/aarch64/asm.S"
  22
  23 // All public functions in this file have the following signature:
  24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
  25 //                            const uint8_t *ref, ptrdiff_t ref_stride,
  26 //                            int h, int mx, int my);
  27
  28 function ff_vp9_copy64_aarch64, export=1
             // Copy a 64 byte wide block from ref to dst, one row per loop
             // iteration, using only general purpose register pairs (x5-x12).
             // In (per the vp9_mc_func signature above): x0 = dst,
             // x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // The row counter is only tested after a row has been copied,
             // so h must be at least 1.
  29 1:
             // Loads and stores of the four 16 byte pairs are interleaved.
  30         ldp             x5,  x6,  [x2]
  31         ldp             x7,  x8,  [x2, #16]
  32         stp             x5,  x6,  [x0]
  33         ldp             x9,  x10, [x2, #32]
  34         stp             x7,  x8,  [x0, #16]
             // Decrement the row counter early; none of the following
             // instructions modify the flags before b.ne reads them.
  35         subs            w4,  w4,  #1
  36         ldp             x11, x12, [x2, #48]
  37         stp             x9,  x10, [x0, #32]
  38         stp             x11, x12, [x0, #48]
             // Advance both pointers to the next row.
  39         add             x2,  x2,  x3
  40         add             x0,  x0,  x1
  41         b.ne            1b
  42         ret
  43 endfunc
  44
  45 function ff_vp9_avg64_neon, export=1
             // Average a 64 byte wide block of ref into dst with urhadd
             // (unsigned rounding halving add), two rows per loop iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h is decremented by 2 and tested for zero, so it must be a
             // nonzero multiple of 2.
             // x0 is used as the dst read pointer, x5 as the dst write pointer.
  46         mov             x5,  x0
  47 1:
             // Row 0: ref in v4-v7, dst in v0-v3.
  48         ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
  49         ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
             // Row 1: ref in v20-v23, dst in v16-v19.
  50         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
  51         urhadd          v0.16b,  v0.16b,  v4.16b
  52         urhadd          v1.16b,  v1.16b,  v5.16b
  53         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
  54         urhadd          v2.16b,  v2.16b,  v6.16b
  55         urhadd          v3.16b,  v3.16b,  v7.16b
  56         subs            w4,  w4,  #2
  57         urhadd          v16.16b, v16.16b, v20.16b
  58         urhadd          v17.16b, v17.16b, v21.16b
             // Write back row 0, then row 1, through the write pointer.
  59         st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
  60         urhadd          v18.16b, v18.16b, v22.16b
  61         urhadd          v19.16b, v19.16b, v23.16b
  62         st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
  63         b.ne            1b
  64         ret
  65 endfunc
  66
  67 function ff_vp9_copy32_aarch64, export=1
             // Copy a 32 byte wide block from ref to dst, one row per loop
             // iteration, using general purpose register pairs (x5-x8).
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be at least 1 (the counter is tested after the copy).
  68 1:
  69         ldp             x5,  x6,  [x2]
  70         ldp             x7,  x8,  [x2, #16]
  71         stp             x5,  x6,  [x0]
  72         subs            w4,  w4,  #1
  73         stp             x7,  x8,  [x0, #16]
             // Advance both pointers to the next row.
  74         add             x2,  x2,  x3
  75         add             x0,  x0,  x1
  76         b.ne            1b
  77         ret
  78 endfunc
  79
  80 function ff_vp9_avg32_neon, export=1
             // Average a 32 byte wide block of ref into dst with urhadd
             // (unsigned rounding halving add), one row per loop iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be at least 1.
  81 1:
  82         ld1             {v2.16b, v3.16b},  [x2], x3
             // dst is read without post-increment; the store below writes the
             // same row and then advances x0 by dst_stride.
  83         ld1             {v0.16b, v1.16b},  [x0]
  84         urhadd          v0.16b,  v0.16b,  v2.16b
  85         urhadd          v1.16b,  v1.16b,  v3.16b
  86         subs            w4,  w4,  #1
  87         st1             {v0.16b, v1.16b},  [x0], x1
  88         b.ne            1b
  89         ret
  90 endfunc
  91
  92 function ff_vp9_copy16_neon, export=1
             // Copy a 16 byte wide block from ref to dst, four rows per loop
             // iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h is decremented by 4 and tested for zero, so it must be a
             // nonzero multiple of 4.
             // x5/x6 point one row below x0/x2, and both strides are doubled,
             // so x0/x2 walk rows 0, 2, ... while x5/x6 walk rows 1, 3, ...
  93         add             x5,  x0,  x1
  94         lsl             x1,  x1,  #1
  95         add             x6,  x2,  x3
  96         lsl             x3,  x3,  #1
  97 1:
  98         ld1             {v0.16b},  [x2], x3
  99         ld1             {v1.16b},  [x6], x3
 100         ld1             {v2.16b},  [x2], x3
 101         ld1             {v3.16b},  [x6], x3
 102         subs            w4,  w4,  #4
 103         st1             {v0.16b},  [x0], x1
 104         st1             {v1.16b},  [x5], x1
 105         st1             {v2.16b},  [x0], x1
 106         st1             {v3.16b},  [x5], x1
 107         b.ne            1b
 108         ret
 109 endfunc
 110
 111 function ff_vp9_avg16_neon, export=1
             // Average a 16 byte wide block of ref into dst with urhadd
             // (unsigned rounding halving add), two rows per loop iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be a nonzero multiple of 2.
             // x0 is used as the dst read pointer, x5 as the dst write pointer.
 112         mov             x5,  x0
 113 1:
 114         ld1             {v2.16b},  [x2], x3
 115         ld1             {v0.16b},  [x0], x1
 116         ld1             {v3.16b},  [x2], x3
 117         urhadd          v0.16b,  v0.16b,  v2.16b
 118         ld1             {v1.16b},  [x0], x1
 119         urhadd          v1.16b,  v1.16b,  v3.16b
 120         subs            w4,  w4,  #2
 121         st1             {v0.16b},  [x5], x1
 122         st1             {v1.16b},  [x5], x1
 123         b.ne            1b
 124         ret
 125 endfunc
 126
 127 function ff_vp9_copy8_neon, export=1
             // Copy an 8 byte wide block from ref to dst, two rows per loop
             // iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be a nonzero multiple of 2.
 128 1:
 129         ld1             {v0.8b},  [x2], x3
 130         ld1             {v1.8b},  [x2], x3
 131         subs            w4,  w4,  #2
 132         st1             {v0.8b},  [x0], x1
 133         st1             {v1.8b},  [x0], x1
 134         b.ne            1b
 135         ret
 136 endfunc
 137
 138 function ff_vp9_avg8_neon, export=1
             // Average an 8 byte wide block of ref into dst with urhadd
             // (unsigned rounding halving add), two rows per loop iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be a nonzero multiple of 2.
             // x0 is used as the dst read pointer, x5 as the dst write pointer.
 139         mov             x5,  x0
 140 1:
 141         ld1             {v2.8b},  [x2], x3
 142         ld1             {v0.8b},  [x0], x1
 143         ld1             {v3.8b},  [x2], x3
 144         urhadd          v0.8b,  v0.8b,  v2.8b
 145         ld1             {v1.8b},  [x0], x1
 146         urhadd          v1.8b,  v1.8b,  v3.8b
 147         subs            w4,  w4,  #2
 148         st1             {v0.8b},  [x5], x1
 149         st1             {v1.8b},  [x5], x1
 150         b.ne            1b
 151         ret
 152 endfunc
 153
 154 function ff_vp9_copy4_neon, export=1
             // Copy a 4 byte wide block from ref to dst, four rows per loop
             // iteration. Each row travels through 32 bit lane 0 of v0-v3.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be a nonzero multiple of 4.
 155 1:
 156         ld1             {v0.s}[0], [x2], x3
 157         ld1             {v1.s}[0], [x2], x3
 158         st1             {v0.s}[0], [x0], x1
 159         ld1             {v2.s}[0], [x2], x3
 160         st1             {v1.s}[0], [x0], x1
 161         ld1             {v3.s}[0], [x2], x3
 162         subs            w4,  w4,  #4
 163         st1             {v2.s}[0], [x0], x1
 164         st1             {v3.s}[0], [x0], x1
 165         b.ne            1b
 166         ret
 167 endfunc
 168
 169 function ff_vp9_avg4_neon, export=1
             // Average a 4 byte wide block of ref into dst with urhadd
             // (unsigned rounding halving add), four rows per loop iteration.
             // In: x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride, w4 = h.
             // h must be a nonzero multiple of 4.
             // x0 is used as the dst read pointer, x5 as the dst write pointer.
 170         mov             x5,  x0
 171 1:
             // Pack two rows per register: rows 0 and 1 go into 32 bit lanes
             // 0 and 1 of v2 (ref) / v0 (dst), rows 2 and 3 into v3 / v1.
 172         ld1             {v2.s}[0], [x2], x3
 173         ld1             {v0.s}[0], [x0], x1
 174         ld1             {v2.s}[1], [x2], x3
 175         ld1             {v0.s}[1], [x0], x1
 176         ld1             {v3.s}[0], [x2], x3
 177         ld1             {v1.s}[0], [x0], x1
 178         ld1             {v3.s}[1], [x2], x3
 179         ld1             {v1.s}[1], [x0], x1
 180         subs            w4,  w4,  #4
             // Each urhadd averages two rows at once.
 181         urhadd          v0.8b,  v0.8b,  v2.8b
 182         urhadd          v1.8b,  v1.8b,  v3.8b
             // Unpack the lanes back into four separate rows.
 183         st1             {v0.s}[0], [x5], x1
 184         st1             {v0.s}[1], [x5], x1
 185         st1             {v1.s}[0], [x5], x1
 186         st1             {v1.s}[1], [x5], x1
 187         b.ne            1b
 188         ret
 189 endfunc
 190
 191
 192 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
 193 // for size >= 16), and multiply-accumulate into dst1 and dst3 (or
 194 // dst1-dst2 and dst3-dst4 for size >= 16)
 195 .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
             // The ext byte offset is 2*offset, i.e. the extracted vector
             // starts "offset" 16 bit elements into the src register pair.
             // Each extracted vector is multiplied by coefficient v0.h[offset].
             // Clobbers v20 and v22, plus v21 and v23 when size >= 16.
             // dst2, dst4, src3 and src6 are only used when size >= 16.
 196         ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
 197         ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
 198 .if \size >= 16
             // 16 outputs per row: two 8 element accumulators per row.
 199         mla             \dst1\().8h, v20.8h, v0.h[\offset]
 200         ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
 201         mla             \dst3\().8h, v22.8h, v0.h[\offset]
 202         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
 203         mla             \dst2\().8h, v21.8h, v0.h[\offset]
 204         mla             \dst4\().8h, v23.8h, v0.h[\offset]
 205 .elseif \size == 8
 206         mla             \dst1\().8h, v20.8h, v0.h[\offset]
 207         mla             \dst3\().8h, v22.8h, v0.h[\offset]
 208 .else
             // Remaining case (size 4): only the low four elements are used.
 209         mla             \dst1\().4h, v20.4h, v0.h[\offset]
 210         mla             \dst3\().4h, v22.4h, v0.h[\offset]
 211 .endif
 212 .endm
 213 // The same as above, but instead of accumulating straight into the
 214 // destination, multiply into a temp register and accumulate with saturation.
 215 .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
             // Takes the same parameters as extmla. The products are formed in
             // v20-v23 with mul and then added to the destinations with sqadd
             // (signed saturating add).
             // Clobbers v20 and v22, plus v21 and v23 when size >= 16.
 216         ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
 217         ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
 218 .if \size >= 16
 219         mul             v20.8h, v20.8h, v0.h[\offset]
 220         ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
 221         mul             v22.8h, v22.8h, v0.h[\offset]
 222         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
 223         mul             v21.8h, v21.8h, v0.h[\offset]
 224         mul             v23.8h, v23.8h, v0.h[\offset]
 225 .elseif \size == 8
 226         mul             v20.8h, v20.8h, v0.h[\offset]
 227         mul             v22.8h, v22.8h, v0.h[\offset]
 228 .else
 229         mul             v20.4h, v20.4h, v0.h[\offset]
 230         mul             v22.4h, v22.4h, v0.h[\offset]
 231 .endif
             // Saturating accumulation of the products into the destinations.
 232 .if \size == 4
 233         sqadd           \dst1\().4h, \dst1\().4h, v20.4h
 234         sqadd           \dst3\().4h, \dst3\().4h, v22.4h
 235 .else
 236         sqadd           \dst1\().8h, \dst1\().8h, v20.8h
 237         sqadd           \dst3\().8h, \dst3\().8h, v22.8h
 238 .if \size >= 16
 239         sqadd           \dst2\().8h, \dst2\().8h, v21.8h
 240         sqadd           \dst4\().8h, \dst4\().8h, v23.8h
 241 .endif
 242 .endif
 243 .endm
 244
 245
 246 // Instantiate a horizontal filter function for the given size.
 247 // This can work on 4, 8 or 16 pixels in parallel; for larger
 248 // widths it will do 16 pixels at a time and loop horizontally.
 249 // The actual width is passed in x5, the height in w4 and the
 250 // filter coefficients in x9. idx2 is the index of the largest
 251 // filter coefficient (3 or 4) and idx1 is the other one of them.
 252 .macro do_8tap_h type, size, idx1, idx2
 253 function \type\()_8tap_\size\()h_\idx1\idx2
             // On entry: x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride,
             // w4 = height, x5 = width, x9 = pointer to 8 filter coefficients
             // of 16 bits each.
             // Two rows are filtered per vertical iteration: x0/x2 address the
             // first row and x6/x7 the row below it, with both strides doubled.
             // The height is decremented by 2 and tested for zero, so it must
             // be a nonzero multiple of 2.
             // Start reading 3 pixels to the left of src; tap 0 applies there.
 254         sub             x2,  x2,  #3
 255         add             x6,  x0,  x1
 256         add             x7,  x2,  x3
 257         add             x1,  x1,  x1
 258         add             x3,  x3,  x3
 259         // Only size >= 16 loops horizontally and needs
 260         // reduced dst stride
 261 .if \size >= 16
             // The stores post-increment x0/x6 by 16 per horizontal step,
             // in total by the width.
 262         sub             x1,  x1,  x5
 263 .endif
 264         // size >= 16 loads two qwords and increments x2,
 265         // for size 4/8 it's enough with one qword and no
 266         // postincrement
 267 .if \size >= 16
             // The loads post-increment x2/x7 by 24 for the first step and
             // by 16 for each further step, in total by width + 8.
 268         sub             x3,  x3,  x5
 269         sub             x3,  x3,  #8
 270 .endif
 271         // Load the filter vector
 272         ld1             {v0.8h},  [x9]
     // Label 1: start of one pair of rows.
 273 1:
 274 .if \size >= 16
             // The filter is already in v0, so x9 is reused as the count of
             // pixels left in this row pair.
 275         mov             x9,  x5
 276 .endif
 277         // Load src
 278 .if \size >= 16
 279         ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
 280         ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
 281 .else
 282         ld1             {v4.8b,  v5.8b},  [x2]
 283         ld1             {v16.8b, v17.8b}, [x7]
 284 .endif
             // Widen the pixels to 16 bit: v4-v6 hold the first row,
             // v16-v18 the second row.
 285         uxtl            v4.8h,  v4.8b
 286         uxtl            v5.8h,  v5.8b
 287         uxtl            v16.8h, v16.8b
 288         uxtl            v17.8h, v17.8b
 289 .if \size >= 16
 290         uxtl            v6.8h,  v6.8b
 291         uxtl            v18.8h, v18.8b
 292 .endif
     // Label 2: one horizontal step (only branched back to for size >= 16).
 293 2:
 294
 295         // Accumulate, adding idx2 last with a separate
 296         // saturating add. The positive filter coefficients
 297         // for all indices except idx2 must add up to less
 298         // than 127 for this not to overflow.
             // Tap 0 initializes the accumulators: v1 (and v2) for the first
             // row, v24 (and v25) for the second row.
 299         mul             v1.8h,  v4.8h,  v0.h[0]
 300         mul             v24.8h, v16.8h, v0.h[0]
 301 .if \size >= 16
 302         mul             v2.8h,  v5.8h,  v0.h[0]
 303         mul             v25.8h, v17.8h, v0.h[0]
 304 .endif
 305         extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 1,     \size
 306         extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 2,     \size
 307         extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx1, \size
 308         extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 5,     \size
 309         extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 6,     \size
 310         extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 7,     \size
 311         extmulqadd      v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx2, \size
 312
 313         // Round, shift and saturate
             // sqrshrun by 7 narrows each 16 bit sum to an unsigned byte.
 314         sqrshrun        v1.8b,   v1.8h,  #7
 315         sqrshrun        v24.8b,  v24.8h, #7
 316 .if \size >= 16
 317         sqrshrun2       v1.16b,  v2.8h,  #7
 318         sqrshrun2       v24.16b, v25.8h, #7
 319 .endif
 320         // Average
             // Only for type avg: combine with the existing dst pixels using
             // urhadd (unsigned rounding halving add).
 321 .ifc \type,avg
 322 .if \size >= 16
 323         ld1             {v2.16b}, [x0]
 324         ld1             {v3.16b}, [x6]
 325         urhadd          v1.16b,  v1.16b,  v2.16b
 326         urhadd          v24.16b, v24.16b, v3.16b
 327 .elseif \size == 8
 328         ld1             {v2.8b},  [x0]
 329         ld1             {v3.8b},  [x6]
 330         urhadd          v1.8b,  v1.8b,  v2.8b
 331         urhadd          v24.8b, v24.8b, v3.8b
 332 .else
 333         ld1             {v2.s}[0], [x0]
 334         ld1             {v3.s}[0], [x6]
 335         urhadd          v1.8b,  v1.8b,  v2.8b
 336         urhadd          v24.8b, v24.8b, v3.8b
 337 .endif
 338 .endif
 339         // Store and loop horizontally (for size >= 16)
 340 .if \size >= 16
             // The stores do not modify the flags, so b.eq still sees the
             // result of the subs.
 341         subs            x9,  x9,  #16
 342         st1             {v1.16b},  [x0], #16
 343         st1             {v24.16b}, [x6], #16
 344         b.eq            3f
             // Slide the window by 16 pixels: the last 8 widened pixels
             // become the first 8, and 16 new pixels are loaded and widened
             // into the other two registers.
 345         mov             v4.16b,  v6.16b
 346         mov             v16.16b, v18.16b
 347         ld1             {v6.16b},  [x2], #16
 348         ld1             {v18.16b}, [x7], #16
 349         uxtl            v5.8h,  v6.8b
 350         uxtl2           v6.8h,  v6.16b
 351         uxtl            v17.8h, v18.8b
 352         uxtl2           v18.8h, v18.16b
 353         b               2b
 354 .elseif \size == 8
 355         st1             {v1.8b},    [x0]
 356         st1             {v24.8b},   [x6]
 357 .else // \size == 4
 358         st1             {v1.s}[0],  [x0]
 359         st1             {v24.s}[0], [x6]
 360 .endif
 361 3:
 362         // Loop vertically
             // x1 and x3 are the doubled (and for size >= 16 width adjusted)
             // strides, so each pointer moves down by two rows.
 363         add             x0,  x0,  x1
 364         add             x6,  x6,  x1
 365         add             x2,  x2,  x3
 366         add             x7,  x7,  x3
 367         subs            w4,  w4,  #2
 368         b.ne            1b
 369         ret
 370 endfunc
 371 .endm
 372
 373 .macro do_8tap_h_size size
     // Instantiate the four horizontal filter functions for one size:
     // put and avg, each for idx1/idx2 = 3/4 and 4/3.
 374 do_8tap_h put, \size, 3, 4
 375 do_8tap_h avg, \size, 3, 4
 376 do_8tap_h put, \size, 4, 3
 377 do_8tap_h avg, \size, 4, 3
 378 .endm
 379
     // The size 16 instance is also the one that handles the wider blocks;
     // it loops horizontally over the width passed in x5.
 380 do_8tap_h_size 4
 381 do_8tap_h_size 8
 382 do_8tap_h_size 16
 383
 384 .macro do_8tap_h_func type, filter, offset, size
 385 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
             // Public entry point (vp9_mc_func signature; mx arrives in w5).
             // It sets up x9 and x5 and tail-branches to the matching internal
             // horizontal filter function.
             // x6 = ff_vp9_subpel_filters + 256*offset bytes, the start of the
             // coefficients for this filter type.
 386         movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
             // Compare mx against 8; add and mov below leave the flags intact
             // for the b.ge.
 387         cmp             w5,  #8
             // x9 = x6 + mx * 16 (uxtw #4): the address from which the filter
             // function loads its eight 16 bit coefficients.
 388         add             x9,  x6,  w5, uxtw #4
             // x5 = block width, as expected by the filter function.
 389         mov             x5,  #\size
             // mx >= 8 selects the _34 variant (idx2 = 4, the index added last
             // with saturation), otherwise the _43 variant.
 390 .if \size >= 16
             // All sizes >= 16 share the 16 pixel wide function.
 391         b.ge            \type\()_8tap_16h_34
 392         b               \type\()_8tap_16h_43
 393 .else
 394         b.ge            \type\()_8tap_\size\()h_34
 395         b               \type\()_8tap_\size\()h_43
 396 .endif
 397 endfunc
 398 .endm
 399
 400 .macro do_8tap_h_filters size
     // Emit the public put and avg horizontal entry points for one size and
     // each filter type. The number is the filter type offset that
     // do_8tap_h_func multiplies by 256 to index into ff_vp9_subpel_filters.
 401 do_8tap_h_func put, regular, 1, \size
 402 do_8tap_h_func avg, regular, 1, \size
 403 do_8tap_h_func put, sharp,   2, \size
 404 do_8tap_h_func avg, sharp,   2, \size
 405 do_8tap_h_func put, smooth,  0, \size
 406 do_8tap_h_func avg, smooth,  0, \size
 407 .endm
 408
     // Public horizontal entry points for every supported block width.
 409 do_8tap_h_filters 64
 410 do_8tap_h_filters 32
 411 do_8tap_h_filters 16
 412 do_8tap_h_filters 8
 413 do_8tap_h_filters 4
 414
 415
 416 // Vertical filters
 417
 418 // Round, shift and saturate and store reg1-reg2 over 4 lines
 419 .macro do_store4 reg1, reg2, tmp1, tmp2, type
             // reg1 and reg2 each hold two rows of 4 pixels as 16 bit sums:
             // reg1 carries output rows 0 and 2, reg2 rows 1 and 3 (see the
             // lane order of the stores below).
             // sqrshrun by 7 narrows each sum to an unsigned byte.
             // x0 is the store pointer; for type avg, x7 is the pointer used
             // to read the existing dst pixels. Both advance by 4 rows.
 420         sqrshrun        \reg1\().8b,  \reg1\().8h, #7
 421         sqrshrun        \reg2\().8b,  \reg2\().8h, #7
 422 .ifc \type,avg
             // Load the existing dst rows in the same lane layout
             // (tmp1 = rows 0 and 2, tmp2 = rows 1 and 3) and average.
 423         ld1             {\tmp1\().s}[0],  [x7], x1
 424         ld1             {\tmp2\().s}[0],  [x7], x1
 425         ld1             {\tmp1\().s}[1],  [x7], x1
 426         ld1             {\tmp2\().s}[1],  [x7], x1
 427         urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
 428         urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
 429 .endif
 430         st1             {\reg1\().s}[0],  [x0], x1
 431         st1             {\reg2\().s}[0],  [x0], x1
 432         st1             {\reg1\().s}[1],  [x0], x1
 433         st1             {\reg2\().s}[1],  [x0], x1
 434 .endm
 435
 436 // Round, shift and saturate and store reg1-4
 437 .macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
             // reg1-reg4 each hold one row of 8 pixels as 16 bit sums.
             // sqrshrun by 7 narrows each sum to an unsigned byte.
             // x0 is the store pointer; for type avg, x7 is the pointer used
             // to read the existing dst pixels. Both advance by 4 rows.
 438         sqrshrun        \reg1\().8b,  \reg1\().8h, #7
 439         sqrshrun        \reg2\().8b,  \reg2\().8h, #7
 440         sqrshrun        \reg3\().8b,  \reg3\().8h, #7
 441         sqrshrun        \reg4\().8b,  \reg4\().8h, #7
 442 .ifc \type,avg
             // Load the four existing dst rows into tmp1-tmp4 and average
             // them with the filtered rows (urhadd: rounding halving add).
 443         ld1             {\tmp1\().8b},  [x7], x1
 444         ld1             {\tmp2\().8b},  [x7], x1
 445         ld1             {\tmp3\().8b},  [x7], x1
 446         ld1             {\tmp4\().8b},  [x7], x1
 447         urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
 448         urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
 449         urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
 450         urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
 451 .endif
 452         st1             {\reg1\().8b},  [x0], x1
 453         st1             {\reg2\().8b},  [x0], x1
 454         st1             {\reg3\().8b},  [x0], x1
 455         st1             {\reg4\().8b},  [x0], x1
 456 .endm
 457
 458 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
 459 // (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
 460 // at the end with saturation. Indices 0 and 7 always have negative or zero
 461 // coefficients, so they can be accumulated into tmp1-tmp2 together with the
 462 // largest coefficient.
 463 .macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
             // The coefficients are read from v0.h[0..7]. Tap k of dst1 uses
             // src(k+1) and tap k of dst2 uses src(k+2).
             // dst1/dst2 collect taps 1, 2, idx1, 5 and 6; tmp1/tmp2 collect
             // taps 0, 7 and idx2. The two partial sums are combined at the
             // end with sqadd (signed saturating add).
             // Clobbers tmp1 and tmp2.
 464         mul             \dst1\().8h, \src2\().8h, v0.h[1]
 465         mul             \dst2\().8h, \src3\().8h, v0.h[1]
 466         mul             \tmp1\().8h, \src1\().8h, v0.h[0]
 467         mul             \tmp2\().8h, \src2\().8h, v0.h[0]
 468         mla             \dst1\().8h, \src3\().8h, v0.h[2]
 469         mla             \dst2\().8h, \src4\().8h, v0.h[2]
             // Tap idx1 (3 or 4) goes into the plain accumulators.
 470 .if \idx1 == 3
 471         mla             \dst1\().8h, \src4\().8h, v0.h[3]
 472         mla             \dst2\().8h, \src5\().8h, v0.h[3]
 473 .else
 474         mla             \dst1\().8h, \src5\().8h, v0.h[4]
 475         mla             \dst2\().8h, \src6\().8h, v0.h[4]
 476 .endif
 477         mla             \dst1\().8h, \src6\().8h, v0.h[5]
 478         mla             \dst2\().8h, \src7\().8h, v0.h[5]
 479         mla             \tmp1\().8h, \src8\().8h, v0.h[7]
 480         mla             \tmp2\().8h, \src9\().8h, v0.h[7]
 481         mla             \dst1\().8h, \src7\().8h, v0.h[6]
 482         mla             \dst2\().8h, \src8\().8h, v0.h[6]
             // Tap idx2 (the largest coefficient) goes into the temporaries.
 483 .if \idx2 == 3
 484         mla             \tmp1\().8h, \src4\().8h, v0.h[3]
 485         mla             \tmp2\().8h, \src5\().8h, v0.h[3]
 486 .else
 487         mla             \tmp1\().8h, \src5\().8h, v0.h[4]
 488         mla             \tmp2\().8h, \src6\().8h, v0.h[4]
 489 .endif
 490         sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
 491         sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
 492 .endm
 493
 494 // Load pixels and extend them to 16 bit
 495 .macro loadl dst1, dst2, dst3, dst4
             // Load three rows of 8 pixels from [x2] (four rows if dst4 is
             // given), advancing x2 by the stride in x3 after each row, and
             // zero-extend them into dst1-dst3 (and dst4).
             // Clobbers v1-v3 (and v4 if dst4 is given) as load temporaries.
 496         ld1             {v1.8b}, [x2], x3
 497         ld1             {v2.8b}, [x2], x3
 498         ld1             {v3.8b}, [x2], x3
 499 .ifnb \dst4
 500         ld1             {v4.8b}, [x2], x3
 501 .endif
 502         uxtl            \dst1\().8h, v1.8b
 503         uxtl            \dst2\().8h, v2.8b
 504         uxtl            \dst3\().8h, v3.8b
 505 .ifnb \dst4
 506         uxtl            \dst4\().8h, v4.8b
 507 .endif
 508 .endm
 509
 510 // Instantiate a vertical filter function for filtering 8 pixels at a time.
 511 // The height is passed in x4, the width in x5 and the filter coefficients
 512 // in x6. idx2 is the index of the largest filter coefficient (3 or 4)
 513 // and idx1 is the other one of them.
 514 .macro do_8tap_8v type, idx1, idx2
 515 function \type\()_8tap_8v_\idx1\idx2
             // On entry: x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride,
             // x4 = height, x5 = width, x6 = pointer to the filter coefficients.
             // The block is processed in 8 pixel wide columns (label 1), each
             // column in groups of 4 output rows. The counters are decremented
             // by 8 and 4 and tested for zero, so the width must be a nonzero
             // multiple of 8 and the height a nonzero multiple of 4.
             // Step src back by 3 rows (2 * stride, then 1 * stride).
 516         sub             x2,  x2,  x3, lsl #1
 517         sub             x2,  x2,  x3
 518         ld1             {v0.8h},  [x6]
     // Label 1: start of one 8 pixel wide column.
 519 1:
 520 .ifc \type,avg
             // x7 = read pointer for the existing dst pixels (used by do_store).
 521         mov             x7,  x0
 522 .endif
             // The filter is already in v0, so x6 is reused as the count of
             // rows left in this column.
 523         mov             x6,  x4
 524
             // Preload the first 7 input rows (3 + 4).
 525         loadl           v17, v18, v19
 526
 527         loadl           v20, v21, v22, v23
     // Label 2: the loop body is unrolled three times. Each part loads 4 new
     // rows and produces 4 output rows; the register roles rotate between
     // the parts so that no rows need to be moved.
 528 2:
 529         loadl           v24, v25, v26, v27
 530         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
 531         convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
 532         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 533
 534         subs            x6,  x6,  #4
 535         b.eq            8f
 536
 537         loadl           v16, v17, v18, v19
 538         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
 539         convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
 540         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 541
 542         subs            x6,  x6,  #4
 543         b.eq            8f
 544
 545         loadl           v20, v21, v22, v23
 546         convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
 547         convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
 548         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
 549
 550         subs            x6,  x6,  #4
 551         b.ne            2b
 552
     // Label 8: this column is done; move to the next 8 pixels or return.
 553 8:
 554         subs            x5,  x5,  #8
 555         b.eq            9f
             // Rewind both pointers to the top of the block. src has advanced
             // by h + 7 rows in total (7 preloaded rows plus 4 per group),
             // which is undone below as -h, -8 and +1 rows.
 556         // x0 -= h * dst_stride
 557         msub            x0,  x1,  x4, x0
 558         // x2 -= h * src_stride
 559         msub            x2,  x3,  x4, x2
 560         // x2 -= 8 * src_stride
 561         sub             x2,  x2,  x3, lsl #3
 562         // x2 += 1 * src_stride
 563         add             x2,  x2,  x3
             // Step 8 pixels to the right in both src and dst.
 564         add             x2,  x2,  #8
 565         add             x0,  x0,  #8
 566         b               1b
 567 9:
 568         ret
 569 endfunc
 570 .endm
 571
     // Instantiate put and avg, each for idx1/idx2 = 3/4 and 4/3.
 572 do_8tap_8v put, 3, 4
 573 do_8tap_8v put, 4, 3
 574 do_8tap_8v avg, 3, 4
 575 do_8tap_8v avg, 4, 3
 576
 577
 578 // Instantiate a vertical filter function for filtering a 4 pixels wide
 579 // slice. The first half of the registers contain one row, while the second
 580 // half of a register contains the second-next row (also stored in the first
 581 // half of the register two steps ahead). The convolution does two outputs
 582 // at a time; the output of v17-v24 into one, and v18-v25 into another one.
 583 // The first half of first output is the first output row, the first half
 584 // of the other output is the second output row. The second halves of the
 585 // registers are rows 3 and 4.
 586 // This is only designed to work for 4 or 8 output lines.
 587 .macro do_8tap_4v type, idx1, idx2
 588 function \type\()_8tap_4v_\idx1\idx2
             // On entry: x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride,
             // x4 = height, x6 = pointer to the filter coefficients.
             // Step src back by 3 rows (2 * stride, then 1 * stride).
 589         sub             x2,  x2,  x3, lsl #1
 590         sub             x2,  x2,  x3
 591         ld1             {v0.8h},  [x6]
 592 .ifc \type,avg
             // x7 = read pointer for the existing dst pixels (used by do_store4).
 593         mov             x7,  x0
 594 .endif
 595
             // Load 11 input rows of 4 pixels (enough for 4 output rows) into
             // lane 0 of v1-v7 and v26-v29. The interleaved trn1 instructions
             // put row n+2 into lane 1 next to row n, and uxtl widens each
             // pair to 16 bit in v17-v25.
 596         ld1             {v1.s}[0],  [x2], x3
 597         ld1             {v2.s}[0],  [x2], x3
 598         ld1             {v3.s}[0],  [x2], x3
 599         ld1             {v4.s}[0],  [x2], x3
 600         ld1             {v5.s}[0],  [x2], x3
 601         ld1             {v6.s}[0],  [x2], x3
 602         trn1            v1.2s,  v1.2s,  v3.2s
 603         ld1             {v7.s}[0],  [x2], x3
 604         trn1            v2.2s,  v2.2s,  v4.2s
 605         ld1             {v26.s}[0], [x2], x3
 606         uxtl            v17.8h, v1.8b
 607         trn1            v3.2s,  v3.2s,  v5.2s
 608         ld1             {v27.s}[0], [x2], x3
 609         uxtl            v18.8h, v2.8b
 610         trn1            v4.2s,  v4.2s,  v6.2s
 611         ld1             {v28.s}[0], [x2], x3
 612         uxtl            v19.8h, v3.8b
 613         trn1            v5.2s,  v5.2s,  v7.2s
 614         ld1             {v29.s}[0], [x2], x3
 615         uxtl            v20.8h, v4.8b
 616         trn1            v6.2s,  v6.2s,  v26.2s
 617         uxtl            v21.8h, v5.8b
 618         trn1            v7.2s,  v7.2s,  v27.2s
 619         uxtl            v22.8h, v6.8b
 620         trn1            v26.2s, v26.2s, v28.2s
 621         uxtl            v23.8h, v7.8b
 622         trn1            v27.2s, v27.2s, v29.2s
 623         uxtl            v24.8h, v26.8b
 624         uxtl            v25.8h, v27.8b
 625
             // First 4 output rows.
 626         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
 627         do_store4       v1,  v2,  v5,  v6,  \type
 628
             // Return if the height was 4.
 629         subs            x4,  x4,  #4
 630         b.eq            9f
 631
             // Otherwise load 4 more input rows. v28/v29 still hold the last
             // two rows loaded above in lane 0; they are paired with the first
             // two new rows, and the new rows are paired among themselves in
             // v1/v2. The results are widened into v26-v29.
 632         ld1             {v1.s}[0],  [x2], x3
 633         ld1             {v2.s}[0],  [x2], x3
 634         trn1            v28.2s, v28.2s, v1.2s
 635         trn1            v29.2s, v29.2s, v2.2s
 636         ld1             {v1.s}[1],  [x2], x3
 637         uxtl            v26.8h, v28.8b
 638         ld1             {v2.s}[1],  [x2], x3
 639         uxtl            v27.8h, v29.8b
 640         uxtl            v28.8h, v1.8b
 641         uxtl            v29.8h, v2.8b
 642
             // Second group of 4 output rows; there is no loop back, the
             // function returns after this group.
 643         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
 644         do_store4       v1,  v2,  v5,  v6,  \type
 645
 646 9:
 647         ret
 648 endfunc
 649 .endm
 650
     // Instantiate put and avg, each for idx1/idx2 = 3/4 and 4/3.
 651 do_8tap_4v put, 3, 4
 652 do_8tap_4v put, 4, 3
 653 do_8tap_4v avg, 3, 4
 654 do_8tap_4v avg, 4, 3
 655
 656
 657 .macro do_8tap_v_func type, filter, offset, size
 658 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
             // Public entry point (vp9_mc_func signature; h arrives in w4 and
             // my in w6). It sets up x4, x5 and x6 and tail-branches to the
             // matching internal vertical filter function.
             // Zero-extend h to 64 bit; the 8 pixel wide function uses x4 in
             // 64 bit arithmetic (msub).
 659         uxtw            x4,  w4
             // x5 = ff_vp9_subpel_filters + 256*offset bytes, the start of the
             // coefficients for this filter type.
 660         movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
             // Compare my against 8; add and mov below leave the flags intact
             // for the b.ge.
 661         cmp             w6,  #8
             // x6 = x5 + my * 16 (uxtw #4): the address from which the filter
             // function loads its eight 16 bit coefficients.
 662         add             x6,  x5,  w6, uxtw #4
             // x5 = block width, as expected by the 8 pixel wide function.
 663         mov             x5,  #\size
             // my >= 8 selects the _34 variant (idx2 = 4, the index accumulated
             // separately and added with saturation), otherwise _43.
 664 .if \size >= 8
             // All sizes >= 8 share the function working on 8 pixel columns.
 665         b.ge            \type\()_8tap_8v_34
 666         b               \type\()_8tap_8v_43
 667 .else
 668         b.ge            \type\()_8tap_4v_34
 669         b               \type\()_8tap_4v_43
 670 .endif
 671 endfunc
 672 .endm
 673
 674 .macro do_8tap_v_filters size
     // Emit the public put and avg vertical entry points for one size and
     // each filter type. The number is the filter type offset that
     // do_8tap_v_func multiplies by 256 to index into ff_vp9_subpel_filters.
 675 do_8tap_v_func put, regular, 1, \size
 676 do_8tap_v_func avg, regular, 1, \size
 677 do_8tap_v_func put, sharp,   2, \size
 678 do_8tap_v_func avg, sharp,   2, \size
 679 do_8tap_v_func put, smooth,  0, \size
 680 do_8tap_v_func avg, smooth,  0, \size
 681 .endm
 682
     // Public vertical entry points for every supported block width.
 683 do_8tap_v_filters 64
 684 do_8tap_v_filters 32
 685 do_8tap_v_filters 16
 686 do_8tap_v_filters 8
 687 do_8tap_v_filters 4