/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);
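
// As a rough reference (a sketch, not part of the FFmpeg sources; w is
// the width encoded in each function's name), the copy functions below
// implement
//     for (y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
//         memcpy(dst, ref, w);
// and the avg functions implement the per-byte rounding average that
// urhadd performs:
//     for (y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
//         for (int x = 0; x < w; x++)
//             dst[x] = (dst[x] + ref[x] + 1) >> 1;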
function ff_vp9_avg64_neon, export=1
        mov             x5,  x0
1:
        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
        urhadd          v0.16b,  v0.16b,  v4.16b
        urhadd          v1.16b,  v1.16b,  v5.16b
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        urhadd          v2.16b,  v2.16b,  v6.16b
        urhadd          v3.16b,  v3.16b,  v7.16b
        subs            w4,  w4,  #2
        urhadd          v16.16b, v16.16b, v20.16b
        urhadd          v17.16b, v17.16b, v21.16b
        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
        urhadd          v18.16b, v18.16b, v22.16b
        urhadd          v19.16b, v19.16b, v23.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
        b.ne            1b
        ret
endfunc
function ff_vp9_avg32_neon, export=1
1:
        ld1             {v2.16b, v3.16b},  [x2], x3
        ld1             {v0.16b, v1.16b},  [x0]
        urhadd          v0.16b,  v0.16b,  v2.16b
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #1
        st1             {v0.16b, v1.16b},  [x0], x1
        b.ne            1b
        ret
endfunc
function ff_vp9_copy16_neon, export=1
        add             x5,  x0,  x1
        lsl             x1,  x1,  #1
        add             x6,  x2,  x3
        lsl             x3,  x3,  #1
1:
        ld1             {v0.16b},  [x2],  x3
        ld1             {v1.16b},  [x6],  x3
        ld1             {v2.16b},  [x2],  x3
        ld1             {v3.16b},  [x6],  x3
        subs            w4,  w4,  #4
        st1             {v0.16b},  [x0],  x1
        st1             {v1.16b},  [x5],  x1
        st1             {v2.16b},  [x0],  x1
        st1             {v3.16b},  [x5],  x1
        b.ne            1b
        ret
endfunc
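
// Note that copy16 keeps two read pointers (x2, x6) and two write
// pointers (x0, x5) one row apart, with both strides doubled, so the
// four loads and four stores per iteration are independent of each
// other.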
function ff_vp9_avg16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.16b},  [x2],  x3
        ld1             {v0.16b},  [x0],  x1
        ld1             {v3.16b},  [x2],  x3
        urhadd          v0.16b,  v0.16b,  v2.16b
        ld1             {v1.16b},  [x0],  x1
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #2
        st1             {v0.16b},  [x5],  x1
        st1             {v1.16b},  [x5],  x1
        b.ne            1b
        ret
endfunc
function ff_vp9_copy8_neon, export=1
1:
        ld1             {v0.8b},  [x2],  x3
        ld1             {v1.8b},  [x2],  x3
        subs            w4,  w4,  #2
        st1             {v0.8b},  [x0],  x1
        st1             {v1.8b},  [x0],  x1
        b.ne            1b
        ret
endfunc
function ff_vp9_avg8_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.8b},  [x2],  x3
        ld1             {v0.8b},  [x0],  x1
        ld1             {v3.8b},  [x2],  x3
        urhadd          v0.8b,  v0.8b,  v2.8b
        ld1             {v1.8b},  [x0],  x1
        urhadd          v1.8b,  v1.8b,  v3.8b
        subs            w4,  w4,  #2
        st1             {v0.8b},  [x5],  x1
        st1             {v1.8b},  [x5],  x1
        b.ne            1b
        ret
endfunc
function ff_vp9_copy4_neon, export=1
1:
        ld1             {v0.s}[0],  [x2],  x3
        ld1             {v1.s}[0],  [x2],  x3
        st1             {v0.s}[0],  [x0],  x1
        ld1             {v2.s}[0],  [x2],  x3
        st1             {v1.s}[0],  [x0],  x1
        ld1             {v3.s}[0],  [x2],  x3
        subs            w4,  w4,  #4
        st1             {v2.s}[0],  [x0],  x1
        st1             {v3.s}[0],  [x0],  x1
        b.ne            1b
        ret
endfunc
function ff_vp9_avg4_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.s}[0], [x2], x3
        ld1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v0.s}[1], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        ld1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[1], [x2], x3
        ld1             {v1.s}[1], [x0], x1
        subs            w4,  w4,  #4
        urhadd          v0.8b,  v0.8b,  v2.8b
        urhadd          v1.8b,  v1.8b,  v3.8b
        st1             {v0.s}[0], [x5], x1
        st1             {v0.s}[1], [x5], x1
        st1             {v1.s}[0], [x5], x1
        st1             {v1.s}[1], [x5], x1
        b.ne            1b
        ret
endfunc
// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mla             \dst1\().8h, v20.8h, v0.h[\offset]
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mla             \dst3\().8h, v22.8h, v0.h[\offset]
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mla             \dst2\().8h, v21.8h, v0.h[\offset]
        mla             \dst4\().8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mla             \dst1\().8h, v20.8h, v0.h[\offset]
        mla             \dst3\().8h, v22.8h, v0.h[\offset]
.else
        mla             \dst1\().4h, v20.4h, v0.h[\offset]
        mla             \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
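
// For example (an illustration, not from the original comments): with
// offset = 1 and size >= 16, the first ext above picks bytes 2..17 of
// the concatenation src1:src2, i.e. the same row of 16 bit pixels
// shifted along by one pixel, which is then multiplied by filter
// coefficient 1 from v0 and accumulated.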
// The same as above, but instead of accumulating straight into the
// destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
        mul             v20.8h, v20.8h, v0.h[\offset]
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mul             v22.8h, v22.8h, v0.h[\offset]
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mul             v21.8h, v21.8h, v0.h[\offset]
        mul             v23.8h, v23.8h, v0.h[\offset]
.elseif \size == 8
        mul             v20.8h, v20.8h, v0.h[\offset]
        mul             v22.8h, v22.8h, v0.h[\offset]
.else
        mul             v20.4h, v20.4h, v0.h[\offset]
        mul             v22.4h, v22.4h, v0.h[\offset]
.endif
.if \size == 4
        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
.else
        sqadd           \dst1\().8h, \dst1\().8h, v20.8h
        sqadd           \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
        sqadd           \dst2\().8h, \dst2\().8h, v21.8h
        sqadd           \dst4\().8h, \dst4\().8h, v23.8h
.endif
.endif
.endm
// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
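// In C terms, one output pixel of the filter below corresponds roughly
// to the following (a sketch; the names are illustrative only):
//     int sum = 0;
//     for (int k = 0; k < 8; k++)
//         sum += filter[k] * src[x + k - 3];
//     dst[x] = av_clip_uint8((sum + 64) >> 7);
// where the term for the largest coefficient (index idx2) is added
// with a saturating add just before the rounding shift (sqrshrun #7).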
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             x2,  x2,  #3
        add             x6,  x0,  x1
        add             x7,  x2,  x3
        lsl             x1,  x1,  #1
        lsl             x3,  x3,  #1
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub             x1,  x1,  x5
.endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
        // postincrement
.if \size >= 16
        sub             x3,  x3,  x5
        sub             x3,  x3,  #8
.endif
        // Load the filter vector
        ld1             {v0.8h},  [x9]
1:
.if \size >= 16
        mov             x9,  x5
.endif
        // Load src
.if \size >= 16
        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
        ld1             {v4.8b,  v5.8b},  [x2]
        ld1             {v16.8b, v17.8b}, [x7]
.endif
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
.if \size >= 16
        uxtl            v6.8h,  v6.8b
        uxtl            v18.8h, v18.8b
.endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
        mul             v1.8h,  v4.8h,  v0.h[0]
        mul             v24.8h, v16.8h, v0.h[0]
.if \size >= 16
        mul             v2.8h,  v5.8h,  v0.h[0]
        mul             v25.8h, v17.8h, v0.h[0]
.endif
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 1,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 2,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx1, \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 5,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 6,     \size
        extmla          v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 7,     \size
        extmulqadd      v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, \idx2, \size

        // Round, shift and saturate
        sqrshrun        v1.8b,   v1.8h,  #7
        sqrshrun        v24.8b,  v24.8h, #7
.if \size >= 16
        sqrshrun2       v1.16b,  v2.8h,  #7
        sqrshrun2       v24.16b, v25.8h, #7
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1             {v2.16b}, [x0]
        ld1             {v3.16b}, [x6]
        urhadd          v1.16b,  v1.16b,  v2.16b
        urhadd          v24.16b, v24.16b, v3.16b
.elseif \size == 8
        ld1             {v2.8b},  [x0]
        ld1             {v3.8b},  [x6]
        urhadd          v1.8b,  v1.8b,  v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.else
        ld1             {v2.s}[0], [x0]
        ld1             {v3.s}[0], [x6]
        urhadd          v1.8b,  v1.8b,  v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            x9,  x9,  #16
        st1             {v1.16b},  [x0], #16
        st1             {v24.16b}, [x6], #16
        b.eq            3f
        mov             v4.16b,  v6.16b
        mov             v16.16b, v18.16b
        ld1             {v6.16b},  [x2], #16
        ld1             {v18.16b}, [x7], #16
        uxtl            v5.8h,  v6.8b
        uxtl2           v6.8h,  v6.16b
        uxtl            v17.8h, v18.8b
        uxtl2           v18.8h, v18.16b
        b               2b
.elseif \size == 8
        st1             {v1.8b},    [x0]
        st1             {v24.8b},   [x6]
.else
        st1             {v1.s}[0],  [x0]
        st1             {v24.s}[0], [x6]
.endif
3:
        // Loop vertically
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x2,  x2,  x3
        add             x7,  x7,  x3
        subs            w4,  w4,  #2
        b.ne            1b
        ret
endfunc
.endm
.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
        cmp             w5,  #8
        add             x9,  x6,  w5, uxtw #4
        mov             x5,  #\size
.if \size >= 16
        b.ge            \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        b.ge            \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm
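
// The indexing above assumes the layout of ff_vp9_subpel_filters:
// three banks (smooth, regular, sharp - selected via 256*\offset) of
// 16 subpel positions, each position being 8 coefficients of 2 bytes,
// i.e. 16 bytes per position - hence "w5, uxtw #4" (mx * 16).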
.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4
// Round, shift and saturate and store reg1-reg2 over 4 lines
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun        \reg1\().8b, \reg1\().8h, #7
        sqrshrun        \reg2\().8b, \reg2\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().s}[0], [x7], x1
        ld1             {\tmp2\().s}[0], [x7], x1
        ld1             {\tmp1\().s}[1], [x7], x1
        ld1             {\tmp2\().s}[1], [x7], x1
        urhadd          \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd          \reg2\().8b, \reg2\().8b, \tmp2\().8b
.endif
        st1             {\reg1\().s}[0], [x0], x1
        st1             {\reg2\().s}[0], [x0], x1
        st1             {\reg1\().s}[1], [x0], x1
        st1             {\reg2\().s}[1], [x0], x1
.endm
// Round, shift and saturate and store reg1-4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun        \reg1\().8b, \reg1\().8h, #7
        sqrshrun        \reg2\().8b, \reg2\().8h, #7
        sqrshrun        \reg3\().8b, \reg3\().8h, #7
        sqrshrun        \reg4\().8b, \reg4\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().8b}, [x7], x1
        ld1             {\tmp2\().8b}, [x7], x1
        ld1             {\tmp3\().8b}, [x7], x1
        ld1             {\tmp4\().8b}, [x7], x1
        urhadd          \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd          \reg2\().8b, \reg2\().8b, \tmp2\().8b
        urhadd          \reg3\().8b, \reg3\().8b, \tmp3\().8b
        urhadd          \reg4\().8b, \reg4\().8b, \tmp4\().8b
.endif
        st1             {\reg1\().8b}, [x0], x1
        st1             {\reg2\().8b}, [x0], x1
        st1             {\reg3\().8b}, [x0], x1
        st1             {\reg4\().8b}, [x0], x1
.endm
// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
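// Numerically (an illustrative note): with 8 bit input pixels and the
// coefficient constraint described for the horizontal filter, the mla
// chains stay within a signed 16 bit element (at most 255 * 127 =
// 32385 < 32767), so only the final sqadd of the tmp accumulator can
// actually saturate.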
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        mul             \dst1\().8h, \src2\().8h, v0.h[1]
        mul             \dst2\().8h, \src3\().8h, v0.h[1]
        mul             \tmp1\().8h, \src1\().8h, v0.h[0]
        mul             \tmp2\().8h, \src2\().8h, v0.h[0]
        mla             \dst1\().8h, \src3\().8h, v0.h[2]
        mla             \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
        mla             \dst1\().8h, \src4\().8h, v0.h[3]
        mla             \dst2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \dst1\().8h, \src5\().8h, v0.h[4]
        mla             \dst2\().8h, \src6\().8h, v0.h[4]
.endif
        mla             \dst1\().8h, \src6\().8h, v0.h[5]
        mla             \dst2\().8h, \src7\().8h, v0.h[5]
        mla             \tmp1\().8h, \src8\().8h, v0.h[7]
        mla             \tmp2\().8h, \src9\().8h, v0.h[7]
        mla             \dst1\().8h, \src7\().8h, v0.h[6]
        mla             \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
        mla             \tmp1\().8h, \src4\().8h, v0.h[3]
        mla             \tmp2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \tmp1\().8h, \src5\().8h, v0.h[4]
        mla             \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
        sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm
// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        ld1             {v1.8b}, [x2], x3
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
.ifnb \dst4
        ld1             {v4.8b}, [x2], x3
.endif
        uxtl            \dst1\().8h, v1.8b
        uxtl            \dst2\().8h, v2.8b
        uxtl            \dst3\().8h, v3.8b
.ifnb \dst4
        uxtl            \dst4\().8h, v4.8b
.endif
.endm
// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
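// The rows are kept in a rolling window of registers, v16-v27; each
// round of the loop below loads four new rows, and the convolve calls
// slide through the window, so rows never need to be moved between
// registers.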
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
1:
.ifc \type,avg
        mov             x7,  x0
.endif
        mov             x6,  x4

        loadl           v17, v18, v19

        loadl           v20, v21, v22, v23
2:
        loadl           v24, v25, v26, v27
        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.eq            8f

        loadl           v16, v17, v18, v19
        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.eq            8f

        loadl           v20, v21, v22, v23
        convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.ne            2b

8:
        subs            x5,  x5,  #8
        b.eq            9f
        // x0 -= h * dst_stride
        msub            x0,  x1,  x4,  x0
        // x2 -= h * src_stride
        msub            x2,  x3,  x4,  x2
        // x2 -= 8 * src_stride
        sub             x2,  x2,  x3, lsl #3
        // x2 += 1 * src_stride
        add             x2,  x2,  x3
        add             x2,  x2,  #8
        add             x0,  x0,  #8
        b               1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v avg, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 4, 3
// Instantiate a vertical filter function for filtering a 4 pixels wide
// slice. The first half of the registers contain one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; the output of v17-v24 into one, and v18-v25 into another one.
// The first half of the first output is the first output row, the first half
// of the other output is the second output row. The second halves of the
// registers are rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
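// Illustration of the lane layout (not part of the original comments):
//   v17.2s = { row1, row3 }    v18.2s = { row2, row4 }
//   v19.2s = { row3, row5 }    v20.2s = { row4, row6 }    ...
// so a single convolve over v17-v25 produces output rows 1 and 2 in
// the low halves, and rows 3 and 4 in the high halves, of its two
// output registers.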
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
.ifc \type,avg
        mov             x7,  x0
.endif
        ld1             {v0.8h},  [x6]

        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        ld1             {v3.s}[0],  [x2], x3
        ld1             {v4.s}[0],  [x2], x3
        ld1             {v5.s}[0],  [x2], x3
        ld1             {v6.s}[0],  [x2], x3
        trn1            v1.2s,  v1.2s,  v3.2s
        ld1             {v7.s}[0],  [x2], x3
        trn1            v2.2s,  v2.2s,  v4.2s
        ld1             {v26.s}[0], [x2], x3
        uxtl            v17.8h, v1.8b
        trn1            v3.2s,  v3.2s,  v5.2s
        ld1             {v27.s}[0], [x2], x3
        uxtl            v18.8h, v2.8b
        trn1            v4.2s,  v4.2s,  v6.2s
        ld1             {v28.s}[0], [x2], x3
        uxtl            v19.8h, v3.8b
        trn1            v5.2s,  v5.2s,  v7.2s
        ld1             {v29.s}[0], [x2], x3
        uxtl            v20.8h, v4.8b
        trn1            v6.2s,  v6.2s,  v26.2s
        uxtl            v21.8h, v5.8b
        trn1            v7.2s,  v7.2s,  v27.2s
        uxtl            v22.8h, v6.8b
        trn1            v26.2s, v26.2s, v28.2s
        uxtl            v23.8h, v7.8b
        trn1            v27.2s, v27.2s, v29.2s
        uxtl            v24.8h, v26.8b
        uxtl            v25.8h, v27.8b

        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
        do_store4       v1,  v2,  v5,  v6,  \type

        subs            x4,  x4,  #4
        b.eq            9f

        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        trn1            v28.2s, v28.2s, v1.2s
        trn1            v29.2s, v29.2s, v2.2s
        ld1             {v1.s}[1],  [x2], x3
        uxtl            v26.8h, v28.8b
        ld1             {v2.s}[1],  [x2], x3
        uxtl            v27.8h, v29.8b
        uxtl            v28.8h, v1.8b
        uxtl            v29.8h, v2.8b

        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
        do_store4       v1,  v2,  v5,  v6,  \type

9:
        ret
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v avg, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 4, 3
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        cmp             w6,  #8
        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
        add             x6,  x5,  w6, uxtw #4
        mov             x5,  #\size
.if \size >= 8
        b.ge            \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        b.ge            \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm
.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4