/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h, \r0\().8h, \r1\().8h
        trn2            \t5\().8h, \r0\().8h, \r1\().8h
        trn1            \t6\().8h, \r2\().8h, \r3\().8h
        trn2            \t7\().8h, \r2\().8h, \r3\().8h

        trn1            \r0\().4s, \t4\().4s, \t6\().4s
        trn2            \r2\().4s, \t4\().4s, \t6\().4s
        trn1            \r1\().4s, \t5\().4s, \t7\().4s
        trn2            \r3\().4s, \t5\().4s, \t7\().4s
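
        // As a sketch of the data movement here: the trn1/trn2 .8h pairs
        // interleave 16-bit elements of adjacent rows, and the .4s pass then
        // swaps 32-bit pairs, so the four 8-element rows r0-r3 come out
        // transposed as two 4x4 blocks:
        //   in:  r0 = a0 a1 a2 a3 a4 a5 a6 a7   out: r0 = a0 b0 c0 d0 a4 b4 c4 d4
        //        r1 = b0 b1 b2 b3 b4 b5 b6 b7        r1 = a1 b1 c1 d1 a5 b5 c5 d5
        //        r2 = c0 c1 c2 c3 c4 c5 c6 c7        r2 = a2 b2 c2 d2 a6 b6 c6 d6
        //        r3 = d0 d1 d2 d3 d4 d5 d6 d7        r3 = a3 b3 c3 d3 a7 b7 c7 d7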

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        uabd            v4.8h,  v20.8h, v21.8h  // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h  // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h  // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h  // abs(q0 - q1)
        uabd            \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h  // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h   // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h  // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
        cmhs            v4.8h,  v2.8h,  v4.8h   // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h   // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b  // fm
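
        // In scalar form, per the annotations above, the filter-enable mask is
        //   fm = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= I
        //        && |p0-q0| * 2 + (|p1-q1| >> 1) <= E
        // with I in v2 and E in v0, both prescaled for the bit depth by the
        // frontends below.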
        // If no pixels need filtering, just exit as soon as possible
        uabd            v6.8h,  v20.8h, v23.8h  // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h  // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h  // abs(p1 - p0)
        uabd            \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h, \tmp2\().8h, \tmp3\().8h

        uabd            v7.8h,  v16.8h, v23.8h  // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h  // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h  // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h   // flat8in
        uabd            v8.8h,  v19.8h, v23.8h  // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b  // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h  // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b  // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h  // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h  // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h  // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
        // The rest of the calculation of flat8in is interleaved below

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h,  v22.8h, v23.8h  // abs(p1 - p0)
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v1.8h,  v25.8h, v24.8h  // abs(q1 - q0)
        umax            v7.8h,  v7.8h,  v9.8h
        umax            v6.8h,  v6.8h,  \tmp2\().8h

        dup             \tmp2\().8h, w6         // left shift for saturation
        sub             \tmp1\().8h, v22.8h, v25.8h // p1 - q1
        neg             \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h   // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h, v24.8h, v23.8h // q0 - p0
        cmhs            v6.8h,  v0.8h,  v6.8h   // flat8in
        cmhs            v5.8h,  v3.8h,  v5.8h   // !hev
        and             v6.16b, v6.16b, v4.16b  // flat8in && fm
        sqshl           \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        cmhs            v7.8h,  v0.8h,  v7.8h   // flat8out
        bic             v4.16b, v4.16b, v6.16b  // fm && !flat8in
        and             v5.16b, v5.16b, v4.16b  // !hev && fm && !flat8in
        and             v7.16b, v7.16b, v6.16b  // flat8out && flat8in && fm
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
        mul             \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
        add             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        sqshl           \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h, w7         // max pixel value
        bic             v6.16b, v6.16b, v7.16b  // fm && flat8in && !flat8out
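
        // The four decision masks derived above, per their annotations:
        //   v4 = fm && !flat8in             (inner 4 pixel filter)
        //   v5 = !hev && fm && !flat8in     (inner filter, p1/q1 update)
        //   v6 = fm && flat8in && !flat8out (7-tap flat8 filter)
        //   v7 = fm && flat8in && flat8out  (15-tap filter, wd=16 only)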
        ushr            \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1
        add             \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
        add             \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
        smin            \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h, \tmp3\().8h, #3 // f1
        sshr            \tmp4\().8h, \tmp4\().8h, #3 // f2

        add             v0.8h,  v23.8h, \tmp4\().8h // p0 + f2
        sub             v2.8h,  v24.8h, \tmp3\().8h // q0 - f1
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
        smax            v0.8h,  v0.8h,  \tmp5\().8h // out p0
        smax            v2.8h,  v2.8h,  \tmp5\().8h // out q0
        bit             v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h,  v22.8h, \tmp3\().8h // p1 + f
        sub             v2.8h,  v25.8h, \tmp3\().8h // q1 - f
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        smax            v0.8h,  v0.8h,  \tmp5\().8h // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h // out q1
        bit             v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b
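
        // As a scalar sketch of the inner filter implemented above (following
        // the annotations; clip() is the av_clip_int2p(x, BIT_DEPTH - 1)
        // saturation done with the sqshl/sshl pairs, and clamp() is the
        // smin/smax pair against the max pixel value in \tmp6 and the lower
        // bound held in \tmp5):
        //   f   = clip(3 * (q0 - p0) + (hev ? clip(p1 - q1) : 0))
        //   f1  = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        //   f2  = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        //   p0' = clamp(p0 + f2),  q0' = clamp(q0 - f1)
        //   if (!hev) { f = (f1 + 1) >> 1; p1' = clamp(p1 + f); q1' = clamp(q1 - f); }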
        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3      // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h, v20.8h, v23.8h
        add             \tmp3\().8h, v24.8h, v27.8h
        urshr           v3.8h,  v0.8h,  #3      // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h, v24.8h
        add             \tmp7\().8h, v25.8h, v27.8h
        urshr           v4.8h,  v0.8h,  #3      // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h, v25.8h
        add             \tmp3\().8h, v26.8h, v27.8h
        urshr           v5.8h,  v0.8h,  #3      // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h, #3  // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b, v6.16b
        bit             v22.16b, v3.16b, v6.16b
        bit             v23.16b, v4.16b, v6.16b
        urshr           \tmp6\().8h, v0.8h, #3  // out q2
        bit             v24.16b, v5.16b, v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
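
        // As computed above, this is the standard VP9 flat8 filter; in scalar
        // form the first tap is
        //   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        // and each following output reuses the running sum in v0, subtracting
        // the pair that falls out of the 7-tap window and adding the one that
        // enters it (e.g. p1' drops one p3 + p2 and adds p1 + q1).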
        orr             v2.16b, v6.16b, v7.16b

        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels

        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels

        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl             v0.8h,  v16.8h, #3      // 8 * v16
        sub             v0.8h,  v0.8h,  v16.8h  // 7 * v16
        add             v0.8h,  v0.8h,  v17.8h
        add             v8.8h,  v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h,  v0.8h,  v8.8h
        add             v8.8h,  v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h,  v0.8h,  v12.8h
        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v19.8h
        add             v10.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v2.16b, v17.16b, v7.16b
        urshr           v3.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v20.8h
        add             v14.8h, v21.8h, v28.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v3.16b, v18.16b, v7.16b
        urshr           v4.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v21.8h
        add             v10.8h, v22.8h, v29.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v4.16b, v19.16b, v7.16b
        urshr           v5.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v22.8h
        add             v14.8h, v23.8h, v30.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v5.16b, v20.16b, v7.16b
        urshr           v6.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v16.8h, v23.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v24.8h, v31.8h
        bif             v6.16b, v21.16b, v7.16b
        urshr           v8.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        sub             v10.8h, v12.8h, v10.8h
        add             v12.8h, v17.8h, v24.8h
        add             v14.8h, v25.8h, v31.8h
        bif             v8.16b, v22.16b, v7.16b
        urshr           v9.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v26.8h, v31.8h
        bif             v9.16b, v23.16b, v7.16b
        urshr           v10.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v18.8h, v25.8h
        add             v18.8h, v19.8h, v26.8h
        sub             v12.8h, v12.8h, v14.8h
        add             v14.8h, v27.8h, v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v12.8h
        add             v12.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v18.8h
        add             v18.8h, v28.8h, v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h, v18.8h, v12.8h
        urshr           v12.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v21.8h, v28.8h
        add             v20.8h, v29.8h, v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v18.8h
        sub             v20.8h, v20.8h, v14.8h
        add             v18.8h, v22.8h, v29.8h
        add             v22.8h, v30.8h, v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v20.8h
        sub             v22.8h, v22.8h, v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h, v0.8h,  #4
        bif             v17.16b, v30.16b, v7.16b
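
        // This is the wd=16 filter, computed with the same sliding-sum
        // approach as the flat8 part; in scalar form the first tap is
        //   p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // and each later output updates the 15-tap sum in v0 by subtracting
        // the outgoing pair and adding the incoming one before the next
        // urshr #4.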

// For wd <= 8, we use v16-v19 and v28-v31 as temp registers,
// while for wd=16 we need those for inputs/outputs and use v8-v15
// as temp registers instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        // calculate alternative 'return' targets

.macro loop_filter_16
        // calculate alternative 'return' targets
        bl              vp9_loop_filter_16

// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
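//
// The bpp_frontend macros below adapt these 8 bpp style parameters to
// 10/12 bpp: as seen in the lsl/mov sequences, the E/I/H thresholds in
// w2-w4 are shifted left by bpp - 8, x5 is loaded with 1 << (bpp - 8) and
// x7 with the max pixel value (1 << bpp) - 1, before calling the shared
// 16 bpp filter cores.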

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x7,  #((1 << \bpp) - 1)

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon

        add             x0,  x0,  x1,  lsl #3

        bl              \func\()_\int_suffix\()_16_neon

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1

        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon

        add             x0,  x0,  x1,  lsl #3

        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12

function vp9_loop_filter_v_4_8_16_neon
        sub             x9,  x0,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1      // p3
        ld1             {v24.8h}, [x0], x1      // q0
        ld1             {v21.8h}, [x9], x1      // p2
        ld1             {v25.8h}, [x0], x1      // q1
        ld1             {v22.8h}, [x9], x1      // p1
        ld1             {v26.8h}, [x0], x1      // q2
        ld1             {v23.8h}, [x9], x1      // p0
        ld1             {v27.8h}, [x0], x1      // q3
        sub             x0,  x0,  x1,  lsl #2
        sub             x9,  x9,  x1,  lsl #1

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        add             x0,  x9,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #3
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x0,  x9,  x1,  lsl #2

        // We will only write back the mid 4 pixels; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
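        // After that transpose, each register holds two output rows of 4
        // pixels each, so the st1 {}.d[0]/[1] stores below write one 64-bit
        // half (four 16-bit pixels) per row: x9 covers the first four rows
        // and x0 (x9 + 4 * stride) the last four.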
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1,  lsl #3

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        sub             x9,  x0,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1      // p3
        ld1             {v24.8h}, [x0], x1      // q0
        ld1             {v21.8h}, [x9], x1      // p2
        ld1             {v25.8h}, [x0], x1      // q1
        ld1             {v22.8h}, [x9], x1      // p1
        ld1             {v26.8h}, [x0], x1      // q2
        ld1             {v23.8h}, [x9], x1      // p0
        ld1             {v27.8h}, [x0], x1      // q3
        sub             x9,  x9,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #2

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

        sub             x9,  x0,  x1,  lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        add             x0,  x9,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #3

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        add             x0,  x9,  x1,  lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x0,  x9,  x1,  lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1,  lsl #3

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        sub             x9,  x0,  x1,  lsl #3
        ld1             {v16.8h}, [x9], x1      // p7
        ld1             {v24.8h}, [x0], x1      // q0
        ld1             {v17.8h}, [x9], x1      // p6
        ld1             {v25.8h}, [x0], x1      // q1
        ld1             {v18.8h}, [x9], x1      // p5
        ld1             {v26.8h}, [x0], x1      // q2
        ld1             {v19.8h}, [x9], x1      // p4
        ld1             {v27.8h}, [x0], x1      // q3
        ld1             {v20.8h}, [x9], x1      // p3
        ld1             {v28.8h}, [x0], x1      // q4
        ld1             {v21.8h}, [x9], x1      // p2
        ld1             {v29.8h}, [x0], x1      // q5
        ld1             {v22.8h}, [x9], x1      // p1
        ld1             {v30.8h}, [x0], x1      // q6
        ld1             {v23.8h}, [x9], x1      // p0
        ld1             {v31.8h}, [x0], x1      // q7
        sub             x9,  x9,  x1,  lsl #3
        sub             x0,  x0,  x1,  lsl #3

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        add             x9,  x9,  x1,  lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers v21-v26.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

        sub             x9,  x0,  x1,  lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3
        sub             x9,  x9,  x1,  lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        // The same writeback as in loop_filter_h_8_8
        add             x0,  x9,  x1,  lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        // The same writeback as in loop_filter_h_4_8
        add             x0,  x9,  x1,  lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1,  lsl #3

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1