git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9itxfm_neon.S

   1 /*
   2  * Copyright (c) 2016 Google Inc.
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/aarch64/asm.S"
  22 #include "neon.S"
  23
  24 const itxfm4_coeffs, align=4
  25         .short  11585, 6270, 15137, 0 // read as v0.h[0..2] by idct4; the 0 pads the row to four halfwords
  26 iadst4_coeffs: // kept adjacent to the row above so a single .8h load fetches both rows
  27         .short  5283, 15212, 9929, 13377 // read as v0.h[4..7] by iadst4
  28 endconst
  29
  30 const iadst8_coeffs, align=4
  31         .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 // loaded into v1 by the 8x8 functions other than idct_idct
  32 idct_coeffs: // directly follows the iadst8 row; the 8x8 loader reaches it with a post-incremented load
  33         .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606 // first row: loaded into v0 by the 8x8 and 16x16 DC code
  34         .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0 // NOTE(review): presumably the v1 contents idct16 expects - the load is not in this part of the file
  35         .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 // NOTE(review): this row and the next are not referenced by the code visible here; presumably for larger transforms - confirm
  36         .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
  37 endconst
  38
  39 const iadst16_coeffs, align=4
  40         .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760 // NOTE(review): iadst16 loads 16 halfwords into v0/v1 through x11; presumably x11 points here - set up by a caller not shown
  41         .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
  42 endconst
  43
  44 // out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14
  45 // out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14
  46 // in/out are .8h registers; this can do with 4 temp registers, but is
  47 // more efficient if 6 temp registers are available.
     // With neg > 0, out1 is multiplied by -v0.h[0] instead (out2 is unchanged).
     // When tmp5 is left blank, tmp3/tmp4 are reused for the second product.
  48 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
  49 .if \neg > 0
  50         neg             \tmp4\().4h, v0.4h // tmp4.h[0] = -v0.h[0]
  51 .endif
  52         add             \tmp1\().8h, \in1\().8h,  \in2\().8h
  53         sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
  54 .if \neg > 0
  55         smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
  56         smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0] // overwrites the negated coefficient after its last use
  57 .else
  58         smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
  59         smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
  60 .endif
  61 .ifb \tmp5
             // 4-temp variant: narrow out1 first so tmp3/tmp4 can hold the second product
  62         rshrn           \out1\().4h, \tmp3\().4s, #14
  63         rshrn2          \out1\().8h, \tmp4\().4s, #14
  64         smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
  65         smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
  66         rshrn           \out2\().4h, \tmp3\().4s, #14
  67         rshrn2          \out2\().8h, \tmp4\().4s, #14
  68 .else
             // 6-temp variant: both products are formed before either result is narrowed
  69         smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
  70         smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
  71         rshrn           \out1\().4h, \tmp3\().4s, #14
  72         rshrn2          \out1\().8h, \tmp4\().4s, #14
  73         rshrn           \out2\().4h, \tmp5\().4s, #14
  74         rshrn2          \out2\().8h, \tmp6\().4s, #14
  75 .endif
  76 .endm
  77
  78 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
  79 // writing the same output into both out1 and out2.
     // out1 = out2 = (in1 * v0.h[0] + (1 << 13)) >> 14
     // in2 and tmp3-tmp6 are accepted but not referenced by the body.
  80 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
  81         smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
  82         smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
  83         rshrn           \out1\().4h,  \tmp1\().4s, #14
  84         rshrn2          \out1\().8h,  \tmp2\().4s, #14
  85         rshrn           \out2\().4h,  \tmp1\().4s, #14
  86         rshrn2          \out2\().8h,  \tmp2\().4s, #14
  87 .endm
  88
  89 // out1,out2 = in1 * coef1 - in2 * coef2
  90 // out3,out4 = in1 * coef2 + in2 * coef1
  91 // out are 4 x .4s registers, in are 2 x .8h registers
     // (out1/out3 receive the low four lanes, out2/out4 the high four).
     // The results stay 32 bit wide; no rounding or narrowing is done here.
  92 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
  93         smull           \out1\().4s, \in1\().4h, \coef1
  94         smull2          \out2\().4s, \in1\().8h, \coef1
  95         smull           \out3\().4s, \in1\().4h, \coef2
  96         smull2          \out4\().4s, \in1\().8h, \coef2
  97         smlsl           \out1\().4s, \in2\().4h, \coef2
  98         smlsl2          \out2\().4s, \in2\().8h, \coef2
  99         smlal           \out3\().4s, \in2\().4h, \coef1
 100         smlal2          \out4\().4s, \in2\().8h, \coef1
 101 .endm
 102
 103 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
 104 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
 105 // inout are 2 x .8h registers
     // Both results are computed from the original inputs (via dmbutterfly_l)
     // before either register is overwritten.
     // With neg > 0, the 32 bit value for inout2 is negated before rounding.
 106 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
 107         dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
 108 .if \neg > 0
 109         neg             \tmp3\().4s, \tmp3\().4s
 110         neg             \tmp4\().4s, \tmp4\().4s
 111 .endif
 112         rshrn           \inout1\().4h, \tmp1\().4s,  #14
 113         rshrn2          \inout1\().8h, \tmp2\().4s,  #14
 114         rshrn           \inout2\().4h, \tmp3\().4s,  #14
 115         rshrn2          \inout2\().8h, \tmp4\().4s,  #14
 116 .endm
 117
 118 // Same as dmbutterfly above, but treating the input in inout2 as zero
     // inout1 = (inout1 * coef1 + (1 << 13)) >> 14
     // inout2 = (inout1 * coef2 + (1 << 13)) >> 14   (original inout1)
     // The incoming value of inout2 is never read.
 119 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
 120         smull           \tmp1\().4s, \inout1\().4h, \coef1
 121         smull2          \tmp2\().4s, \inout1\().8h, \coef1
 122         smull           \tmp3\().4s, \inout1\().4h, \coef2
 123         smull2          \tmp4\().4s, \inout1\().8h, \coef2
 124         rshrn           \inout1\().4h, \tmp1\().4s, #14
 125         rshrn2          \inout1\().8h, \tmp2\().4s, #14
 126         rshrn           \inout2\().4h, \tmp3\().4s, #14
 127         rshrn2          \inout2\().8h, \tmp4\().4s, #14
 128 .endm
 129
 130 // Same as dmbutterfly above, but treating the input in inout1 as zero
     // inout1 = (-(inout2 * coef2) + (1 << 13)) >> 14   (original inout2)
     // inout2 = (inout2 * coef1 + (1 << 13)) >> 14
     // The incoming value of inout1 is never read.
 131 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
 132         smull           \tmp1\().4s, \inout2\().4h, \coef2
 133         smull2          \tmp2\().4s, \inout2\().8h, \coef2
 134         smull           \tmp3\().4s, \inout2\().4h, \coef1
 135         smull2          \tmp4\().4s, \inout2\().8h, \coef1
 136         neg             \tmp1\().4s, \tmp1\().4s
 137         neg             \tmp2\().4s, \tmp2\().4s
 138         rshrn           \inout2\().4h, \tmp3\().4s, #14
 139         rshrn2          \inout2\().8h, \tmp4\().4s, #14
 140         rshrn           \inout1\().4h, \tmp1\().4s, #14
 141         rshrn2          \inout1\().8h, \tmp2\().4s, #14
 142 .endm
 143
 144 .macro dsmull_h out1, out2, in, coef
             // Widening multiply of all eight halfwords of in (.8h) by coef:
             // out1 (.4s) gets the low four products, out2 (.4s) the high four.
 145         smull           \out1\().4s, \in\().4h, \coef
 146         smull2          \out2\().4s, \in\().8h, \coef
 147 .endm
 148
 149 .macro drshrn_h out, in1, in2, shift
             // Rounding shift right and narrow two .4s registers into one .8h register:
             // in1 fills the low half of out, in2 the high half.
 150         rshrn           \out\().4h, \in1\().4s, \shift
 151         rshrn2          \out\().8h, \in2\().4s, \shift
 152 .endm
 153
 154
 155 // out1 = in1 + in2
 156 // out2 = in1 - in2
     // Element-wise on .8h registers. out1 is written first, so it must not be
     // the same register as in1 or in2; out2 may alias either input.
 157 .macro butterfly_8h out1, out2, in1, in2
 158         add             \out1\().8h, \in1\().8h, \in2\().8h
 159         sub             \out2\().8h, \in1\().8h, \in2\().8h
 160 .endm
 161
 162 // out1 = in1 - in2
 163 // out2 = in1 + in2
     // Element-wise on .8h registers. out1 is written first, so it must not be
     // the same register as in1 or in2; out2 may alias either input.
 164 .macro butterfly_8h_r out1, out2, in1, in2
 165         sub             \out1\().8h, \in1\().8h, \in2\().8h
 166         add             \out2\().8h, \in1\().8h, \in2\().8h
 167 .endm
 168
 169 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
 170 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
 171 // out are 2 x .8h registers, in are 4 x .4s registers
     // (in1/in3 hold the low four lanes, in2/in4 the high four).
     // All sums and differences go to the tmp registers before out1/out2 are
     // written, so the outputs may reuse input registers.
 172 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
 173         add             \tmp1\().4s, \in1\().4s, \in3\().4s
 174         add             \tmp2\().4s, \in2\().4s, \in4\().4s
 175         sub             \tmp3\().4s, \in1\().4s, \in3\().4s
 176         sub             \tmp4\().4s, \in2\().4s, \in4\().4s
 177         rshrn           \out1\().4h, \tmp1\().4s,  #14
 178         rshrn2          \out1\().8h, \tmp2\().4s,  #14
 179         rshrn           \out2\().4h, \tmp3\().4s,  #14
 180         rshrn2          \out2\().8h, \tmp4\().4s,  #14
 181 .endm
 182
 183 .macro iwht4 c0, c1, c2, c3
             // One 1-D iwht pass over four .4h vectors, in place on c0-c3.
             // Uses only add/sub and one arithmetic shift; clobbers v16 and v17.
 184         add             \c0\().4h, \c0\().4h, \c1\().4h
 185         sub             v17.4h,    \c2\().4h, \c3\().4h
 186         sub             v16.4h,    \c0\().4h, v17.4h
 187         sshr            v16.4h,    v16.4h,    #1 // v16 = ((c0 + c1) - (c2 - c3)) >> 1
 188         sub             \c2\().4h, v16.4h,    \c1\().4h
 189         sub             \c1\().4h, v16.4h,    \c3\().4h
 190         add             \c3\().4h, v17.4h,    \c2\().4h
 191         sub             \c0\().4h, \c0\().4h, \c1\().4h
 192 .endm
 193
 194 .macro idct4 c0, c1, c2, c3
             // One 1-D idct4 pass over four .4h vectors, in place on c0-c3.
             // Reads v0.h[0..2]; clobbers v16-v20 and v22.
 195         smull           v22.4s,    \c1\().4h, v0.h[2]
 196         smull           v20.4s,    \c1\().4h, v0.h[1]
 197         add             v16.4h,    \c0\().4h, \c2\().4h
 198         sub             v17.4h,    \c0\().4h, \c2\().4h
 199         smlal           v22.4s,    \c3\().4h, v0.h[1] // v22 = c1 * v0.h[2] + c3 * v0.h[1]
 200         smull           v18.4s,    v16.4h,    v0.h[0] // v18 = (c0 + c2) * v0.h[0]
 201         smull           v19.4s,    v17.4h,    v0.h[0] // v19 = (c0 - c2) * v0.h[0]
 202         smlsl           v20.4s,    \c3\().4h, v0.h[2] // v20 = c1 * v0.h[1] - c3 * v0.h[2]
 203         rshrn           v22.4h,    v22.4s,    #14
 204         rshrn           v18.4h,    v18.4s,    #14
 205         rshrn           v19.4h,    v19.4s,    #14
 206         rshrn           v20.4h,    v20.4s,    #14
 207         add             \c0\().4h, v18.4h,    v22.4h
 208         sub             \c3\().4h, v18.4h,    v22.4h
 209         add             \c1\().4h, v19.4h,    v20.4h
 210         sub             \c2\().4h, v19.4h,    v20.4h
 211 .endm
 212
 213 .macro iadst4 c0, c1, c2, c3
             // One 1-D iadst4 pass over four .4h vectors, in place on c0-c3.
             // Reads v0.h[4..7]; clobbers v16-v21.
 214         smull           v16.4s,    \c0\().4h, v0.h[4]
 215         smlal           v16.4s,    \c2\().4h, v0.h[5]
 216         smlal           v16.4s,    \c3\().4h, v0.h[6] // v16 = c0 * h[4] + c2 * h[5] + c3 * h[6]
 217         smull           v17.4s,    \c0\().4h, v0.h[6]
 218         smlsl           v17.4s,    \c2\().4h, v0.h[4]
 219         sub             \c0\().4h, \c0\().4h, \c2\().4h
 220         smlsl           v17.4s,    \c3\().4h, v0.h[5] // v17 = c0 * h[6] - c2 * h[4] - c3 * h[5]
 221         add             \c0\().4h, \c0\().4h, \c3\().4h // c0 = c0 - c2 + c3 (16 bit)
 222         smull           v19.4s,    \c1\().4h, v0.h[7] // v19 = c1 * h[7]
 223         smull           v18.4s,    \c0\().4h, v0.h[7] // v18 = (c0 - c2 + c3) * h[7]
 224         add             v20.4s,    v16.4s,    v19.4s
 225         add             v21.4s,    v17.4s,    v19.4s
 226         rshrn           \c0\().4h, v20.4s,    #14 // c0 = round(v16 + v19)
 227         add             v16.4s,    v16.4s,    v17.4s
 228         rshrn           \c1\().4h, v21.4s,     #14 // c1 = round(v17 + v19)
 229         sub             v16.4s,    v16.4s,    v19.4s
 230         rshrn           \c2\().4h, v18.4s,    #14 // c2 = round(v18)
 231         rshrn           \c3\().4h, v16.4s,    #14 // c3 = round(v16 + v17 - v19), using the v16/v17 values from above
 232 .endm
 233
 234 // The public functions in this file have got the following signature:
 235 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 236
 237 .macro itxfm_func4x4 txfm1, txfm2
             // Emits one exported 4x4 inverse transform + add function: txfm1 is run
             // first, the 4x4 block is transposed, then txfm2 is run.
             // Arguments follow the signature comment above:
             // x0 = dst, x1 = stride, x2 = block, w3 = eob.
 238 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
 239 .ifc \txfm1,\txfm2
             // Same transform in both passes: load only the half of v0 it reads
             // (idct: v0.h[0..3], iadst: v0.h[4..7], iwht: nothing).
 240 .ifc \txfm1,idct
 241         movrel          x4,  itxfm4_coeffs
 242         ld1             {v0.4h}, [x4]
 243 .endif
 244 .ifc \txfm1,iadst
 245         movrel          x4,  iadst4_coeffs
 246         ld1             {v0.d}[1], [x4]
 247 .endif
 248 .else
             // Mixed transforms: load both coefficient rows at once
 249         movrel          x4,  itxfm4_coeffs
 250         ld1             {v0.8h}, [x4]
 251 .endif
 252
 253         movi            v31.8h, #0 // zero vector, stored back over the coefficients as they are consumed
 254 .ifc \txfm1\()_\txfm2,idct_idct
 255         cmp             w3,  #1
 256         b.ne            1f
 257         // DC-only for idct/idct
             // The DC value is scaled by v0.h[0] twice (rounding >> 14 each time),
             // once per pass, then broadcast to all four rows.
 258         ld1r            {v2.4h},  [x2]
 259         smull           v2.4s,  v2.4h, v0.h[0]
 260         rshrn           v2.4h,  v2.4s, #14
 261         smull           v2.4s,  v2.4h, v0.h[0]
 262         rshrn           v2.4h,  v2.4s, #14
 263         st1             {v31.h}[0], [x2] // clear the DC coefficient
 264         dup             v4.4h,  v2.h[0]
 265         mov             v5.16b, v4.16b
 266         mov             v6.16b, v4.16b
 267         mov             v7.16b, v4.16b
 268         b               2f
 269 .endif
 270
 271 1:
 272         ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2] // all 16 coefficients (32 bytes)
 273         st1             {v31.8h}, [x2], #16 // clear the first 16 bytes of the block
 274
 275 .ifc \txfm1,iwht
             // iwht input is shifted down by 2 before the first pass
 276         sshr            v4.4h,  v4.4h,  #2
 277         sshr            v5.4h,  v5.4h,  #2
 278         sshr            v6.4h,  v6.4h,  #2
 279         sshr            v7.4h,  v7.4h,  #2
 280 .endif
 281
 282         \txfm1\()4      v4,  v5,  v6,  v7
 283
 284         st1             {v31.8h}, [x2], #16 // clear the remaining 16 bytes of the block
 285         // Transpose 4x4 with 16 bit elements
 286         transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
 287
 288         \txfm2\()4      v4,  v5,  v6,  v7
 289 2:
             // Add the residual in v4-v7 to four rows of four pixels at dst.
             // Each ld1r fetches one 32 bit group of pixels and advances x0 by the stride.
 290         ld1r            {v0.2s},   [x0], x1
 291         ld1r            {v1.2s},   [x0], x1
 292 .ifnc \txfm1,iwht
             // Final rounding shift; not applied for iwht
 293         srshr           v4.4h,  v4.4h,  #4
 294         srshr           v5.4h,  v5.4h,  #4
 295         srshr           v6.4h,  v6.4h,  #4
 296         srshr           v7.4h,  v7.4h,  #4
 297 .endif
 298         uaddw           v4.8h,  v4.8h,  v0.8b // widen the pixels to 16 bit and add the residual
 299         uaddw           v5.8h,  v5.8h,  v1.8b
 300         ld1r            {v2.2s},   [x0], x1
 301         ld1r            {v3.2s},   [x0], x1
 302         sqxtun          v0.8b,  v4.8h // saturate back to unsigned 8 bit
 303         sqxtun          v1.8b,  v5.8h
 304         sub             x0,  x0,  x1, lsl #2 // rewind dst by the four rows just read
 305
 306         uaddw           v6.8h,  v6.8h,  v2.8b
 307         uaddw           v7.8h,  v7.8h,  v3.8b
 308         st1             {v0.s}[0],  [x0], x1 // only the low four bytes of each row are stored
 309         sqxtun          v2.8b,  v6.8h
 310         sqxtun          v3.8b,  v7.8h
 311
 312         st1             {v1.s}[0],  [x0], x1
 313         st1             {v2.s}[0],  [x0], x1
 314         st1             {v3.s}[0],  [x0], x1
 315
 316         ret
 317 endfunc
 318 .endm
 319
 320 itxfm_func4x4 idct,  idct
 321 itxfm_func4x4 iadst, idct
 322 itxfm_func4x4 idct,  iadst
 323 itxfm_func4x4 iadst, iadst
 324 itxfm_func4x4 iwht,  iwht
 325
 326
 327 .macro idct8
             // One 1-D idct8 pass over v16-v23 (.8h), in place.
             // Reads v0.h[0..6]; clobbers v2-v7 and v24-v31.
 328         dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
 329         dmbutterfly     v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
 330         dmbutterfly     v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
 331         dmbutterfly     v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
 332
 333         butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
 334         butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
 335         butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
 336         butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
 337
 338         dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
 339
 340         butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
 341         butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
 342         butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
 343         butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
 344 .endm
 345
 346 .macro iadst8
             // One 1-D iadst8 pass over v16-v23 (.8h), in place.
             // Reads v1.h[0..7] and v0.h[0..2]; clobbers v2-v7 and v24-v31.
             // Several outputs are first produced negated and then fixed up with neg.
 347         dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
 348         dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
 349         dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
 350         dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
 351
 352         dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
 353         dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
 354         dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
 355         dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
 356
 357         butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
 358         butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
 359         neg             v23.8h,   v23.8h  // v23 = out[7]
 360
 361         dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
 362         neg             v19.8h,   v19.8h  // v19 = out[3]
 363
 364         dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[1], v0.h[2]   // v26,v27 = t5a, v28,v29 = t4a
 365         dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[2], v0.h[1]   // v2,v3   = t6a, v4,v5   = t7a
 366
 367         dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
 368         dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
 369         neg             v17.8h,   v17.8h  // v17 = out[1]
 370
 371         dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
 372         neg             v21.8h,   v21.8h  // v21 = out[5]
 373 .endm
 374
 375
 376 .macro itxfm_func8x8 txfm1, txfm2
             // Emits one exported 8x8 inverse transform + add function: txfm1 is run
             // first, the 8x8 block is transposed, then txfm2 is run.
             // Arguments follow the public signature documented earlier in this file:
             // x0 = dst, x1 = stride, x2 = block, w3 = eob.
 377 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
 378         // The iadst also uses a few coefficients from
 379         // idct, so those always need to be loaded.
 380 .ifc \txfm1\()_\txfm2,idct_idct
 381         movrel          x4,  idct_coeffs
 382         ld1             {v0.8h}, [x4]
 383 .else
 384         movrel          x4, iadst8_coeffs
 385         ld1             {v1.8h}, [x4], #16 // v1 = iadst8 row; x4 now points at idct_coeffs
 386         ld1             {v0.8h}, [x4]
 387 .endif
 388
             // v2-v5 = 64 bytes of zeros, used below to clear the coefficient block
 389         movi            v2.16b, #0
 390         movi            v3.16b, #0
 391         movi            v4.16b, #0
 392         movi            v5.16b, #0
 393
 394 .ifc \txfm1\()_\txfm2,idct_idct
 395         cmp             w3,  #1
 396         b.ne            1f
 397         // DC-only for idct/idct
             // The DC value is scaled by v0.h[0] twice (rounding >> 14 each time),
             // once per pass, then broadcast to all eight rows.
 398         ld1r            {v2.4h},  [x2]
 399         smull           v2.4s,  v2.4h, v0.h[0]
 400         rshrn           v2.4h,  v2.4s, #14
 401         smull           v2.4s,  v2.4h, v0.h[0]
 402         rshrn           v2.4h,  v2.4s, #14
 403         st1             {v3.h}[0],  [x2] // clear the DC coefficient (v3 is still zero, v2 is not)
 404         dup             v16.8h,  v2.h[0]
 405         mov             v17.16b, v16.16b
 406         mov             v18.16b, v16.16b
 407         mov             v19.16b, v16.16b
 408         mov             v20.16b, v16.16b
 409         mov             v21.16b, v16.16b
 410         mov             v22.16b, v16.16b
 411         mov             v23.16b, v16.16b
 412         b               2f
 413 .endif
 414 1:
             // Load all 64 coefficients (128 bytes), then overwrite them with zeros
 415         ld1             {v16.16b,v17.16b,v18.16b,v19.16b},  [x2], #64
 416         ld1             {v20.16b,v21.16b,v22.16b,v23.16b},  [x2], #64
 417         sub             x2,  x2,  #128
 418         st1             {v2.16b,v3.16b,v4.16b,v5.16b},  [x2], #64
 419         st1             {v2.16b,v3.16b,v4.16b,v5.16b},  [x2], #64
 420
 421         \txfm1\()8
 422
 423         // Transpose 8x8 with 16 bit elements
 424         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 425
 426         \txfm2\()8
 427 2:
 428         mov             x3,  x0 // x3 = store pointer; x0 keeps advancing with the loads
 429         // Add into the destination
             // Per row: load 8 pixels, round the residual (>> 5), widen and add,
             // saturate to unsigned 8 bit, store. Loads, arithmetic and stores
             // of different rows are interleaved.
 430         ld1             {v0.8b},  [x0], x1
 431         srshr           v16.8h, v16.8h, #5
 432         ld1             {v1.8b},  [x0], x1
 433         srshr           v17.8h, v17.8h, #5
 434         ld1             {v2.8b},  [x0], x1
 435         srshr           v18.8h, v18.8h, #5
 436         uaddw           v16.8h, v16.8h, v0.8b
 437         ld1             {v3.8b},  [x0], x1
 438         srshr           v19.8h, v19.8h, #5
 439         uaddw           v17.8h, v17.8h, v1.8b
 440         ld1             {v4.8b},  [x0], x1
 441         srshr           v20.8h, v20.8h, #5
 442         uaddw           v18.8h, v18.8h, v2.8b
 443         sqxtun          v0.8b,  v16.8h
 444         ld1             {v5.8b},  [x0], x1
 445         srshr           v21.8h, v21.8h, #5
 446         uaddw           v19.8h, v19.8h, v3.8b
 447         sqxtun          v1.8b,  v17.8h
 448         ld1             {v6.8b},  [x0], x1
 449         srshr           v22.8h, v22.8h, #5
 450         uaddw           v20.8h, v20.8h, v4.8b
 451         sqxtun          v2.8b,  v18.8h
 452         ld1             {v7.8b},  [x0], x1
 453         srshr           v23.8h, v23.8h, #5
 454         uaddw           v21.8h, v21.8h, v5.8b
 455         sqxtun          v3.8b,  v19.8h
 456
 457         st1             {v0.8b},  [x3], x1
 458         uaddw           v22.8h, v22.8h, v6.8b
 459         st1             {v1.8b},  [x3], x1
 460         sqxtun          v4.8b,  v20.8h
 461         st1             {v2.8b},  [x3], x1
 462         uaddw           v23.8h, v23.8h, v7.8b
 463         st1             {v3.8b},  [x3], x1
 464         sqxtun          v5.8b,  v21.8h
 465         st1             {v4.8b},  [x3], x1
 466         sqxtun          v6.8b,  v22.8h
 467         st1             {v5.8b},  [x3], x1
 468         sqxtun          v7.8b,  v23.8h
 469
 470         st1             {v6.8b},  [x3], x1
 471         st1             {v7.8b},  [x3], x1
 472
 473         ret
 474 endfunc
 475 .endm
 476
 477 itxfm_func8x8 idct,  idct
 478 itxfm_func8x8 iadst, idct
 479 itxfm_func8x8 idct,  iadst
 480 itxfm_func8x8 iadst, iadst
 481
 482
 483 function idct16x16_dc_add_neon
             // DC-only 16x16 add. Uses x0 as the destination pointer, x1 as the row
             // stride and x2 as the coefficient pointer; x4 and v0-v5 are clobbered.
             // The first coefficient is scaled by v0.h[0] twice (rounding >> 14 each
             // time), cleared in memory, rounded by a further >> 6 and then added
             // to 16 rows of 16 pixels with unsigned saturation.
 484         movrel          x4, idct_coeffs
 485         ld1             {v0.4h}, [x4]
 486
 487         movi            v1.4h, #0
 488
 489         ld1r            {v2.4h}, [x2]
 490         smull           v2.4s,  v2.4h, v0.h[0]
 491         rshrn           v2.4h,  v2.4s, #14
 492         smull           v2.4s,  v2.4h, v0.h[0]
 493         rshrn           v2.4h,  v2.4s, #14
 494         dup             v2.8h,  v2.h[0]
 495         st1             {v1.h}[0], [x2] // clear the DC coefficient
 496
 497         srshr           v2.8h, v2.8h, #6
 498
 499         mov             x4, #16 // row counter
 500 1:
 501         // Loop to add the constant from v2 into all 16x16 outputs
 502         ld1             {v3.16b},  [x0]
 503         uaddw           v4.8h,  v2.8h,  v3.8b // low 8 pixels + DC
 504         uaddw2          v5.8h,  v2.8h,  v3.16b // high 8 pixels + DC
 505         sqxtun          v4.8b,  v4.8h
 506         sqxtun2         v4.16b, v5.8h
 507         st1             {v4.16b},  [x0], x1 // store the row in place and step to the next one
 508         subs            x4,  x4,  #1
 509         b.ne            1b
 510
 511         ret
 512 endfunc
 513
 514 .macro idct16_end
             // Shared tail of idct16, idct16_half and idct16_quarter: combines the
             // intermediate values left in registers by those functions into
             // out[0..15] in v16-v31 and returns to the caller (note the ret below).
             // Reads v0.h[0] through dmbutterfly0.
 515         butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
 516         butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
 517         butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
 518         butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
 519         butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
 520         butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
 521         butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
 522         butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
 523
 524         dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
 525         dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
 526
 527         butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
 528         butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
 529         butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
 530         butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
 531         butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
 532         butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
 533         butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
 534         butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
 535         ret
 536 .endm
 537
 538 function idct16
             // One 1-D idct16 pass over v16-v31 (.8h), in place.
             // The coefficients must already be in v0/v1 (v0.h[0..7] and v1.h[0..6]
             // are read; nothing is loaded here). Clobbers v2-v7.
             // There is no ret in this body: the idct16_end macro supplies it.
 539         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
 540         dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
 541         dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
 542         dmbutterfly     v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
 543         dmbutterfly     v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
 544         dmbutterfly     v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
 545         dmbutterfly     v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
 546         dmbutterfly     v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 547
 548         butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
 549         butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
 550         butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
 551         butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
 552         butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
 553         butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
 554         butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
 555         butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
 556
 557         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
 558         dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
 559         dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
 560         idct16_end
 561 endfunc
 562
 563 function idct16_half
             // Variant of idct16 whose first stage uses the _h/_h1/_h2 macros, which
             // treat one operand of each pair as zero. As a result only v16-v23 are
             // read as inputs; the incoming contents of v24-v31 are ignored.
             // Outputs land in v16-v31 via idct16_end, which also supplies the ret.
             // The coefficients must already be in v0/v1. Clobbers v2-v7.
 564         dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
 565         dmbutterfly_h1  v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
 566         dmbutterfly_h1  v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
 567         dmbutterfly_h2  v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
 568         dmbutterfly_h1  v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
 569         dmbutterfly_h2  v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
 570         dmbutterfly_h1  v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
 571         dmbutterfly_h2  v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 572
 573         butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
 574         butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
 575         butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
 576         butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
 577         butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
 578         butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
 579         butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
 580         butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
 581
 582         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
 583         dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
 584         dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
 585         idct16_end
 586 endfunc
 587
 588 function idct16_quarter
             // Further reduced variant: only v16-v19 are read as inputs; every other
             // register consumed by idct16_end is produced here first.
             // Outputs land in v16-v31 via idct16_end, which also supplies the ret.
             // The coefficients must already be in v0/v1.
             // First group: single-input products, kept 32 bit wide until narrowed below.
 589         dsmull_h        v24, v25, v19, v1.h[6]
 590         dsmull_h        v4,  v5,  v17, v0.h[7]
 591         dsmull_h        v7,  v6,  v18, v0.h[4]
 592         dsmull_h        v30, v31, v18, v0.h[3]
 593         neg             v24.4s,  v24.4s // v24,v25 = -(v19 * v1.h[6])
 594         neg             v25.4s,  v25.4s
 595         dsmull_h        v29, v28, v17, v1.h[0]
 596         dsmull_h        v26, v27, v19, v1.h[5]
 597         dsmull_h        v22, v23, v16, v0.h[0]
 598         drshrn_h        v24, v24, v25, #14
 599         drshrn_h        v16, v4,  v5,  #14
 600         drshrn_h        v7,  v7,  v6,  #14
 601         drshrn_h        v6,  v30, v31, #14
 602         drshrn_h        v29, v29, v28, #14
 603         drshrn_h        v17, v26, v27, #14
 604         drshrn_h        v28, v22, v23, #14
 605
 606         dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
 607         dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
 608         neg             v22.4s,  v22.4s
 609         neg             v23.4s,  v23.4s
 610         drshrn_h        v27, v20, v21, #14
 611         drshrn_h        v21, v22, v23, #14
 612         drshrn_h        v23, v18, v19, #14
 613         drshrn_h        v25, v30, v31, #14
 614         mov             v4.16b,  v28.16b // v4, v5 and v20 all take the same value, (v16 * v0.h[0]) rounded
 615         mov             v5.16b,  v28.16b
 616         dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
 617         mov             v20.16b, v28.16b
 618         idct16_end
 619 endfunc
 620
 621 function iadst16
 622         ld1             {v0.8h,v1.8h}, [x11]
 623
 624         dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
 625         dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.h[1], v1.h[0]   // v10,v11 = t9,   v8,v9   = t8
 626         dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
 627         dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
 628         dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
 629
 630         dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.h[3], v1.h[2]   // v6,v7   = t11,  v4,v5   = t10
 631         dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
 632         dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v0.h[5], v0.h[4]   // v10,v11 = t5,   v8,v9   = t4
 633         dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
 634
 635         dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
 636         dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
 637         dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v0.h[7], v0.h[6]   // v6,v7   = t7,   v4,v5   = t6
 638         dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
 639
 640         dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
 641         ld1             {v0.8h}, [x10]
 642         dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
 643         dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4]   // v14,v15 = t9,   v12,v13 = t8
 644         dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
 645
 646         dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[4], v0.h[3]   // v4,v5   = t12,  v6,v7   = t13
 647         dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
 648         dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[5], v0.h[6]   // v10,v11 = t11,  v8,v9   = t10
 649         butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
 650         dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
 651
 652         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5]   // v12,v13 = t14,  v14,v15 = t15
 653         butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
 654         dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
 655         dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
 656
 657         butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
 658         butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
 659
 660         dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[1], v0.h[2]   // v10,v11 = t13,  v8,v9   = t12
 661         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1]   // v12,v13 = t14,  v14,v15 = t15
 662
 663         dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
 664         dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
 665         neg             v29.8h, v29.8h                   // v29 = out[13]
 666
 667         dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[1], v0.h[2]   // v10,v11 = t5a,  v8,v9   = t4a
 668         dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[2], v0.h[1]   // v12,v13 = t6a,  v14,v15 = t7a
 669
 670         butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
 671         butterfly_8h    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
 672
 673         dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
 674         neg             v19.8h, v19.8h                   // v19 = out[3]
 675         dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
 676
 677         butterfly_8h    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
 678         butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
 679
 680         dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
 681         dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
 682         dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
 683         dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
 684
 685         neg             v31.8h,  v5.8h                    // v31 = out[15]
 686         neg             v17.8h,  v3.8h                    // v17 = out[1]
 687
 688         mov             v16.16b, v2.16b
 689         mov             v30.16b, v4.16b
 690         ret
 691 endfunc
 692
 693 // Helper macros; we can't use these expressions directly within
 694 // e.g. .irp due to the extra concatenation \(). Therefore wrap
 695 // them in macros to allow using .irp below.
 696 .macro load i, src, inc
             // Load eight 16-bit elements from [src] into vector register
             // number i, then post-increment src by inc.
 697         ld1             {v\i\().8h},  [\src], \inc
 698 .endm
 699 .macro store i, dst, inc
             // Store the eight 16-bit elements of vector register number i
             // to [dst], then post-increment dst by inc.
 700         st1             {v\i\().8h},  [\dst], \inc
 701 .endm
 702 .macro movi_v i, size, imm
             // Fill vector register number i with the immediate imm; size is
             // the arrangement suffix appended to the register (e.g. .16b).
 703         movi            v\i\()\size,  \imm
 704 .endm
 705 .macro load_clear i, src, inc
             // Load eight 16-bit elements from [src] into vector register
             // number i, overwrite the same memory with v2, then post-increment
             // src by inc. The users visible in this file set v2 to zero first,
             // so the source coefficients are cleared while they are read.
 706         ld1             {v\i\().8h}, [\src]
 707         st1             {v2.8h},  [\src], \inc
 708 .endm
 709
 710 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
             // Add eight rows of residuals to eight rows of 8 destination pixels.
             //   coef0..coef7: .8h registers holding the residual rows. They are
             //                 modified in place: rounding shift right by 6, then
             //                 the widened pixels are added.
             //   tmp1, tmp2:   .8b scratch registers for the pixels of the last
             //                 two rows. The invocations in this file pass the
             //                 registers of coef0/coef1 here; that works because
             //                 coef0/coef1 have already been narrowed into v2/v3
             //                 before tmp1/tmp2 are loaded.
             //   x0, x3:       destination row pointers; rows alternate between
             //                 them (coef0 -> x0, coef1 -> x3, coef2 -> x0, ...).
             //   x1:           post-increment applied to x0/x3 after every row.
             // The sums are narrowed to bytes with unsigned saturation (sqxtun)
             // and stored over the pixels that were loaded. v2-v7 are clobbered.
             // On exit x0 and x3 have each advanced by 4 * x1.
             // Loads, arithmetic and stores are interleaved; the registers are
             // reused, so the relative order of the instructions matters.
 711         srshr           \coef0, \coef0, #6
 712         ld1             {v2.8b},  [x0], x1
 713         srshr           \coef1, \coef1, #6
 714         ld1             {v3.8b},  [x3], x1
 715         srshr           \coef2, \coef2, #6
 716         ld1             {v4.8b},  [x0], x1
 717         srshr           \coef3, \coef3, #6
 718         uaddw           \coef0, \coef0, v2.8b
 719         ld1             {v5.8b},  [x3], x1
 720         uaddw           \coef1, \coef1, v3.8b
 721         srshr           \coef4, \coef4, #6
 722         ld1             {v6.8b},  [x0], x1
 723         srshr           \coef5, \coef5, #6
 724         ld1             {v7.8b},  [x3], x1
 725         sqxtun          v2.8b,  \coef0
 726         srshr           \coef6, \coef6, #6
 727         sqxtun          v3.8b,  \coef1
 728         srshr           \coef7, \coef7, #6
 729         uaddw           \coef2, \coef2, v4.8b
 730         ld1             {\tmp1},  [x0], x1
 731         uaddw           \coef3, \coef3, v5.8b
 732         ld1             {\tmp2},  [x3], x1
 733         sqxtun          v4.8b,  \coef2
             // All four rows per pointer have been loaded; rewind x0/x3 to
             // the first row so the results are stored over the same pixels.
 734         sub             x0,  x0,  x1, lsl #2
 735         sub             x3,  x3,  x1, lsl #2
 736         sqxtun          v5.8b,  \coef3
 737         uaddw           \coef4, \coef4, v6.8b
 738         st1             {v2.8b},  [x0], x1
 739         uaddw           \coef5, \coef5, v7.8b
 740         st1             {v3.8b},  [x3], x1
 741         sqxtun          v6.8b,  \coef4
 742         st1             {v4.8b},  [x0], x1
 743         sqxtun          v7.8b,  \coef5
 744         st1             {v5.8b},  [x3], x1
 745         uaddw           \coef6, \coef6, \tmp1
 746         st1             {v6.8b},  [x0], x1
 747         uaddw           \coef7, \coef7, \tmp2
 748         st1             {v7.8b},  [x3], x1
 749         sqxtun          \tmp1,  \coef6
 750         sqxtun          \tmp2,  \coef7
 751         st1             {\tmp1},  [x0], x1
 752         st1             {\tmp2},  [x3], x1
 753 .endm
 754
 755 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 756 // transpose into a horizontal 16x8 slice and store.
 757 // x0 = dst (temp buffer)
 758 // x1 = slice offset
 759 // x2 = src
 760 // x9 = input stride
     // The source coefficients are overwritten with zeros while they are read
     // (load_clear with v2 = 0). x14 holds the return address across the call
     // to the 1-D transform, which overwrites x30.
 761 .macro itxfm16_1d_funcs txfm
 762 function \txfm\()16_1d_8x16_pass1_neon
 763         mov             x14, x30
 764
 765         movi            v2.8h, #0
 766 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 767         load_clear      \i,  x2,  x9
 768 .endr
 769
 770         bl              \txfm\()16
 771
 772         // Do two 8x8 transposes. Originally, v16-v31 contain the
 773         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
 774         // transposed 8x8 blocks.
 775         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 776         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 777
 778         // Store the transposed 8x8 blocks horizontally.
 779         cmp             x1,  #8
 780         b.eq            1f
 781 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
 782         store           \i,  x0,  #16
 783 .endr
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the caller's bl for return
             // prediction and needs no BTI landing pad at the return site.
 784         ret             x14
 785 1:
 786         // Special case: For the last input column (x1 == 8),
 787         // which would be stored as the last row in the temp buffer,
 788         // don't store the first 8x8 block, but keep it in registers
 789         // for the first slice of the second pass (where it is the
 790         // last 8x8 block).
 791 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
             // Skip the 16 bytes where the first block's row would have gone.
 792         add             x0,  x0,  #16
 793         store           \i,  x0,  #16
 794 .endr
 795         mov             v24.16b, v16.16b
 796         mov             v25.16b, v17.16b
 797         mov             v26.16b, v18.16b
 798         mov             v27.16b, v19.16b
 799         mov             v28.16b, v20.16b
 800         mov             v29.16b, v21.16b
 801         mov             v30.16b, v22.16b
 802         mov             v31.16b, v23.16b
 803         ret             x14
 804 endfunc
 805
 806 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 807 // load the destination pixels (from a similar 8x16 slice), add and store back.
 808 // x0 = dst
 809 // x1 = dst stride
 810 // x2 = src (temp buffer)
 811 // x3 = slice offset
 812 // x9 = temp buffer stride
     // When x3 is zero only v16-v23 are loaded; v24-v31 are then expected to
     // be in registers already (left there by the last slice of pass 1).
     // x1 and x3 are overwritten; x14 holds the return address.
 813 function \txfm\()16_1d_8x16_pass2_neon
 814         mov             x14, x30
 815 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 816         load            \i,  x2,  x9
 817 .endr
 818         cbz             x3,  1f
 819 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
 820         load            \i,  x2,  x9
 821 .endr
 822 1:
 823
             // x3 = second destination row and x1 = doubled stride, so that
             // load_add_store can alternate rows between x0 and x3.
 824         add             x3,  x0,  x1
 825         lsl             x1,  x1,  #1
 826         bl              \txfm\()16
 827
 828         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
 829         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
 830
             // ret rather than br, see pass 1 above.
 831         ret             x14
 832 endfunc
 833 .endm
 834
     // Instantiate the pass1/pass2 helpers for both 1-D transforms.
 835 itxfm16_1d_funcs idct
 836 itxfm16_1d_funcs iadst
 837
 838 .macro itxfm_func16x16 txfm1, txfm2
     // Emit ff_vp9_<txfm1>_<txfm2>_16x16_add_neon: a two-pass 16x16 inverse
     // transform whose result is added to the destination pixels.
     //   x0 = dst, x1 = dst stride
     //   x2 = coefficients (cleared while they are read in pass 1)
     //   w3 = compared against 1, 10 and 38 to select cheaper idct_idct paths
     //        (presumably the end-of-block index - confirm against the caller)
     // Pass 1 runs txfm1 on two 8-column slices and writes into a 512 byte
     // buffer on the stack (row stride 32 bytes, kept in x9); pass 2 runs
     // txfm2 on that buffer and adds the result to dst.
     // Kept live across the helper calls: x4 = dst, x5 = dst stride,
     // x6 = coefficients, x10/x11 = coefficient tables, x15 = return address.
 839 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 840 .ifc \txfm1\()_\txfm2,idct_idct
 841         cmp             w3,  #1
 842         b.eq            idct16x16_dc_add_neon
 843 .endif
 844         mov             x15, x30
 845         // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
 846 .ifnc \txfm1\()_\txfm2,idct_idct
 847         stp             d14, d15, [sp, #-0x10]!
 848         stp             d12, d13, [sp, #-0x10]!
 849         stp             d10, d11, [sp, #-0x10]!
 850         stp             d8,  d9,  [sp, #-0x10]!
 851 .endif
 852
             // Temp buffer between the two passes: 16 rows of 32 bytes.
 853         sub             sp,  sp,  #512
 854
 855         mov             x4,  x0
 856         mov             x5,  x1
 857         mov             x6,  x2
 858
 859         movrel          x10, idct_coeffs
 860 .ifnc \txfm1\()_\txfm2,idct_idct
 861         movrel          x11, iadst16_coeffs
 862 .endif
 863 .ifc \txfm1,idct
 864         ld1             {v0.8h,v1.8h}, [x10]
 865 .endif
 866         mov             x9, #32
 867
 868 .ifc \txfm1\()_\txfm2,idct_idct
             // These are branches, not calls: the partial variants free the
             // stack buffer themselves and return through x15.
 869         cmp             w3,  #10
 870         b.le            idct16x16_quarter_add_neon
 871         cmp             w3,  #38
 872         b.le            idct16x16_half_add_neon
 873 .endif
 874
 875 .irp i, 0, 8
 876         add             x0,  sp,  #(\i*32)
 877 .ifc \txfm1\()_\txfm2,idct_idct
 878 .if \i == 8
             // NOTE(review): w3 <= 38 has already branched to
             // idct16x16_half_add_neon above, so this check looks redundant
             // unless w3 changes in between - confirm before relying on it.
 879         cmp             w3,  #38
 880         b.le            1f
 881 .endif
 882 .endif
 883         mov             x1,  #\i
 884         add             x2,  x6,  #(\i*2)
 885         bl              \txfm1\()16_1d_8x16_pass1_neon
 886 .endr
 887 .ifc \txfm1\()_\txfm2,iadst_idct
             // Reload idct_coeffs into v0/v1 for the idct second pass;
             // presumably the iadst first pass left other values there.
 888         ld1             {v0.8h,v1.8h}, [x10]
 889 .endif
 890
 891 .ifc \txfm1\()_\txfm2,idct_idct
 892         b               3f
 893 1:
 894         // Set v24-v31 to zero, for the in-register passthrough of
 895         // coefficients to pass 2. Since we only do two slices, this can
 896         // only ever happen for the second slice. So we only need to store
 897         // zeros to the temp buffer for the second half of the buffer.
 898         // Move x0 to the second half, and use x9 == 32 as increment.
 899         add             x0,  x0,  #16
 900 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
 901         movi_v          \i,  .16b, #0
             // v24 is already zero from the first iteration on, so it is
             // stored every time.
 902         st1             {v24.8h},  [x0], x9
 903 .endr
 904 3:
 905 .endif
 906
 907 .irp i, 0, 8
 908         add             x0,  x4,  #(\i)
 909         mov             x1,  x5
 910         add             x2,  sp,  #(\i*2)
 911         mov             x3,  #\i
 912         bl              \txfm2\()16_1d_8x16_pass2_neon
 913 .endr
 914
 915         add             sp,  sp,  #512
 916 .ifnc \txfm1\()_\txfm2,idct_idct
 917         ldp             d8,  d9,  [sp], 0x10
 918         ldp             d10, d11, [sp], 0x10
 919         ldp             d12, d13, [sp], 0x10
 920         ldp             d14, d15, [sp], 0x10
 921 .endif
             // Return with ret instead of br: same target (the x30 saved in
             // x15 on entry), but encoded as a function return, so it pairs
             // with the caller's call for return prediction and needs no BTI
             // landing pad at the return site.
 922         ret             x15
 923 endfunc
 924 .endm
 925
     // Instantiate all four row/column transform combinations.
 926 itxfm_func16x16 idct,  idct
 927 itxfm_func16x16 iadst, idct
 928 itxfm_func16x16 idct,  iadst
 929 itxfm_func16x16 iadst, iadst
 930
 931 function idct16_1d_8x16_pass1_quarter_neon
             // First pass of the reduced 16x16 idct that reads only the first
             // four input rows.
             //   x0 = temp buffer, x2 = src (cleared while read), x9 = stride
             // v16-v23 are left in registers for the second pass; x14 holds
             // the return address across the call to idct16_quarter.
 932         mov             x14, x30
 933         movi            v2.8h, #0
 934 .irp i, 16, 17, 18, 19
 935         load_clear      \i,  x2,  x9
 936 .endr
 937
 938         bl              idct16_quarter
 939
 940         // Do two 8x8 transposes. Originally, v16-v31 contain the
 941         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
 942         // transposed 8x8 blocks.
 943         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 944         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 945
 946         // Store the transposed 8x8 blocks horizontally.
 947         // The first 8x8 block is kept in registers for the second pass,
 948         // store the rest in the temp buffer.
 949         // Since only a 4x4 part of the input was nonzero, this means that
 950         // only 4 rows are nonzero after transposing, and the second pass
 951         // only reads the topmost 4 rows. Therefore only store the topmost
 952         // 4 rows.
 953         add             x0,  x0,  #16
 954 .irp i, 24, 25, 26, 27
 955         store           \i,  x0,  x9
 956 .endr
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the caller's bl for return
             // prediction and needs no BTI landing pad at the return site.
 957         ret             x14
 958 endfunc
 959
 960 function idct16_1d_8x16_pass2_quarter_neon
             // Second pass of the reduced 16x16 idct with four input rows.
             //   x0 = dst, x1 = dst stride, x2 = temp buffer,
             //   x3 = slice offset, x9 = temp buffer stride
             // For x3 == 0 the input is taken from v16-v19 as left in
             // registers by pass 1; otherwise it is loaded from x2.
             // x1 and x3 are overwritten; x14 holds the return address.
 961         mov             x14, x30
 962         cbz             x3,  1f
 963 .irp i, 16, 17, 18, 19
 964         load            \i,  x2,  x9
 965 .endr
 966 1:
 967
             // x3 = second destination row and x1 = doubled stride, so that
             // load_add_store can alternate rows between x0 and x3.
 968         add             x3,  x0,  x1
 969         lsl             x1,  x1,  #1
 970         bl              idct16_quarter
 971
 972         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
 973         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
 974
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the caller's bl for return
             // prediction and needs no BTI landing pad at the return site.
 975         ret             x14
 976 endfunc
 977
 978 function idct16_1d_8x16_pass1_half_neon
             // First pass of the reduced 16x16 idct that reads only the first
             // eight input rows.
             //   x0 = temp buffer, x2 = src (cleared while read), x9 = stride
             // v16-v23 are left in registers for the second pass; x14 holds
             // the return address across the call to idct16_half.
 979         mov             x14, x30
 980         movi            v2.8h, #0
 981 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 982         load_clear      \i,  x2,  x9
 983 .endr
 984
 985         bl              idct16_half
 986
 987         // Do two 8x8 transposes. Originally, v16-v31 contain the
 988         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
 989         // transposed 8x8 blocks.
 990         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 991         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 992
 993         // Store the transposed 8x8 blocks horizontally.
 994         // The first 8x8 block is kept in registers for the second pass,
 995         // store the rest in the temp buffer.
 996         add             x0,  x0,  #16
 997 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
 998         store           \i,  x0,  x9
 999 .endr
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the caller's bl for return
             // prediction and needs no BTI landing pad at the return site.
1000         ret             x14
1001 endfunc
1002
1003 function idct16_1d_8x16_pass2_half_neon
             // Second pass of the reduced 16x16 idct with eight input rows.
             //   x0 = dst, x1 = dst stride, x2 = temp buffer,
             //   x3 = slice offset, x9 = temp buffer stride
             // For x3 == 0 the input is taken from v16-v23 as left in
             // registers by pass 1; otherwise it is loaded from x2.
             // x1 and x3 are overwritten; x14 holds the return address.
1004         mov             x14, x30
1005         cbz             x3,  1f
1006 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1007         load            \i,  x2,  x9
1008 .endr
1009 1:
1010
             // x3 = second destination row and x1 = doubled stride, so that
             // load_add_store can alternate rows between x0 and x3.
1011         add             x3,  x0,  x1
1012         lsl             x1,  x1,  #1
1013         bl              idct16_half
1014
1015         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
1016         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
1017
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the caller's bl for return
             // prediction and needs no BTI landing pad at the return site.
1018         ret             x14
1019 endfunc
1020
1021 .macro idct16_partial size
     // Emit idct16x16_<size>_add_neon, the reduced idct_idct 16x16 path.
     // It is branched to (not called) from ff_vp9_idct_idct_16x16_add_neon
     // after that function's setup, and relies on the state left there:
     //   sp  = 512 byte temp buffer, x9 = 32 (temp buffer stride)
     //   x4  = dst, x5 = dst stride, x6 = coefficients
     //   x15 = return address of the exported function
     // Only a single first-pass slice is run, followed by both second-pass
     // slices. The stack buffer is released here before returning.
1022 function idct16x16_\size\()_add_neon
1023         add             x0,  sp,  #(0*32)
1024         add             x2,  x6,  #(0*2)
1025         bl              idct16_1d_8x16_pass1_\size\()_neon
1026 .irp i, 0, 8
1027         add             x0,  x4,  #(\i)
1028         mov             x1,  x5
1029         add             x2,  sp,  #(\i*2)
1030         mov             x3,  #\i
1031         bl              idct16_1d_8x16_pass2_\size\()_neon
1032 .endr
1033
1034         add             sp,  sp,  #512
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the original caller's call
             // for return prediction and needs no BTI landing pad there.
1035         ret             x15
1036 endfunc
1037 .endm
1038
     // Instantiate the quarter and half variants.
1039 idct16_partial quarter
1040 idct16_partial half
1041
1042 function idct32x32_dc_add_neon
             // DC-only 32x32 idct: derive one value from the first coefficient
             // and add it to every pixel of a 32x32 block.
             //   x0 = dst, x1 = dst stride, x2 = coefficients
             // Clobbers x4 and v0-v6; the first coefficient is set to zero.
1043         movrel          x4, idct_coeffs
1044         ld1             {v0.4h}, [x4]
1045
1046         movi            v1.4h, #0
1047
             // Scale the DC coefficient twice by idct_coeffs[0] (11585), each
             // time with a rounding shift right by 14.
1048         ld1r            {v2.4h}, [x2]
1049         smull           v2.4s,  v2.4h,  v0.h[0]
1050         rshrn           v2.4h,  v2.4s,  #14
1051         smull           v2.4s,  v2.4h,  v0.h[0]
1052         rshrn           v2.4h,  v2.4s,  #14
1053         dup             v2.8h,  v2.h[0]
             // Clear the DC coefficient in the source block.
1054         st1             {v1.h}[0], [x2]
1055
1056         srshr           v0.8h, v2.8h, #6
1057
             // x4 = number of rows left.
1058         mov             x4, #32
1059 1:
1060         // Loop to add the constant v0 into all 32x32 outputs
1061         ld1             {v1.16b,v2.16b},  [x0]
1062         uaddw           v3.8h,  v0.8h,  v1.8b
1063         uaddw2          v4.8h,  v0.8h,  v1.16b
1064         uaddw           v5.8h,  v0.8h,  v2.8b
1065         uaddw2          v6.8h,  v0.8h,  v2.16b
             // Narrow back to bytes with unsigned saturation.
1066         sqxtun          v3.8b,  v3.8h
1067         sqxtun2         v3.16b, v4.8h
1068         sqxtun          v4.8b,  v5.8h
1069         sqxtun2         v4.16b, v6.8h
1070         st1             {v3.16b,v4.16b},  [x0], x1
1071         subs            x4,  x4,  #1
1072         b.ne            1b
1073
1074         ret
1075 endfunc
1076
1077 .macro idct32_end
             // Common tail of the idct32_odd* functions in this file: the
             // remaining butterfly stages that take the t16..t31 intermediates
             // to the values named in the trailing comments (v16-v31).
             // Uses v0.h[1] and v0.h[2] as coefficients; the users load v0
             // from x10 before expanding this macro. v2-v7 are handed to
             // dmbutterfly0 as temporaries in the last stage.
             // The macro ends with ret, so it also terminates the function it
             // is expanded in.
1078         butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1079         butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1080         butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1081         butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1082         butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1083         butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1084         butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
1085         butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1086
1087         dmbutterfly     v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1088         dmbutterfly     v3,  v5,  v0.h[1], v0.h[2], v24, v25, v30, v31        // v3  = t19,  v5  = t28
1089         dmbutterfly     v28, v6,  v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1090         dmbutterfly     v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1091
1092         butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1093         butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1094         butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1095         butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1096         butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1097         butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1098         butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1099         butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
1100
1101         dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
1102         dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
1103         dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
1104         dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
1105         ret
1106 .endm
1107
1108 function idct32_odd
             // Odd half of the 32-point idct, on 8 columns at a time.
             //   In:  v16-v31 = the odd input rows, as loaded by the
             //        idct32_1d_8x32 pass functions in this file
             //        x11 = coefficients for the first stage (into v0/v1)
             //        x10 = coefficients for the later stages (into v0)
             //   Out: v16-v31 = t16..t31 as named in idct32_end
             // v4-v7 serve as temporaries. The function returns through the
             // ret at the end of idct32_end.
1109         ld1             {v0.8h,v1.8h}, [x11]
1110
1111         dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1112         dmbutterfly     v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1113         dmbutterfly     v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1114         dmbutterfly     v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1115         dmbutterfly     v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1116         dmbutterfly     v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1117         dmbutterfly     v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1118         dmbutterfly     v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1119
             // Switch v0 to the table at x10 for the remaining stages.
1120         ld1             {v0.8h}, [x10]
1121
1122         butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1123         butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1124         butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1125         butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1126         butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
1127         butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
1128         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
1129         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
1130
1131         dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1132         dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1133         dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1134         dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1135         idct32_end
1136 endfunc
1137
1138 function idct32_odd_half
             // Variant of idct32_odd for the _half pass functions, which only
             // load v16-v23. Same register interface and stages as idct32_odd,
             // except that the first stage uses dmbutterfly_h1/_h2: in every
             // pair below exactly one register is among v16-v23 (the first
             // operand for _h1, the second for _h2). Presumably these macros
             // treat the other input as zero - they are defined outside this
             // section, confirm there.
             // Returns through the ret at the end of idct32_end.
1139         ld1             {v0.8h,v1.8h}, [x11]
1140
1141         dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1142         dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1143         dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1144         dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1145         dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1146         dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1147         dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1148         dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1149
             // Switch v0 to the table at x10 for the remaining stages.
1150         ld1             {v0.8h}, [x10]
1151
1152         butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1153         butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1154         butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1155         butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1156         butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
1157         butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
1158         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
1159         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
1160
1161         dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1162         dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1163         dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1164         dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1165         idct32_end
1166 endfunc
1167
1168 function idct32_odd_quarter
             // Variant of idct32_odd for the _quarter pass functions, which
             // only load v16-v19. Only those four registers are read as input.
             //   x11 = coefficients for the first stage (into v0/v1)
             //   x10 = coefficients for the later stages (into v0)
             // Each input is combined with two coefficients through dsmull_h,
             // and the products are narrowed again with drshrn_h #14; some of
             // the wide products are negated in between. Both macros are
             // defined outside this section - presumably a widening multiply
             // and a rounding narrowing shift on register pairs; confirm there.
             // Returns through the ret at the end of idct32_end.
1169         ld1             {v0.8h,v1.8h}, [x11]
1170
1171         dsmull_h        v4,  v5,  v16, v0.h[0]
1172         dsmull_h        v28, v29, v19, v0.h[7]
1173         dsmull_h        v30, v31, v16, v0.h[1]
1174         dsmull_h        v22, v23, v17, v1.h[6]
1175         dsmull_h        v7,  v6,  v17, v1.h[7]
1176         dsmull_h        v26, v27, v19, v0.h[6]
1177         dsmull_h        v20, v21, v18, v1.h[0]
1178         dsmull_h        v24, v25, v18, v1.h[1]
1179
             // Switch v0 to the table at x10 for the remaining stages.
1180         ld1             {v0.8h}, [x10]
1181
1182         neg             v28.4s, v28.4s
1183         neg             v29.4s, v29.4s
1184         neg             v7.4s,  v7.4s
1185         neg             v6.4s,  v6.4s
1186
1187         drshrn_h        v4,  v4,  v5,  #14
1188         drshrn_h        v5,  v28, v29, #14
1189         drshrn_h        v29, v30, v31, #14
1190         drshrn_h        v28, v22, v23, #14
1191         drshrn_h        v7,  v7,  v6,  #14
1192         drshrn_h        v31, v26, v27, #14
1193         drshrn_h        v6,  v20, v21, #14
1194         drshrn_h        v30, v24, v25, #14
1195
             // From here on the results land in the registers that
             // idct32_end reads (v20-v31 plus v4-v7).
1196         dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[3], v0.h[4]
1197         dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[3], v0.h[4]
1198         drshrn_h        v23, v16, v17, #14
1199         drshrn_h        v24, v18, v19, #14
1200         neg             v20.4s, v20.4s
1201         neg             v21.4s, v21.4s
1202         drshrn_h        v27, v27, v26, #14
1203         drshrn_h        v20, v20, v21, #14
1204         dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[5], v0.h[6]
1205         drshrn_h        v21, v16, v17, #14
1206         drshrn_h        v26, v18, v19, #14
1207         dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[5], v0.h[6]
1208         drshrn_h        v25, v16, v17, #14
1209         neg             v18.4s, v18.4s
1210         neg             v19.4s, v19.4s
1211         drshrn_h        v22, v18, v19, #14
1212
1213         idct32_end
1214 endfunc
1215
1216 .macro idct32_funcs suffix
1217 // Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
1218 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1219 // a normal IDCT16 with every other input component (the even ones, with
1220 // each output written twice), followed by a separate 16-point IDCT
1221 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
1222 // x0 = dst (temp buffer)
1223 // x1 = unused
1224 // x2 = src
1225 // x9 = double input stride
1226 // x10 = idct_coeffs
1227 // x11 = idct_coeffs + 32
     // The source coefficients are overwritten with zeros while they are read
     // (load_clear with v2 = 0). The _quarter and _half variants only load the
     // first 4 or 8 rows of each half. x14 holds the return address across
     // the two calls; v0-v7 are overwritten.
1228 function idct32_1d_8x32_pass1\suffix\()_neon
1229         mov             x14, x30
1230         ld1             {v0.8h,v1.8h}, [x10]
1231
1232         movi            v2.8h, #0
1233
1234         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1235 .ifb \suffix
1236 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1237         load_clear      \i, x2, x9
1238 .endr
1239 .endif
1240 .ifc \suffix,_quarter
1241 .irp i, 16, 17, 18, 19
1242         load_clear      \i, x2, x9
1243 .endr
1244 .endif
1245 .ifc \suffix,_half
1246 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1247         load_clear      \i, x2, x9
1248 .endr
1249 .endif
1250
1251         bl              idct16\suffix
1252
1253         // Do two 8x8 transposes. Originally, v16-v31 contain the
1254         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
1255         // two transposed 8x8 blocks.
1256         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
1257         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
1258
1259         // Store the registers a, b horizontally, followed by the
1260         // same registers b, a mirrored.
1261 .macro store_rev a, b
1262         // There's no rev128 instruction, but we reverse each 64 bit
1263         // half, and then flip them using an ext with 8 bytes offset.
1264         rev64           v1.8h, \b
1265         st1             {\a},  [x0], #16
1266         rev64           v0.8h, \a
1267         ext             v1.16b, v1.16b, v1.16b, #8
1268         st1             {\b},  [x0], #16
1269         ext             v0.16b, v0.16b, v0.16b, #8
1270         st1             {v1.8h},  [x0], #16
1271         st1             {v0.8h},  [x0], #16
1272 .endm
1273         store_rev       v16.8h, v24.8h
1274         store_rev       v17.8h, v25.8h
1275         store_rev       v18.8h, v26.8h
1276         store_rev       v19.8h, v27.8h
1277         store_rev       v20.8h, v28.8h
1278         store_rev       v21.8h, v29.8h
1279         store_rev       v22.8h, v30.8h
1280         store_rev       v23.8h, v31.8h
             // Eight store_rev expansions wrote 64 bytes each; rewind x0 to
             // the start so that the odd half can be accumulated on top.
1281         sub             x0,  x0,  #512
1282 .purgem store_rev
1283
1284         // Move x2 back to the start of the input, and move
1285         // to the first odd row
1286 .ifb \suffix
1287         sub             x2,  x2,  x9, lsl #4
1288 .endif
1289 .ifc \suffix,_quarter
1290         sub             x2,  x2,  x9, lsl #2
1291 .endif
1292 .ifc \suffix,_half
1293         sub             x2,  x2,  x9, lsl #3
1294 .endif
1295         add             x2,  x2,  #64
1296
             // v2 was used by the transposes above; zero it again for
             // load_clear.
1297         movi            v2.8h, #0
1298         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1299 .ifb \suffix
1300 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1301         load_clear      \i, x2, x9
1302 .endr
1303 .endif
1304 .ifc \suffix,_quarter
1305 .irp i, 16, 17, 18, 19
1306         load_clear      \i, x2, x9
1307 .endr
1308 .endif
1309 .ifc \suffix,_half
1310 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1311         load_clear      \i, x2, x9
1312 .endr
1313 .endif
1314
1315         bl              idct32_odd\suffix
1316
1317         transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
1318         transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
1319
1320         // Store the registers a, b horizontally,
1321         // adding into the output first, and the mirrored,
1322         // subtracted from the output.
1323 .macro store_rev a, b
1324         ld1             {v4.8h},  [x0]
1325         rev64           v1.8h, \b
1326         add             v4.8h, v4.8h, \a
1327         rev64           v0.8h, \a
1328         st1             {v4.8h},  [x0], #16
1329         ext             v1.16b, v1.16b, v1.16b, #8
1330         ld1             {v5.8h},  [x0]
1331         ext             v0.16b, v0.16b, v0.16b, #8
1332         add             v5.8h, v5.8h, \b
1333         st1             {v5.8h},  [x0], #16
1334         ld1             {v6.8h},  [x0]
1335         sub             v6.8h, v6.8h, v1.8h
1336         st1             {v6.8h},  [x0], #16
1337         ld1             {v7.8h},  [x0]
1338         sub             v7.8h, v7.8h, v0.8h
1339         st1             {v7.8h},  [x0], #16
1340 .endm
1341
1342         store_rev       v31.8h, v23.8h
1343         store_rev       v30.8h, v22.8h
1344         store_rev       v29.8h, v21.8h
1345         store_rev       v28.8h, v20.8h
1346         store_rev       v27.8h, v19.8h
1347         store_rev       v26.8h, v18.8h
1348         store_rev       v25.8h, v17.8h
1349         store_rev       v24.8h, v16.8h
1350 .purgem store_rev
             // Return with ret instead of br: same target, but encoded as a
             // function return, so it pairs with the caller's bl for return
             // prediction and needs no BTI landing pad at the return site.
1351         ret             x14
1352 endfunc
1353
1354 // This is mostly the same as 8x32_pass1, but without the transpose;
1355 // it uses the source as temp buffer between the two idct passes, and
1356 // adds the result into the destination.
1357 // x0 = dst
1358 // x1 = dst stride
1359 // x2 = src (temp buffer)
1360 // x7 = negative double temp buffer stride
1361 // x9 = double temp buffer stride
1362 // x10 = idct_coeffs
1363 // x11 = idct_coeffs + 32
1364 function idct32_1d_8x32_pass2\suffix\()_neon
1365         mov             x14, x30
1366         ld1             {v0.8h,v1.8h}, [x10]
1367
1368         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1369 .ifb \suffix
1370 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1371         load            \i, x2, x9
1372 .endr
1373         sub             x2,  x2,  x9, lsl #4
1374 .endif
1375 .ifc \suffix,_quarter
1376 .irp i, 16, 17, 18, 19
1377         load            \i, x2, x9
1378 .endr
1379         sub             x2,  x2,  x9, lsl #2
1380 .endif
1381 .ifc \suffix,_half
1382 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1383         load            \i, x2, x9
1384 .endr
1385         sub             x2,  x2,  x9, lsl #3
1386 .endif
1387
1388         bl              idct16\suffix
1389
1390 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1391         store           \i, x2, x9
1392 .endr
1393
1394         sub             x2,  x2,  x9, lsl #4
1395         add             x2,  x2,  #64
1396
1397         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1398 .ifb \suffix
1399 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1400         load            \i, x2, x9
1401 .endr
1402         sub             x2,  x2,  x9, lsl #4
1403 .endif
1404 .ifc \suffix,_quarter
1405 .irp i, 16, 17, 18, 19
1406         load            \i, x2, x9
1407 .endr
1408         sub             x2,  x2,  x9, lsl #2
1409 .endif
1410 .ifc \suffix,_half
1411 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1412         load            \i, x2, x9
1413 .endr
1414         sub             x2,  x2,  x9, lsl #3
1415 .endif
1416         sub             x2,  x2,  #64
1417
1418         bl              idct32_odd\suffix
1419
1420 .macro load_acc_store a, b, c, d, neg=0
1421 .if \neg == 0
1422         ld1             {v4.8h},  [x2], x9
1423         ld1             {v5.8h},  [x2], x9
1424         add             v4.8h, v4.8h, \a
1425         ld1             {v6.8h},  [x2], x9
1426         add             v5.8h, v5.8h, \b
1427         ld1             {v7.8h},  [x2], x9
1428         add             v6.8h, v6.8h, \c
1429         add             v7.8h, v7.8h, \d
1430 .else
1431         ld1             {v4.8h},  [x2], x7
1432         ld1             {v5.8h},  [x2], x7
1433         sub             v4.8h, v4.8h, \a
1434         ld1             {v6.8h},  [x2], x7
1435         sub             v5.8h, v5.8h, \b
1436         ld1             {v7.8h},  [x2], x7
1437         sub             v6.8h, v6.8h, \c
1438         sub             v7.8h, v7.8h, \d
1439 .endif
1440         ld1             {v0.8b}, [x0], x1
1441         ld1             {v1.8b}, [x0], x1
1442         srshr           v4.8h, v4.8h, #6
1443         ld1             {v2.8b}, [x0], x1
1444         srshr           v5.8h, v5.8h, #6
1445         uaddw           v4.8h, v4.8h, v0.8b
1446         ld1             {v3.8b}, [x0], x1
1447         srshr           v6.8h, v6.8h, #6
1448         uaddw           v5.8h, v5.8h, v1.8b
1449         srshr           v7.8h, v7.8h, #6
1450         sub             x0,  x0,  x1, lsl #2
1451         uaddw           v6.8h, v6.8h, v2.8b
1452         sqxtun          v4.8b, v4.8h
1453         uaddw           v7.8h, v7.8h, v3.8b
1454         sqxtun          v5.8b, v5.8h
1455         st1             {v4.8b}, [x0], x1
1456         sqxtun          v6.8b, v6.8h
1457         st1             {v5.8b}, [x0], x1
1458         sqxtun          v7.8b, v7.8h
1459         st1             {v6.8b}, [x0], x1
1460         st1             {v7.8b}, [x0], x1
1461 .endm
1462         load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
1463         load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
1464         load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
1465         load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
1466         sub             x2,  x2,  x9
1467         load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
1468         load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
1469         load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
1470         load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
1471 .purgem load_acc_store
1472         br              x14
1473 endfunc
1474 .endm
1475
1476 idct32_funcs                        // empty suffix: the full-size helper functions
1477 idct32_funcs _quarter               // same helpers with the _quarter suffix
1478 idct32_funcs _half                  // same helpers with the _half suffix
1479
1480 const min_eob_idct_idct_32, align=4
             // Thresholds compared against w3 in ff_vp9_idct_idct_32x32_add_neon,
             // which reads them one halfword at a time through x12 before the pass 1
             // calls for slices 8, 16 and 24. Judging by the name these are minimum
             // eob values - TODO confirm. That function obtains x12 via movrel with
             // a trailing 2, presumably a byte offset that skips the leading 0.
             // 34 and 135 also appear there as immediates selecting the quarter
             // and half variants.
1481         .short  0, 34, 135, 336
1482 endconst
1483
1484 function ff_vp9_idct_idct_32x32_add_neon, export=1
     // Exported entry point. As used below:
     //   x0 = dst (kept in x4), x1 = dst stride (kept in x5),
     //   x2 = source for pass 1 (kept in x6),
     //   w3 = value compared against the thresholds, presumably the eob
     //        count going by the table name - TODO confirm.
     // Runs pass 1 into a 2048-byte temp buffer on the stack (one call per
     // 8-wide slice) and then pass 2 from that buffer into dst.
             // w3 == 1: hand over to idct32x32_dc_add_neon (defined outside this
             // chunk). This is a branch, not a call, so control does not come back.
1485         cmp             w3,  #1
1486         b.eq            idct32x32_dc_add_neon
1487
             // x10/x11 are set up as listed in the pass2 register contract.
             // movrel is defined outside this chunk; the trailing 2 is presumably
             // a byte offset, making x12 point at the second table entry (34).
1488         movrel          x10, idct_coeffs
1489         add             x11, x10, #32
1490         movrel          x12, min_eob_idct_idct_32, 2
1491
             // The bl calls below overwrite x30; the return goes through x15.
1492         mov             x15, x30
1493
             // Save d8-d15, the callee-saved low halves of v8-v15 under AAPCS64.
1494         stp             d14, d15, [sp, #-0x10]!
1495         stp             d12, d13, [sp, #-0x10]!
1496         stp             d10, d11, [sp, #-0x10]!
1497         stp             d8,  d9,  [sp, #-0x10]!
1498
             // Temp buffer between the passes: 2048 bytes = 32 * 32 halfwords.
1499         sub             sp,  sp,  #2048
1500
             // x0-x2 are reused as arguments for the pass helpers, so keep copies.
1501         mov             x4,  x0
1502         mov             x5,  x1
1503         mov             x6,  x2
1504
1505         // Double stride of the input, since we only read every other line
1506         mov             x9,  #128
1507         neg             x7,  x9
1508
             // Small w3 values go to dedicated variants. They are entered by branch
             // with the frame above already set up, and run their own epilogue
             // (see idct32_partial).
1509         cmp             w3,  #34
1510         b.le            idct32x32_quarter_add_neon
1511         cmp             w3,  #135
1512         b.le            idct32x32_half_add_neon
1513
             // Pass 1, one call per 8-wide slice (i = 0, 8, 16, 24), with
             // x0 = temp + i*64 and x2 = src + i*2. For i > 0 a threshold is first
             // read through x12; if w3 <= threshold the remaining slices are not
             // transformed but zero-filled at 1: instead. x1 is preloaded with the
             // fill loop count (32 - i)/4; mov leaves the flags from cmp intact
             // for the b.le that follows.
1514 .irp i, 0, 8, 16, 24
1515         add             x0,  sp,  #(\i*64)
1516 .if \i > 0
1517         ldrh            w1,  [x12], #2
1518         cmp             w3,  w1
1519         mov             x1,  #(32 - \i)/4
1520         b.le            1f
1521 .endif
1522         add             x2,  x6,  #(\i*2)
1523         bl              idct32_1d_8x32_pass1_neon
1524 .endr
1525         b               3f
1526
1527 1:
1528         // Write zeros to the temp buffer for pass 2
1529         movi            v16.8h,  #0
1530         movi            v17.8h,  #0
1531         movi            v18.8h,  #0
1532         movi            v19.8h,  #0
             // Each iteration stores 4 * 64 = 256 zero bytes, so (32 - i)/4
             // iterations clear (32 - i) * 64 bytes from x0 = sp + i*64 up to the
             // end of the 2048-byte buffer. b.ne tests the flags set by subs; the
             // stores in between do not modify them.
1533 2:
1534         subs            x1,  x1,  #1
1535 .rept 4
1536         st1             {v16.8h-v19.8h},  [x0], #64
1537 .endr
1538         b.ne            2b
             // Pass 2, one call per 8-wide slice: x0 = dst + i, x1 = dst stride,
             // x2 = temp + i*2.
1539 3:
1540 .irp i, 0, 8, 16, 24
1541         add             x0,  x4,  #(\i)
1542         mov             x1,  x5
1543         add             x2,  sp,  #(\i*2)
1544         bl              idct32_1d_8x32_pass2_neon
1545 .endr
1546
             // Release the temp buffer and restore d8-d15 in reverse push order.
1547         add             sp,  sp,  #2048
1548
1549         ldp             d8,  d9,  [sp], 0x10
1550         ldp             d10, d11, [sp], 0x10
1551         ldp             d12, d13, [sp], 0x10
1552         ldp             d14, d15, [sp], 0x10
1553
             // Return through the address saved in x15 at entry.
1554         br              x15
1555 endfunc
1556
1557 .macro idct32_partial size
     // Generates idct32x32_<size>_add_neon for size = quarter or half.
     // The generated function is not a standalone entry point: it is reached
     // by a branch from ff_vp9_idct_idct_32x32_add_neon after that function
     // has saved d8-d15, reserved the 2048-byte temp buffer on the stack and
     // set x4 = dst, x5 = dst stride, x6 = src, x15 = return address. It
     // therefore performs the matching epilogue itself.
1558 function idct32x32_\size\()_add_neon
             // Pass 1 on the first 8-wide slice (temp offset 0, src offset 0).
1559         add             x0,  sp,  #(0*64)
1560         add             x2,  x6,  #(0*2)
1561         bl              idct32_1d_8x32_pass1_\size\()_neon
             // The half variant also transforms the second slice. No zero fill of
             // the rest of the temp buffer is done here; the matching pass 2
             // variants load only 4 (quarter) or 8 (half) vectors per call to the
             // load macro loop instead of 16, presumably making it unnecessary.
1562 .ifc \size,half
1563         add             x0,  sp,  #(8*64)
1564         add             x2,  x6,  #(8*2)
1565         bl              idct32_1d_8x32_pass1_\size\()_neon
1566 .endif
             // Pass 2, one call per 8-wide slice: x0 = dst + i, x1 = dst stride,
             // x2 = temp + i*2.
1567 .irp i, 0, 8, 16, 24
1568         add             x0,  x4,  #(\i)
1569         mov             x1,  x5
1570         add             x2,  sp,  #(\i*2)
1571         bl              idct32_1d_8x32_pass2_\size\()_neon
1572 .endr
1573
             // Same epilogue as ff_vp9_idct_idct_32x32_add_neon: release the temp
             // buffer, restore d8-d15, return through x15.
1574         add             sp,  sp,  #2048
1575
1576         ldp             d8,  d9,  [sp], 0x10
1577         ldp             d10, d11, [sp], 0x10
1578         ldp             d12, d13, [sp], 0x10
1579         ldp             d14, d15, [sp], 0x10
1580
1581         br              x15
1582 endfunc
1583 .endm
1584
     // Emit both partial variants; they are the branch targets used in
     // ff_vp9_idct_idct_32x32_add_neon.
1585 idct32_partial quarter
1586 idct32_partial half