git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9itxfm_neon.S

   1 /*
   2  * Copyright (c) 2016 Google Inc.
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/aarch64/asm.S"
  22 #include "neon.S"
  23
     // Coefficients for the 4-point transforms, loaded into v0 by the 4x4
     // functions below. Products with these values are narrowed with a
     // rounding right shift by 14 in the transform macros.
     // The first row is read by idct4 as v0.h[0], v0.h[2] and v0.h[3].
     // The iadst4_coeffs row must directly follow it in memory: the mixed
     // idct/iadst functions load both rows with one 8h load, and iadst4
     // reads the second row as v0.h[4] .. v0.h[7].
  24 const itxfm4_coeffs, align=4
  25         .short  11585, 0, 6270, 15137
  26 iadst4_coeffs:
  27         .short  5283, 15212, 9929, 13377
  28 endconst
  29
     // Coefficients for the 8-point iadst, followed by the idct coefficients.
     // The 8x8 functions that involve iadst load the first row into v1 with a
     // 16 byte post-increment and then load v0 from the same pointer, so
     // idct_coeffs must start directly after the 8 iadst8 values.
     // The first idct_coeffs row is what idct8/idct16 read as v0.h[0] .. v0.h[7].
     // NOTE(review): idct16 also reads v1.h[0] .. v1.h[7], presumably the second
     // idct_coeffs row; the load is done by a caller not visible here. The last
     // two rows are not referenced in the visible part of this file.
  30 const iadst8_coeffs, align=4
  31         .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
  32 idct_coeffs:
  33         .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
  34         .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
  35         .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
  36         .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
  37 endconst
  38
     // 16 coefficients (two rows of 8 halfwords) for the 16-point iadst.
     // NOTE(review): presumably the table that iadst16 loads into v0/v1 through
     // x11; the code that sets up x11 is not visible here - confirm.
  39 const iadst16_coeffs, align=4
  40         .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
  41         .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
  42 endconst
  43
  44 // out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
  45 // out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
  46 // in/out are .8h registers; this can do with 4 temp registers, but is
  47 // more efficient if 6 temp registers are available.
     // If neg > 0, the out1 product uses -v0[0] (a negated copy of v0.4h is
     // placed in tmp4); the out2 product still uses v0[0].
     // tmp5/tmp6 may be left blank; then tmp3/tmp4 are reused for the second
     // product, after out1 has been narrowed.
     // The sum and difference are formed in tmp1/tmp2 before any output is
     // written, so out1/out2 may be the same registers as in1/in2.
  48 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
  49 .if \neg > 0
  50         neg             \tmp4\().4h, v0.4h
  51 .endif
  52         add             \tmp1\().8h, \in1\().8h,  \in2\().8h
  53         sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
  54 .if \neg > 0
             // tmp4 holds the negated coefficient and is also the destination of
             // smull2, so the smull of the low half has to be issued first.
  55         smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
  56         smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
  57 .else
  58         smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
  59         smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
  60 .endif
  61 .ifb \tmp5
             // 4 temp variant: narrow out1 first to free tmp3/tmp4 for out2.
  62         rshrn           \out1\().4h, \tmp3\().4s, #14
  63         rshrn2          \out1\().8h, \tmp4\().4s, #14
  64         smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
  65         smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
  66         rshrn           \out2\().4h, \tmp3\().4s, #14
  67         rshrn2          \out2\().8h, \tmp4\().4s, #14
  68 .else
             // 6 temp variant: all four multiplies are issued before narrowing.
  69         smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
  70         smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
  71         rshrn           \out1\().4h, \tmp3\().4s, #14
  72         rshrn2          \out1\().8h, \tmp4\().4s, #14
  73         rshrn           \out2\().4h, \tmp5\().4s, #14
  74         rshrn2          \out2\().8h, \tmp6\().4s, #14
  75 .endif
  76 .endm
  77
  78 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
  79 // writing the same output into both out1 and out2.
     // Only in1, tmp1 and tmp2 are actually used; in2 and tmp3 .. tmp6 are
     // accepted so that the argument list matches dmbutterfly0.
     // Both products are computed before out1 is written, so out1 may be the
     // same register as in1.
  80 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
  81         smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
  82         smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
  83         rshrn           \out1\().4h,  \tmp1\().4s, #14
  84         rshrn2          \out1\().8h,  \tmp2\().4s, #14
  85         rshrn           \out2\().4h,  \tmp1\().4s, #14
  86         rshrn2          \out2\().8h,  \tmp2\().4s, #14
  87 .endm
  88
  89 // out1,out2 = in1 * coef1 - in2 * coef2
  90 // out3,out4 = in1 * coef2 + in2 * coef1
  91 // out are 4 x .4s registers, in are 2 x .8h registers
     // out1/out3 receive the low four lanes, out2/out4 the high four lanes.
     // The results are left as unrounded 32 bit products (no shift is applied).
     // coef1/coef2 are by-element operands such as v0.h[2].
     // The inputs are read after outputs have been written, so none of the
     // out registers may be the same as in1 or in2.
  92 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
  93         smull           \out1\().4s, \in1\().4h, \coef1
  94         smull2          \out2\().4s, \in1\().8h, \coef1
  95         smull           \out3\().4s, \in1\().4h, \coef2
  96         smull2          \out4\().4s, \in1\().8h, \coef2
  97         smlsl           \out1\().4s, \in2\().4h, \coef2
  98         smlsl2          \out2\().4s, \in2\().8h, \coef2
  99         smlal           \out3\().4s, \in2\().4h, \coef1
 100         smlal2          \out4\().4s, \in2\().8h, \coef1
 101 .endm
 102
 103 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
 104 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
 105 // inout are 2 x .8h registers
     // Both right hand sides use the values of inout1/inout2 on entry.
     // If neg > 0, the 32 bit sum for inout2 is negated before the rounding
     // narrowing shift; inout1 is unaffected.
     // tmp1 .. tmp4 are clobbered (they hold the 32 bit intermediates).
 106 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
 107         dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
 108 .if \neg > 0
 109         neg             \tmp3\().4s, \tmp3\().4s
 110         neg             \tmp4\().4s, \tmp4\().4s
 111 .endif
 112         rshrn           \inout1\().4h, \tmp1\().4s,  #14
 113         rshrn2          \inout1\().8h, \tmp2\().4s,  #14
 114         rshrn           \inout2\().4h, \tmp3\().4s,  #14
 115         rshrn2          \inout2\().8h, \tmp4\().4s,  #14
 116 .endm
 117
 118 // Same as dmbutterfly above, but treating the input in inout2 as zero
     // inout1 = (inout1 * coef1 + (1 << 13)) >> 14
     // inout2 = (inout1 * coef2 + (1 << 13)) >> 14
     // The previous contents of inout2 are never read. All four products are
     // formed from the entry value of inout1 before it is overwritten.
 119 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
 120         smull           \tmp1\().4s, \inout1\().4h, \coef1
 121         smull2          \tmp2\().4s, \inout1\().8h, \coef1
 122         smull           \tmp3\().4s, \inout1\().4h, \coef2
 123         smull2          \tmp4\().4s, \inout1\().8h, \coef2
 124         rshrn           \inout1\().4h, \tmp1\().4s, #14
 125         rshrn2          \inout1\().8h, \tmp2\().4s, #14
 126         rshrn           \inout2\().4h, \tmp3\().4s, #14
 127         rshrn2          \inout2\().8h, \tmp4\().4s, #14
 128 .endm
 129
 130 // Same as dmbutterfly above, but treating the input in inout1 as zero
     // inout1 = (-(inout2 * coef2) + (1 << 13)) >> 14
     // inout2 = (  inout2 * coef1  + (1 << 13)) >> 14
     // The previous contents of inout1 are never read. All four products are
     // formed from the entry value of inout2 before it is overwritten.
 131 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
 132         smull           \tmp1\().4s, \inout2\().4h, \coef2
 133         smull2          \tmp2\().4s, \inout2\().8h, \coef2
 134         smull           \tmp3\().4s, \inout2\().4h, \coef1
 135         smull2          \tmp4\().4s, \inout2\().8h, \coef1
             // Negate in 32 bit, before rounding, to match the subtraction that
             // dmbutterfly_l performs on the unrounded products.
 136         neg             \tmp1\().4s, \tmp1\().4s
 137         neg             \tmp2\().4s, \tmp2\().4s
 138         rshrn           \inout2\().4h, \tmp3\().4s, #14
 139         rshrn2          \inout2\().8h, \tmp4\().4s, #14
 140         rshrn           \inout1\().4h, \tmp1\().4s, #14
 141         rshrn2          \inout1\().8h, \tmp2\().4s, #14
 142 .endm
 143
     // out1,out2 = in * coef
     // Widening multiply of all 8 halfword lanes of in by the by-element
     // operand coef; out1 gets the low four 32 bit products, out2 the high four.
     // out1 is written before the high half of in is read, so out1 must not
     // be the same register as in.
 144 .macro dsmull_h out1, out2, in, coef
 145         smull           \out1\().4s, \in\().4h, \coef
 146         smull2          \out2\().4s, \in\().8h, \coef
 147 .endm
 148
     // out = (in1,in2 + (1 << (shift - 1))) >> shift
     // Rounding narrowing shift of 2 x .4s registers into one .8h register;
     // in1 supplies the low four lanes, in2 the high four. shift is passed
     // including the leading hash sign, e.g. #14.
     // out may be the same register as in1 (idct16_quarter relies on this),
     // but not in2, since rshrn writes out before in2 is read.
 149 .macro drshrn_h out, in1, in2, shift
 150         rshrn           \out\().4h, \in1\().4s, \shift
 151         rshrn2          \out\().8h, \in2\().4s, \shift
 152 .endm
 153
 154
 155 // out1 = in1 + in2
 156 // out2 = in1 - in2
     // All operands are .8h registers. The sum is written first, so out2 may
     // be the same register as in1 or in2 (callers in this file rely on that),
     // but out1 must not alias an input.
 157 .macro butterfly_8h out1, out2, in1, in2
 158         add             \out1\().8h, \in1\().8h, \in2\().8h
 159         sub             \out2\().8h, \in1\().8h, \in2\().8h
 160 .endm
 161
 162 // out1 = in1 - in2
 163 // out2 = in1 + in2
     // Same as butterfly_8h with the roles of the outputs swapped. The
     // difference is written first, so out2 may be the same register as in1
     // or in2, but out1 must not alias an input.
 164 .macro butterfly_8h_r out1, out2, in1, in2
 165         sub             \out1\().8h, \in1\().8h, \in2\().8h
 166         add             \out2\().8h, \in1\().8h, \in2\().8h
 167 .endm
 168
 169 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
 170 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
 171 // out are 2 x .8h registers, in are 4 x .4s registers
     // in1/in3 hold the low four lanes, in2/in4 the high four.
     // tmp1/tmp2 are written before the subtractions read the inputs, so they
     // must not alias any input; tmp3/tmp4 may be the same registers as
     // in3/in4, since each is written by the last instruction that reads it.
     // All sums and differences are formed before out1/out2 are written, so
     // the outputs may alias the inputs.
 172 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
 173         add             \tmp1\().4s, \in1\().4s, \in3\().4s
 174         add             \tmp2\().4s, \in2\().4s, \in4\().4s
 175         sub             \tmp3\().4s, \in1\().4s, \in3\().4s
 176         sub             \tmp4\().4s, \in2\().4s, \in4\().4s
 177         rshrn           \out1\().4h, \tmp1\().4s,  #14
 178         rshrn2          \out1\().8h, \tmp2\().4s,  #14
 179         rshrn           \out2\().4h, \tmp3\().4s,  #14
 180         rshrn2          \out2\().8h, \tmp4\().4s,  #14
 181 .endm
 182
     // One pass of the 4-point iwht, in place on c0 .. c3 (.4h registers).
     // Uses only 16 bit add/sub and one arithmetic shift; no coefficients.
     // Clobbers v16 and v17. With a, b, c, d the entry values of c0 .. c3:
     //   s  = a + b            t  = c - d
     //   e  = (s - t) >> 1
     //   c2 = e - b            c1 = e - d
     //   c3 = t + c2           c0 = s - c1
     // The statement order matters: each register is updated in place and
     // later lines read the already updated values.
 183 .macro iwht4 c0, c1, c2, c3
 184         add             \c0\().4h, \c0\().4h, \c1\().4h
 185         sub             v17.4h,    \c2\().4h, \c3\().4h
 186         sub             v16.4h,    \c0\().4h, v17.4h
 187         sshr            v16.4h,    v16.4h,    #1
 188         sub             \c2\().4h, v16.4h,    \c1\().4h
 189         sub             \c1\().4h, v16.4h,    \c3\().4h
 190         add             \c3\().4h, v17.4h,    \c2\().4h
 191         sub             \c0\().4h, \c0\().4h, \c1\().4h
 192 .endm
 193
     // One pass of the 4-point idct, in place on c0 .. c3 (.4h registers).
     // Reads the coefficients v0.h[0], v0.h[2] and v0.h[3]; clobbers v16 .. v20
     // and v22. With a, b, c, d the entry values of c0 .. c3 and every product
     // rounded as (x + (1 << 13)) >> 14:
     //   v18 = (a + c) * v0.h[0]          v19 = (a - c) * v0.h[0]
     //   v22 = b * v0.h[3] + d * v0.h[2]  v20 = b * v0.h[2] - d * v0.h[3]
     //   c0 = v18 + v22   c1 = v19 + v20   c2 = v19 - v20   c3 = v18 - v22
 194 .macro idct4 c0, c1, c2, c3
 195         smull           v22.4s,    \c1\().4h, v0.h[3]
 196         smull           v20.4s,    \c1\().4h, v0.h[2]
 197         add             v16.4h,    \c0\().4h, \c2\().4h
 198         sub             v17.4h,    \c0\().4h, \c2\().4h
 199         smlal           v22.4s,    \c3\().4h, v0.h[2]
 200         smull           v18.4s,    v16.4h,    v0.h[0]
 201         smull           v19.4s,    v17.4h,    v0.h[0]
 202         smlsl           v20.4s,    \c3\().4h, v0.h[3]
 203         rshrn           v22.4h,    v22.4s,    #14
 204         rshrn           v18.4h,    v18.4s,    #14
 205         rshrn           v19.4h,    v19.4s,    #14
 206         rshrn           v20.4h,    v20.4s,    #14
 207         add             \c0\().4h, v18.4h,    v22.4h
 208         sub             \c3\().4h, v18.4h,    v22.4h
 209         add             \c1\().4h, v19.4h,    v20.4h
 210         sub             \c2\().4h, v19.4h,    v20.4h
 211 .endm
 212
     // One pass of the 4-point iadst, in place on c0 .. c3 (.4h registers).
     // Reads the coefficients v0.h[4] .. v0.h[7]; clobbers v16 .. v21.
     // With a, b, c, d the entry values of c0 .. c3:
     //   v16 = a * v0.h[4] + c * v0.h[5] + d * v0.h[6]
     //   v17 = a * v0.h[6] - c * v0.h[4] - d * v0.h[5]
     //   v18 = (a - c + d) * v0.h[7]      (a - c + d is formed in 16 bit in c0)
     //   v19 = b * v0.h[7]
     //   c0 = round14(v16 + v19)          c1 = round14(v17 + v19)
     //   c2 = round14(v18)                c3 = round14(v16 + v17 - v19)
     // where round14(x) = (x + (1 << 13)) >> 14.
 213 .macro iadst4 c0, c1, c2, c3
 214         smull           v16.4s,    \c0\().4h, v0.h[4]
 215         smlal           v16.4s,    \c2\().4h, v0.h[5]
 216         smlal           v16.4s,    \c3\().4h, v0.h[6]
 217         smull           v17.4s,    \c0\().4h, v0.h[6]
 218         smlsl           v17.4s,    \c2\().4h, v0.h[4]
 219         sub             \c0\().4h, \c0\().4h, \c2\().4h
 220         smlsl           v17.4s,    \c3\().4h, v0.h[5]
 221         add             \c0\().4h, \c0\().4h, \c3\().4h
 222         smull           v19.4s,    \c1\().4h, v0.h[7]
 223         smull           v18.4s,    \c0\().4h, v0.h[7]
 224         add             v20.4s,    v16.4s,    v19.4s
 225         add             v21.4s,    v17.4s,    v19.4s
 226         rshrn           \c0\().4h, v20.4s,    #14
 227         add             v16.4s,    v16.4s,    v17.4s
 228         rshrn           \c1\().4h, v21.4s,    #14
 229         sub             v16.4s,    v16.4s,    v19.4s
 230         rshrn           \c2\().4h, v18.4s,    #14
 231         rshrn           \c3\().4h, v16.4s,    #14
 232 .endm
 233
 234 // The public functions in this file have got the following signature:
 235 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 236
     // Emits the exported function ff_vp9_<txfm1>_<txfm2>_4x4_add_neon.
     // Following the signature above: x0 = dst, x1 = stride, x2 = block,
     // w3 = eob. The function runs txfm1, transposes, runs txfm2, then adds
     // the result to the 4x4 destination pixels with unsigned saturation.
     // The 32 bytes of coefficients at block are zeroed on the way.
     // For idct/idct, eob == 1 takes a DC-only shortcut.
 237 .macro itxfm_func4x4 txfm1, txfm2
 238 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
     // Load only the coefficients the selected transforms read: the idct row
     // into v0.h[0..3], the iadst row into v0.h[4..7], both for mixed pairs,
     // and nothing for iwht/iwht.
 239 .ifc \txfm1,\txfm2
 240 .ifc \txfm1,idct
 241         movrel          x4,  itxfm4_coeffs
 242         ld1             {v0.4h}, [x4]
 243 .endif
 244 .ifc \txfm1,iadst
 245         movrel          x4,  iadst4_coeffs
 246         ld1             {v0.d}[1], [x4]
 247 .endif
 248 .else
 249         movrel          x4,  itxfm4_coeffs
 250         ld1             {v0.8h}, [x4]
 251 .endif
 252
 253         movi            v31.8h, #0
 254 .ifc \txfm1\()_\txfm2,idct_idct
 255         cmp             w3,  #1
 256         b.ne            1f
 257         // DC-only for idct/idct
             // Scale block[0] by v0.h[0] twice (once per pass), each time with a
             // rounding shift by 14, clear block[0], and broadcast the result
             // to all four rows.
 258         ld1             {v2.h}[0], [x2]
 259         smull           v2.4s,  v2.4h, v0.h[0]
 260         rshrn           v2.4h,  v2.4s, #14
 261         smull           v2.4s,  v2.4h, v0.h[0]
 262         rshrn           v2.4h,  v2.4s, #14
 263         st1             {v31.h}[0], [x2]
 264         dup             v4.4h,  v2.h[0]
 265         mov             v5.16b, v4.16b
 266         mov             v6.16b, v4.16b
 267         mov             v7.16b, v4.16b
 268         b               2f
 269 .endif
 270
 271 1:
             // Load all 16 coefficients, then zero the first 16 bytes of block.
 272         ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
 273         st1             {v31.8h}, [x2], #16
 274
 275 .ifc \txfm1,iwht
 276         sshr            v4.4h,  v4.4h,  #2
 277         sshr            v5.4h,  v5.4h,  #2
 278         sshr            v6.4h,  v6.4h,  #2
 279         sshr            v7.4h,  v7.4h,  #2
 280 .endif
 281
 282         \txfm1\()4      v4,  v5,  v6,  v7
 283
             // Zero the remaining 16 bytes of block.
 284         st1             {v31.8h}, [x2], #16
 285         // Transpose 4x4 with 16 bit elements
 286         transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
 287
 288         \txfm2\()4      v4,  v5,  v6,  v7
 289 2:
             // Add the residual to four rows of four pixels. v0 .. v3 are reused
             // for pixel data from here on; the coefficients are no longer needed.
             // Only lane s[0] of each row register is loaded and stored, so the
             // upper lanes of the widening adds are never written back.
 290         ld1             {v0.s}[0],   [x0], x1
 291         ld1             {v1.s}[0],   [x0], x1
 292 .ifnc \txfm1,iwht
             // Final rounding shift by 4; not applied for iwht.
 293         srshr           v4.4h,  v4.4h,  #4
 294         srshr           v5.4h,  v5.4h,  #4
 295         srshr           v6.4h,  v6.4h,  #4
 296         srshr           v7.4h,  v7.4h,  #4
 297 .endif
 298         uaddw           v4.8h,  v4.8h,  v0.8b
 299         uaddw           v5.8h,  v5.8h,  v1.8b
 300         ld1             {v2.s}[0],   [x0], x1
 301         ld1             {v3.s}[0],   [x0], x1
 302         sqxtun          v0.8b,  v4.8h
 303         sqxtun          v1.8b,  v5.8h
             // Rewind dst by the four rows just read, for the stores below.
 304         sub             x0,  x0,  x1, lsl #2
 305
 306         uaddw           v6.8h,  v6.8h,  v2.8b
 307         uaddw           v7.8h,  v7.8h,  v3.8b
 308         st1             {v0.s}[0],  [x0], x1
 309         sqxtun          v2.8b,  v6.8h
 310         sqxtun          v3.8b,  v7.8h
 311
 312         st1             {v1.s}[0],  [x0], x1
 313         st1             {v2.s}[0],  [x0], x1
 314         st1             {v3.s}[0],  [x0], x1
 315
 316         ret
 317 endfunc
 318 .endm
 319
     // Instantiate the five 4x4 functions: every idct/iadst pairing, plus
     // iwht/iwht. Each line emits ff_vp9_<txfm1>_<txfm2>_4x4_add_neon.
 320 itxfm_func4x4 idct,  idct
 321 itxfm_func4x4 iadst, idct
 322 itxfm_func4x4 idct,  iadst
 323 itxfm_func4x4 iadst, iadst
 324 itxfm_func4x4 iwht,  iwht
 325
 326
     // One pass of the 8-point idct on v16 .. v23 (.8h registers, in place).
     // Reads the coefficients v0.h[0] and v0.h[2] .. v0.h[7]; clobbers v2 .. v7
     // and v24 .. v31. The trailing comments name the intermediate value held
     // in each register after the line has executed.
 327 .macro idct8
 328         dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
 329         dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
 330         dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
 331         dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
 332
 333         butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
 334         butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
 335         butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
 336         butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
 337
 338         dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
 339
 340         butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
 341         butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
 342         butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
 343         butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
 344 .endm
 345
     // One pass of the 8-point iadst on v16 .. v23 (.8h registers, in place).
     // Reads the coefficients v1.h[0] .. v1.h[7] as well as v0.h[0], v0.h[2]
     // and v0.h[3]; clobbers v2 .. v7 and v24 .. v31.
     // Several outputs are first produced negated and then fixed up with a
     // separate neg, as noted in the trailing comments.
 346 .macro iadst8
 347         dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
 348         dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
 349         dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
 350         dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
 351
 352         dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
 353         dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
 354         dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
 355         dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
 356
 357         butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
 358         butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
 359         neg             v23.8h,   v23.8h  // v23 = out[7]
 360
 361         dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
 362         neg             v19.8h,   v19.8h  // v19 = out[3]
 363
 364         dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
 365         dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
 366
 367         dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
 368         dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
 369         neg             v17.8h,   v17.8h  // v17 = out[1]
 370
 371         dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
 372         neg             v21.8h,   v21.8h  // v21 = out[5]
 373 .endm
 374
 375
     // Emits the exported function ff_vp9_<txfm1>_<txfm2>_8x8_add_neon.
     // Following the signature given earlier in this file: x0 = dst,
     // x1 = stride, x2 = block, w3 = eob. The function runs txfm1, transposes,
     // runs txfm2, then adds the result (rounded, >> 5) to the 8x8 destination
     // pixels with unsigned saturation. The 128 bytes of coefficients at block
     // are zeroed on the way. For idct/idct, eob == 1 takes a DC-only shortcut.
 376 .macro itxfm_func8x8 txfm1, txfm2
 377 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
 378         // The iadst also uses a few coefficients from
 379         // idct, so those always need to be loaded.
 380 .ifc \txfm1\()_\txfm2,idct_idct
 381         movrel          x4,  idct_coeffs
 382 .else
             // v1 = iadst8 coefficients; the post-increment leaves x4 pointing
             // at idct_coeffs, which directly follows in the same table.
 383         movrel          x4,  iadst8_coeffs
 384         ld1             {v1.8h}, [x4], #16
 385 .endif
 386         ld1             {v0.8h}, [x4]
 387
             // v2 .. v5 = 0, used below to clear the coefficient block. They are
             // clobbered as temporaries once the transforms run.
 388         movi            v2.8h, #0
 389         movi            v3.8h, #0
 390         movi            v4.8h, #0
 391         movi            v5.8h, #0
 392
 393 .ifc \txfm1\()_\txfm2,idct_idct
 394         cmp             w3,  #1
 395         b.ne            1f
 396         // DC-only for idct/idct
             // Scale block[0] by v0.h[0] twice (once per pass), each time with a
             // rounding shift by 14, clear block[0] using the still zero v3, and
             // broadcast the result to all eight rows.
 397         ld1             {v2.h}[0],  [x2]
 398         smull           v2.4s,  v2.4h, v0.h[0]
 399         rshrn           v2.4h,  v2.4s, #14
 400         smull           v2.4s,  v2.4h, v0.h[0]
 401         rshrn           v2.4h,  v2.4s, #14
 402         st1             {v3.h}[0],  [x2]
 403         dup             v16.8h,  v2.h[0]
 404         mov             v17.16b, v16.16b
 405         mov             v18.16b, v16.16b
 406         mov             v19.16b, v16.16b
 407         mov             v20.16b, v16.16b
 408         mov             v21.16b, v16.16b
 409         mov             v22.16b, v16.16b
 410         mov             v23.16b, v16.16b
 411         b               2f
 412 .endif
 413 1:
             // Load all 64 coefficients into v16 .. v23, rewind x2 and overwrite
             // the same 128 bytes with zeros.
 414         ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
 415         ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
 416         sub             x2,  x2,  #128
 417         st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
 418         st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
 419
 420         \txfm1\()8
 421
 422         // Transpose 8x8 with 16 bit elements
 423         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 424
 425         \txfm2\()8
 426 2:
             // x0 walks the destination for loads, x3 follows it for stores.
             // v0 .. v7 are reused for pixel rows from here on.
 427         mov             x3,  x0
 428         // Add into the destination
 429         ld1             {v0.8b},  [x0], x1
 430         srshr           v16.8h, v16.8h, #5
 431         ld1             {v1.8b},  [x0], x1
 432         srshr           v17.8h, v17.8h, #5
 433         ld1             {v2.8b},  [x0], x1
 434         srshr           v18.8h, v18.8h, #5
 435         uaddw           v16.8h, v16.8h, v0.8b
 436         ld1             {v3.8b},  [x0], x1
 437         srshr           v19.8h, v19.8h, #5
 438         uaddw           v17.8h, v17.8h, v1.8b
 439         ld1             {v4.8b},  [x0], x1
 440         srshr           v20.8h, v20.8h, #5
 441         uaddw           v18.8h, v18.8h, v2.8b
 442         sqxtun          v0.8b,  v16.8h
 443         ld1             {v5.8b},  [x0], x1
 444         srshr           v21.8h, v21.8h, #5
 445         uaddw           v19.8h, v19.8h, v3.8b
 446         sqxtun          v1.8b,  v17.8h
 447         ld1             {v6.8b},  [x0], x1
 448         srshr           v22.8h, v22.8h, #5
 449         uaddw           v20.8h, v20.8h, v4.8b
 450         sqxtun          v2.8b,  v18.8h
 451         ld1             {v7.8b},  [x0], x1
 452         srshr           v23.8h, v23.8h, #5
 453         uaddw           v21.8h, v21.8h, v5.8b
 454         sqxtun          v3.8b,  v19.8h
 455
 456         st1             {v0.8b},  [x3], x1
 457         uaddw           v22.8h, v22.8h, v6.8b
 458         st1             {v1.8b},  [x3], x1
 459         sqxtun          v4.8b,  v20.8h
 460         st1             {v2.8b},  [x3], x1
 461         uaddw           v23.8h, v23.8h, v7.8b
 462         st1             {v3.8b},  [x3], x1
 463         sqxtun          v5.8b,  v21.8h
 464         st1             {v4.8b},  [x3], x1
 465         sqxtun          v6.8b,  v22.8h
 466         st1             {v5.8b},  [x3], x1
 467         sqxtun          v7.8b,  v23.8h
 468
 469         st1             {v6.8b},  [x3], x1
 470         st1             {v7.8b},  [x3], x1
 471
 472         ret
 473 endfunc
 474 .endm
 475
     // Instantiate the four 8x8 functions, one per idct/iadst pairing.
     // Each line emits ff_vp9_<txfm1>_<txfm2>_8x8_add_neon.
 476 itxfm_func8x8 idct,  idct
 477 itxfm_func8x8 iadst, idct
 478 itxfm_func8x8 idct,  iadst
 479 itxfm_func8x8 iadst, iadst
 480
 481
     // DC-only 16x16 idct + add. Declared without export=1, unlike the public
     // functions above. Register use matches them: x0 = dst, x1 = stride,
     // x2 = block. Only block[0] is read; it is scaled by v0.h[0] twice with a
     // rounding shift by 14 each time, cleared in memory, rounded by a further
     // >> 6 and added with unsigned saturation to all 16 rows of 16 pixels.
     // Clobbers x3, x4, v0 .. v4 and v16 .. v19.
 482 function idct16x16_dc_add_neon
 483         movrel          x4,  idct_coeffs
 484         ld1             {v0.4h}, [x4]
 485
 486         movi            v1.4h,  #0
 487
 488         ld1             {v2.h}[0], [x2]
 489         smull           v2.4s,  v2.4h,  v0.h[0]
 490         rshrn           v2.4h,  v2.4s,  #14
 491         smull           v2.4s,  v2.4h,  v0.h[0]
 492         rshrn           v2.4h,  v2.4s,  #14
 493         dup             v2.8h,  v2.h[0]
 494         st1             {v1.h}[0], [x2]
 495
 496         srshr           v2.8h,  v2.8h,  #6
 497
             // x0 is the load pointer, x3 the store pointer, x4 the number of
             // rows left. Two rows are handled per iteration.
 498         mov             x3,  x0
 499         mov             x4,  #16
 500 1:
 501         // Loop to add the constant from v2 into all 16x16 outputs
             // The flags set by subs are consumed by b.ne at the bottom; none of
             // the instructions in between modify them.
 502         subs            x4,  x4,  #2
 503         ld1             {v3.16b},  [x0], x1
 504         ld1             {v4.16b},  [x0], x1
 505         uaddw           v16.8h, v2.8h,  v3.8b
 506         uaddw2          v17.8h, v2.8h,  v3.16b
 507         uaddw           v18.8h, v2.8h,  v4.8b
 508         uaddw2          v19.8h, v2.8h,  v4.16b
 509         sqxtun          v3.8b,  v16.8h
 510         sqxtun2         v3.16b, v17.8h
 511         sqxtun          v4.8b,  v18.8h
 512         sqxtun2         v4.16b, v19.8h
 513         st1             {v3.16b},  [x3], x1
 514         st1             {v4.16b},  [x3], x1
 515         b.ne            1b
 516
 517         ret
 518 endfunc
 519
     // Common tail of idct16, idct16_half and idct16_quarter.
     // On entry the registers hold the intermediates produced by those
     // functions: t0 .. t3 in v4, v5, v20, v28; t4 and t7 in v6, v7; t6a and
     // t5a in v22, v26; t8, t11 in v16, v24; t12, t15 in v17, v29; t9a, t14a
     // in v23, v25; t13a, t10a in v27, v21.
     // On exit v16 .. v31 hold out[0] .. out[15] in order.
     // Reads v0.h[0] (through dmbutterfly0); clobbers v2 .. v7.
     // The macro ends with ret, so it also returns from the enclosing function.
 520 .macro idct16_end
 521         butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
 522         butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
 523         butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
 524         butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
 525         butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
 526         butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
 527         butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
 528         butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
 529
 530         dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
 531         dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
 532
 533         butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
 534         butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
 535         butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
 536         butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
 537         butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
 538         butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
 539         butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
 540         butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
 541         ret
 542 .endm
 543
     // One pass of the 16-point idct on v16 .. v31 (.8h registers, in place).
     // Reads the coefficients v0.h[0], v0.h[2] .. v0.h[7] and v1.h[0] .. v1.h[7],
     // which the caller must have loaded; clobbers v2 .. v7.
     // Takes no arguments in general purpose registers. The return is
     // performed by the ret at the end of idct16_end.
 544 function idct16
 545         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
 546         dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
 547         dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
 548         dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
 549         dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
 550         dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
 551         dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
 552         dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 553
 554         butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
 555         butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
 556         butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
 557         butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
 558         butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
 559         butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
 560         butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
 561         butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
 562
 563         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
 564         dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
 565         dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
 566         idct16_end
 567 endfunc
 568
     // Variant of idct16 whose first stage uses the _h macros, which treat one
     // input of each pair as zero. Only v16 .. v23 are read as inputs; the
     // entry contents of v24 .. v31 are ignored (treated as zero).
     // From the second stage on it is identical to idct16. Reads the same
     // coefficients in v0/v1 and returns through the ret in idct16_end.
 569 function idct16_half
 570         dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
 571         dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
 572         dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
 573         dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
 574         dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
 575         dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
 576         dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
 577         dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 578
 579         butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
 580         butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
 581         butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
 582         butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
 583         butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
 584         butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
 585         butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
 586         butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
 587
 588         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
 589         dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
 590         dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
 591         idct16_end
 592 endfunc
 593
     // Variant of idct16 that reads only v16 .. v19 as inputs; the entry
     // contents of v20 .. v31 are ignored.
     // NOTE(review): presumably used when the other 12 inputs are known to be
     // zero (compare idct16_half) - confirm against the callers.
     // The first stages are reduced to single multiplies, and the registers
     // are arranged as idct16_end expects on entry. Reads coefficients from
     // v0 and v1 and returns through the ret in idct16_end.
 594 function idct16_quarter
             // Single input products; the v19 * v1.h[7] product is negated in
             // 32 bit before narrowing.
 595         dsmull_h        v24, v25, v19, v1.h[7]
 596         dsmull_h        v4,  v5,  v17, v1.h[0]
 597         dsmull_h        v7,  v6,  v18, v0.h[5]
 598         dsmull_h        v30, v31, v18, v0.h[4]
 599         neg             v24.4s,  v24.4s
 600         neg             v25.4s,  v25.4s
 601         dsmull_h        v29, v28, v17, v1.h[1]
 602         dsmull_h        v26, v27, v19, v1.h[6]
 603         dsmull_h        v22, v23, v16, v0.h[0]
 604         drshrn_h        v24, v24, v25, #14
 605         drshrn_h        v16, v4,  v5,  #14
 606         drshrn_h        v7,  v7,  v6,  #14
 607         drshrn_h        v6,  v30, v31, #14
 608         drshrn_h        v29, v29, v28, #14
 609         drshrn_h        v17, v26, v27, #14
 610         drshrn_h        v28, v22, v23, #14
 611
 612         dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
 613         dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
 614         neg             v22.4s,  v22.4s
 615         neg             v23.4s,  v23.4s
 616         drshrn_h        v27, v20, v21, #14
 617         drshrn_h        v21, v22, v23, #14
 618         drshrn_h        v23, v18, v19, #14
 619         drshrn_h        v25, v30, v31, #14
             // v28 holds the rounded v16 * v0.h[0]; it is copied into v4, v5 and
             // v20, the other three registers that idct16_end reads as t0 .. t3.
 620         mov             v4.16b,  v28.16b
 621         mov             v5.16b,  v28.16b
 622         dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
 623         mov             v20.16b, v28.16b
 624         idct16_end
 625 endfunc
 626
 627 function iadst16
     // 16-point inverse ADST over sixteen .8h vectors.
     //
     // In:   v16-v31 = the sixteen input vectors (each one is consumed by
     //                 the first round of dmbutterfly_l below)
     //       x11     = coefficient table for the first stage; the 16x16
     //                 entry points in this file point it at iadst16_coeffs
     //       x10     = coefficient table for the later stages; the 16x16
     //                 entry points in this file point it at idct_coeffs
     // Out:  v16-v31 = out[0] .. out[15], in that order
     // Clobbers: v0-v15. On return v0 holds the eight halfwords read from
     //       x10 and v1 the second eight halfwords read from x11, so a
     //       caller that needs other constants in v0/v1 must reload them.
     //
     // The trailing comments name the intermediate (tN / tNa) or the
     // output that each destination register holds after the instruction.
 628         ld1             {v0.8h,v1.8h}, [x11]
 629
 630         dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
 631         dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
 632         dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
 633         dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
 634         dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
 635
 636         dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
 637         dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
 638         dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
 639         dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
 640
 641         dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
 642         dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
 643         dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
 644         dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
 645
 646         dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
             // The last use of the x11 table is the line above; from here on
             // v0 holds the first eight halfwords of the table at x10.
 647         ld1             {v0.8h}, [x10]
 648         dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
 649         dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
 650         dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
 651
 652         dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
 653         dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
 654         dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
 655         butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
 656         dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
 657
 658         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
 659         butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
 660         dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
 661         dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
 662
 663         butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
 664         butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
 665
 666         dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
 667         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15
 668
 669         dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
 670         dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
 671         neg             v29.8h, v29.8h                   // v29 = out[13]
 672
 673         dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
 674         dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a
 675
             // out[0], -out[1], -out[15] and out[14] are parked in v2-v5:
             // their final registers (v16, v17, v31, v30) still hold live
             // intermediates until the dmbutterfly0 block below.
 676         butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
 677         butterfly_8h    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
 678
 679         dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
 680         neg             v19.8h, v19.8h                   // v19 = out[3]
 681         dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
 682
 683         butterfly_8h    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
 684         butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
 685
 686         dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
 687         dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
 688         dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
 689         dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
 690
 691         neg             v31.8h,  v5.8h                    // v31 = out[15]
 692         neg             v17.8h,  v3.8h                    // v17 = out[1]
 693
 694         mov             v16.16b, v2.16b                   // v16 = out[0]
 695         mov             v30.16b, v4.16b                   // v30 = out[14]
 696         ret
 697 endfunc
 698
 699 // Helper macros; we can't use these expressions directly within
 700 // e.g. .irp due to the extra concatenation \(). Therefore wrap
 701 // them in macros to allow using .irp below.
     //
     // In all four, i is a bare register number (e.g. 16 for v16).
     //   load:       v<i>.8h = eight halfwords at [src], then src += inc
     //   store:      eight halfwords at [dst] = v<i>.8h, then dst += inc
     //   movi_v:     movi into v<i> with the given arrangement suffix
     //               (size, e.g. .16b) and immediate
     //   load_clear: v<i>.8h = eight halfwords at [src], then that memory
     //               is overwritten with v2.8h and src += inc. The callers
     //               in this file zero v2 first, so the coefficients are
     //               cleared as they are consumed.
 702 .macro load i, src, inc
 703         ld1             {v\i\().8h},  [\src], \inc
 704 .endm
 705 .macro store i, dst, inc
 706         st1             {v\i\().8h},  [\dst], \inc
 707 .endm
 708 .macro movi_v i, size, imm
 709         movi            v\i\()\size,  \imm
 710 .endm
 711 .macro load_clear i, src, inc
 712         ld1             {v\i\().8h}, [\src]
 713         st1             {v2.8h},  [\src], \inc
 714 .endm
 715
 716 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
     // Add eight .8h coefficient vectors onto eight rows of eight
     // destination bytes each and store the result back:
     //   row = saturate_u8(row + ((coef + 32) >> 6))
     // srshr #6 is the rounding shift, uaddw widens the bytes and adds,
     // sqxtun narrows back to bytes with unsigned saturation.
     //
     // Rows for coef0/2/4/6 are addressed through x0 and rows for
     // coef1/3/5/7 through x3; both pointers step by x1 per access. The
     // callers in this file set x3 = x0 + stride and x1 = 2 * stride, so
     // the two pointers cover alternating rows.
     // Four rows are read through each pointer, both pointers are rewound
     // by 4 * x1, and the same rows are written back, which leaves x0 and
     // x3 4 * x1 past their starting values.
     //
     // tmp1/tmp2 are .8b scratch registers for the last two rows. They may
     // alias coef0/coef1 (the callers pass v16/v17 for both), because
     // coef0/coef1 are narrowed into v2/v3 before tmp1/tmp2 are loaded.
     // Clobbers: v2-v7, tmp1, tmp2, and every coef register.
     // The loads, arithmetic and stores are interleaved rather than
     // grouped (presumably for scheduling), so keep the order when editing.
 717         srshr           \coef0, \coef0, #6
 718         ld1             {v2.8b},  [x0], x1
 719         srshr           \coef1, \coef1, #6
 720         ld1             {v3.8b},  [x3], x1
 721         srshr           \coef2, \coef2, #6
 722         ld1             {v4.8b},  [x0], x1
 723         srshr           \coef3, \coef3, #6
 724         uaddw           \coef0, \coef0, v2.8b
 725         ld1             {v5.8b},  [x3], x1
 726         uaddw           \coef1, \coef1, v3.8b
 727         srshr           \coef4, \coef4, #6
 728         ld1             {v6.8b},  [x0], x1
 729         srshr           \coef5, \coef5, #6
 730         ld1             {v7.8b},  [x3], x1
 731         sqxtun          v2.8b,  \coef0
 732         srshr           \coef6, \coef6, #6
 733         sqxtun          v3.8b,  \coef1
 734         srshr           \coef7, \coef7, #6
 735         uaddw           \coef2, \coef2, v4.8b
             // coef0/coef1 are fully consumed by now, so tmp1/tmp2 may
             // overwrite them.
 736         ld1             {\tmp1},  [x0], x1
 737         uaddw           \coef3, \coef3, v5.8b
 738         ld1             {\tmp2},  [x3], x1
 739         sqxtun          v4.8b,  \coef2
             // Rewind both pointers over the four rows just read.
 740         sub             x0,  x0,  x1, lsl #2
 741         sub             x3,  x3,  x1, lsl #2
 742         sqxtun          v5.8b,  \coef3
 743         uaddw           \coef4, \coef4, v6.8b
 744         st1             {v2.8b},  [x0], x1
 745         uaddw           \coef5, \coef5, v7.8b
 746         st1             {v3.8b},  [x3], x1
 747         sqxtun          v6.8b,  \coef4
 748         st1             {v4.8b},  [x0], x1
 749         sqxtun          v7.8b,  \coef5
 750         st1             {v5.8b},  [x3], x1
 751         uaddw           \coef6, \coef6, \tmp1
 752         st1             {v6.8b},  [x0], x1
 753         uaddw           \coef7, \coef7, \tmp2
 754         st1             {v7.8b},  [x3], x1
 755         sqxtun          \tmp1,  \coef6
 756         sqxtun          \tmp2,  \coef7
 757         st1             {\tmp1},  [x0], x1
 758         st1             {\tmp2},  [x3], x1
 759 .endm
 760
 761 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 762 // transpose into a horizontal 16x8 slice and store.
 763 // x0 = dst (temp buffer)
 764 // x1 = slice offset
 765 // x2 = src
 766 // x9 = input stride
     // The source coefficients are overwritten with zero as they are read.
     // x14 holds the return address across the inner call, so it is
     // clobbered, as are v2/v3 and whatever the transform itself uses.
 767 .macro itxfm16_1d_funcs txfm
 768 function \txfm\()16_1d_8x16_pass1_neon
 769         mov             x14, x30
 770
             // v2 = 0 is the value load_clear writes back over the input.
 771         movi            v2.8h, #0
 772 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 773         load_clear      \i,  x2,  x9
 774 .endr
 775
 776         bl              \txfm\()16
 777
 778         // Do two 8x8 transposes. Originally, v16-v31 contain the
 779         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
 780         // transposed 8x8 blocks.
 781         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 782         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 783
 784         // Store the transposed 8x8 blocks horizontally.
 785         cmp             x1,  #8
 786         b.eq            1f
             // Each output row is 16 halfwords: 8 from the first block
             // followed by 8 from the second.
 787 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
 788         store           \i,  x0,  #16
 789 .endr
             // Return through ret rather than br: the address in x14 was
             // pushed by a bl, and ret carries the subroutine-return hint.
 790         ret             x14
 791 1:
 792         // Special case: For the last input column (x1 == 8),
 793         // which would be stored as the last row in the temp buffer,
 794         // don't store the first 8x8 block, but keep it in registers
 795         // for the first slice of the second pass (where it is the
 796         // last 8x8 block).
             // Skip the first 16 bytes of every row and fill only the
             // second 16 bytes.
 797 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
 798         add             x0,  x0,  #16
 799         store           \i,  x0,  #16
 800 .endr
             // Move the first block into v24-v31, where pass 2 expects the
             // rows it does not load when its slice offset is 0.
 801         mov             v24.16b, v16.16b
 802         mov             v25.16b, v17.16b
 803         mov             v26.16b, v18.16b
 804         mov             v27.16b, v19.16b
 805         mov             v28.16b, v20.16b
 806         mov             v29.16b, v21.16b
 807         mov             v30.16b, v22.16b
 808         mov             v31.16b, v23.16b
 809         ret             x14
 810 endfunc
 811
 812 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 813 // load the destination pixels (from a similar 8x16 slice), add and store back.
 814 // x0 = dst
 815 // x1 = dst stride
 816 // x2 = src (temp buffer)
 817 // x3 = slice offset
 818 // x9 = temp buffer stride
     // x1 and x3 are overwritten (x3 = second dst row, x1 = doubled stride)
     // and x14 holds the return address across the inner call.
 819 function \txfm\()16_1d_8x16_pass2_neon
 820         mov             x14, x30
 821 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 822         load            \i,  x2,  x9
 823 .endr
             // For slice offset 0 the last eight rows are not in the temp
             // buffer; pass 1 left them in v24-v31.
 824         cbz             x3,  1f
 825 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
 826         load            \i,  x2,  x9
 827 .endr
 828 1:
 829
             // Set up the two row pointers and the doubled stride that
             // load_add_store works with.
 830         add             x3,  x0,  x1
 831         lsl             x1,  x1,  #1
 832         bl              \txfm\()16
 833
 834         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
 835         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
 836
             // Subroutine return to the address saved in x14.
 837         ret             x14
 838 endfunc
 839 .endm
 840
 841 itxfm16_1d_funcs idct
 842 itxfm16_1d_funcs iadst
 843
 844 .macro itxfm_func16x16 txfm1, txfm2
     // Emits ff_vp9_<txfm1>_<txfm2>_16x16_add_neon.
     //
     // In:  x0 = dst, x1 = dst stride, x2 = coefficients, w3 = count used
     //      to pick a reduced code path.
     //      NOTE(review): the argument meanings are read off how the
     //      registers are used below (w3 presumably is the eob value);
     //      confirm against the C prototype.
     //
     // Pass 1 runs txfm1 over two 8x16 slices of the input into a 512 byte
     // temp buffer on the stack (16 rows of 32 bytes). Pass 2 runs txfm2
     // over two slices of that buffer and adds the result into dst.
     //
     // Register roles set up here for the helpers:
     //   x4/x5/x6 = saved dst / dst stride / coefficient pointer
     //   x9       = 32, the byte stride of one row of 16 halfwords
     //   x10      = idct_coeffs, x11 = iadst16_coeffs (when iadst is used)
     //   x15      = saved return address
 845 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 846 .ifc \txfm1\()_\txfm2,idct_idct
             // w3 == 1 goes to the dedicated DC-only routine, which returns
             // to our caller since x30 is untouched so far.
 847         cmp             w3,  #1
 848         b.eq            idct16x16_dc_add_neon
 849 .endif
 850         mov             x15, x30
 851         // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
 852 .ifnc \txfm1\()_\txfm2,idct_idct
 853         stp             d14, d15, [sp, #-0x10]!
 854         stp             d12, d13, [sp, #-0x10]!
 855         stp             d10, d11, [sp, #-0x10]!
 856         stp             d8,  d9,  [sp, #-0x10]!
 857 .endif
 858
 859         sub             sp,  sp,  #512
 860
 861         mov             x4,  x0
 862         mov             x5,  x1
 863         mov             x6,  x2
 864
 865         movrel          x10, idct_coeffs
 866 .ifnc \txfm1\()_\txfm2,idct_idct
 867         movrel          x11, iadst16_coeffs
 868 .endif
 869 .ifc \txfm1,idct
             // The first 16 halfwords of idct_coeffs are loaded into v0/v1
             // before the idct pass 1 calls below.
 870         ld1             {v0.8h,v1.8h}, [x10]
 871 .endif
 872         mov             x9,  #32
 873
 874 .ifc \txfm1\()_\txfm2,idct_idct
             // Small w3: continue in the reduced variants. They are entered
             // by branch, reuse the stack frame and registers set up above,
             // and return through x15 themselves.
 875         cmp             w3,  #10
 876         b.le            idct16x16_quarter_add_neon
 877         cmp             w3,  #38
 878         b.le            idct16x16_half_add_neon
 879 .endif
 880
 881 .irp i, 0, 8
 882         add             x0,  sp,  #(\i*32)
 883 .ifc \txfm1\()_\txfm2,idct_idct
 884 .if \i == 8
             // NOTE(review): w3 <= 38 has already branched away to
             // idct16x16_half_add_neon above, so this test and the
             // zero-fill at label 1 below look unreachable unless the
             // first pass 1 call modifies w3. Confirm before removing.
 885         cmp             w3,  #38
 886         b.le            1f
 887 .endif
 888 .endif
 889         mov             x1,  #\i
 890         add             x2,  x6,  #(\i*2)
 891         bl              \txfm1\()16_1d_8x16_pass1_neon
 892 .endr
 893 .ifc \txfm1\()_\txfm2,iadst_idct
             // iadst16 overwrote v0/v1 with its own tables; reload the idct
             // constants before the idct pass 2.
 894         ld1             {v0.8h,v1.8h}, [x10]
 895 .endif
 896
 897 .ifc \txfm1\()_\txfm2,idct_idct
 898         b               3f
 899 1:
 900         // Set v24-v31 to zero, for the in-register passthrough of
 901         // coefficients to pass 2. Since we only do two slices, this can
 902         // only ever happen for the second slice. So we only need to store
 903         // zeros to the temp buffer for the second half of the buffer.
 904         // Move x0 to the second half, and use x9 == 32 as increment.
 905         add             x0,  x0,  #16
 906 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
 907         movi_v          \i,  .16b, #0
             // v24 is zero from the first iteration on, so it serves as the
             // zero source for every row.
 908         st1             {v24.8h},  [x0], x9
 909 .endr
 910 3:
 911 .endif
 912
 913 .irp i, 0, 8
 914         add             x0,  x4,  #(\i)
 915         mov             x1,  x5
 916         add             x2,  sp,  #(\i*2)
 917         mov             x3,  #\i
 918         bl              \txfm2\()16_1d_8x16_pass2_neon
 919 .endr
 920
 921         add             sp,  sp,  #512
 922 .ifnc \txfm1\()_\txfm2,idct_idct
 923         ldp             d8,  d9,  [sp], 0x10
 924         ldp             d10, d11, [sp], 0x10
 925         ldp             d12, d13, [sp], 0x10
 926         ldp             d14, d15, [sp], 0x10
 927 .endif
             // Return through ret rather than br: x15 holds the link
             // address from the call into this function, and ret carries
             // the subroutine-return hint.
 928         ret             x15
 929 endfunc
 930 .endm
 931
 932 itxfm_func16x16 idct,  idct
 933 itxfm_func16x16 iadst, idct
 934 itxfm_func16x16 idct,  iadst
 935 itxfm_func16x16 iadst, iadst
 936
 937 function idct16_1d_8x16_pass1_quarter_neon
     // Pass 1 of the 16x16 idct for the case where only four input rows
     // are read (and cleared).
     // In:  x0 = dst (temp buffer), x2 = src,
     //      x9 = byte stride, used for both the src rows and the dst rows
     // Out: v16-v23 = first transposed 8x8 block, left in registers;
     //      v24-v27 stored at x0 + 16 with stride x9
     // Clobbers: x0, x2, x14 (saved return address), v2/v3, plus whatever
     //      idct16_quarter uses.
 938         mov             x14, x30
 939         movi            v2.8h, #0
 940 .irp i, 16, 17, 18, 19
 941         load_clear      \i,  x2,  x9
 942 .endr
 943
 944         bl              idct16_quarter
 945
 946         // Do two 8x8 transposes. Originally, v16-v31 contain the
 947         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
 948         // transposed 8x8 blocks.
 949         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 950         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 951
 952         // Store the transposed 8x8 blocks horizontally.
 953         // The first 8x8 block is kept in registers for the second pass,
 954         // store the rest in the temp buffer.
 955         // Since only a 4x4 part of the input was nonzero, this means that
 956         // only 4 rows are nonzero after transposing, and the second pass
 957         // only reads the topmost 4 rows. Therefore only store the topmost
 958         // 4 rows.
 959         add             x0,  x0,  #16
 960 .irp i, 24, 25, 26, 27
 961         store           \i,  x0,  x9
 962 .endr
             // Return through ret rather than br: the address in x14 was
             // pushed by a bl, and ret carries the subroutine-return hint.
 963         ret             x14
 964 endfunc
 965
 966 function idct16_1d_8x16_pass2_quarter_neon
     // Pass 2 of the 16x16 idct when only the first four rows are used.
     // In:  x0 = dst, x1 = dst stride, x2 = src (temp buffer),
     //      x3 = slice offset, x9 = temp buffer stride
     // For slice offset 0 nothing is loaded: the rows are taken from the
     // registers the quarter pass 1 left behind.
     // Clobbers: x0-x3, x14 (saved return address), plus whatever
     //      idct16_quarter and load_add_store use.
 967         mov             x14, x30
 968         cbz             x3,  1f
 969 .irp i, 16, 17, 18, 19
 970         load            \i,  x2,  x9
 971 .endr
 972 1:
 973
             // Set up the two row pointers and the doubled stride that
             // load_add_store works with.
 974         add             x3,  x0,  x1
 975         lsl             x1,  x1,  #1
 976         bl              idct16_quarter
 977
 978         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
 979         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
 980
             // Return through ret rather than br: the address in x14 was
             // pushed by a bl, and ret carries the subroutine-return hint.
 981         ret             x14
 982 endfunc
 983
 984 function idct16_1d_8x16_pass1_half_neon
     // Pass 1 of the 16x16 idct for the case where only eight input rows
     // are read (and cleared).
     // In:  x0 = dst (temp buffer), x2 = src,
     //      x9 = byte stride, used for both the src rows and the dst rows
     // Out: v16-v23 = first transposed 8x8 block, left in registers;
     //      v24-v31 stored at x0 + 16 with stride x9
     // Clobbers: x0, x2, x14 (saved return address), v2/v3, plus whatever
     //      idct16_half uses.
 985         mov             x14, x30
 986         movi            v2.8h, #0
 987 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 988         load_clear      \i,  x2,  x9
 989 .endr
 990
 991         bl              idct16_half
 992
 993         // Do two 8x8 transposes. Originally, v16-v31 contain the
 994         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
 995         // transposed 8x8 blocks.
 996         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 997         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 998
 999         // Store the transposed 8x8 blocks horizontally.
1000         // The first 8x8 block is kept in registers for the second pass,
1001         // store the rest in the temp buffer.
1002         add             x0,  x0,  #16
1003 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
1004         store           \i,  x0,  x9
1005 .endr
             // Return through ret rather than br: the address in x14 was
             // pushed by a bl, and ret carries the subroutine-return hint.
1006         ret             x14
1007 endfunc
1008
1009 function idct16_1d_8x16_pass2_half_neon
     // Pass 2 of the 16x16 idct when only the first eight rows are used.
     // In:  x0 = dst, x1 = dst stride, x2 = src (temp buffer),
     //      x3 = slice offset, x9 = temp buffer stride
     // For slice offset 0 nothing is loaded: the rows are taken from the
     // registers the half pass 1 left behind.
     // Clobbers: x0-x3, x14 (saved return address), plus whatever
     //      idct16_half and load_add_store use.
1010         mov             x14, x30
1011         cbz             x3,  1f
1012 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1013         load            \i,  x2,  x9
1014 .endr
1015 1:
1016
             // Set up the two row pointers and the doubled stride that
             // load_add_store works with.
1017         add             x3,  x0,  x1
1018         lsl             x1,  x1,  #1
1019         bl              idct16_half
1020
1021         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
1022         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
1023
             // Return through ret rather than br: the address in x14 was
             // pushed by a bl, and ret carries the subroutine-return hint.
1024         ret             x14
1025 endfunc
1026
1027 .macro idct16_partial size
     // Emits idct16x16_<size>_add_neon, the continuation of the idct_idct
     // 16x16 entry point for small w3. It is entered by a branch, not a
     // call, and relies on what that entry point has already set up:
     //   x4/x5/x6 = dst / dst stride / coefficient pointer
     //   x9 = 32, 512 bytes reserved on the stack, x15 = return address
     // Only one pass 1 slice is run; pass 2 still covers both slices.
     // The 512 bytes of stack are released here before returning.
1028 function idct16x16_\size\()_add_neon
1029         add             x0,  sp,  #(0*32)
1030         add             x2,  x6,  #(0*2)
1031         bl              idct16_1d_8x16_pass1_\size\()_neon
1032 .irp i, 0, 8
1033         add             x0,  x4,  #(\i)
1034         mov             x1,  x5
1035         add             x2,  sp,  #(\i*2)
1036         mov             x3,  #\i
1037         bl              idct16_1d_8x16_pass2_\size\()_neon
1038 .endr
1039
1040         add             sp,  sp,  #512
             // Return through ret rather than br: x15 holds the link
             // address of the original call, and ret carries the
             // subroutine-return hint.
1041         ret             x15
1042 endfunc
1043 .endm
1044
1045 idct16_partial quarter
1046 idct16_partial half
1047
1048 function idct32x32_dc_add_neon
     // 32x32 inverse transform for a block with only a DC coefficient:
     // derive one constant from the coefficient and add it to all 32x32
     // destination bytes with unsigned saturation.
     // In:  x0 = dst, x1 = dst stride in bytes,
     //      x2 = coefficients (only the first halfword is read; it is
     //           overwritten with zero)
     // Clobbers: x0, x3, x4, v0-v4, v16-v23, flags
1049         movrel          x4,  idct_coeffs
1050         ld1             {v0.4h}, [x4]
1051
1052         movi            v1.4h,  #0
1053
1054         ld1             {v2.h}[0], [x2]
             // Multiply by idct_coeffs[0] with a rounding shift by 14, twice
             // (presumably once per transform pass).
1055         smull           v2.4s,  v2.4h,  v0.h[0]
1056         rshrn           v2.4h,  v2.4s,  #14
1057         smull           v2.4s,  v2.4h,  v0.h[0]
1058         rshrn           v2.4h,  v2.4s,  #14
1059         dup             v2.8h,  v2.h[0]
             // Clear the consumed coefficient in memory.
1060         st1             {v1.h}[0], [x2]
1061
             // v0 = the constant added to every pixel: rounding shift by 6.
1062         srshr           v0.8h,  v2.8h,  #6
1063
             // x0 is the read pointer and x3 the write pointer; x4 counts
             // the 32 rows, two per iteration.
1064         mov             x3,  x0
1065         mov             x4,  #32
1066 1:
1067         // Loop to add the constant v0 into all 32x32 outputs
1068         subs            x4,  x4,  #2
1069         ld1             {v1.16b,v2.16b},  [x0], x1
1070         uaddw           v16.8h, v0.8h,  v1.8b
1071         uaddw2          v17.8h, v0.8h,  v1.16b
1072         ld1             {v3.16b,v4.16b},  [x0], x1
1073         uaddw           v18.8h, v0.8h,  v2.8b
1074         uaddw2          v19.8h, v0.8h,  v2.16b
1075         uaddw           v20.8h, v0.8h,  v3.8b
1076         uaddw2          v21.8h, v0.8h,  v3.16b
1077         uaddw           v22.8h, v0.8h,  v4.8b
1078         uaddw2          v23.8h, v0.8h,  v4.16b
1079         sqxtun          v1.8b,  v16.8h
1080         sqxtun2         v1.16b, v17.8h
1081         sqxtun          v2.8b,  v18.8h
1082         sqxtun2         v2.16b, v19.8h
1083         sqxtun          v3.8b,  v20.8h
1084         sqxtun2         v3.16b, v21.8h
1085         st1             {v1.16b,v2.16b},  [x3], x1
1086         sqxtun          v4.8b,  v22.8h
1087         sqxtun2         v4.16b, v23.8h
1088         st1             {v3.16b,v4.16b},  [x3], x1
             // Flags are still those of the subs at the top of the loop;
             // nothing in between sets them.
1089         b.ne            1b
1090
1091         ret
1092 endfunc
1093
1094 .macro idct32_end
     // Shared tail of the idct32_odd variants in this file.
     // In:  the intermediates that the last stage of each variant leaves
     //      in v4-v7 and v20-v31, and coefficients in v0 (lanes 2 and 3
     //      are referenced explicitly here).
     // Out: per the trailing comments, v16-v31 end up holding the
     //      t16 .. t31 family in order (v16 = t16, v17 = t17a, ...,
     //      v31 = t31).
     // Clobbers: v2-v7 as temporaries.
     // The macro ends in ret, so a function that expands it returns from
     // here and needs no ret of its own.
1095         butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1096         butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1097         butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1098         butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1099         butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1100         butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1101         butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
1102         butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1103
1104         dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1105         dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
1106         dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1107         dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1108
1109         butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1110         butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1111         butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1112         butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1113         butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1114         butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1115         butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1116         butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
1117
1118         dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
1119         dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
1120         dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
1121         dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
1122         ret
1123 .endm
1124
1125 function idct32_odd
     // Odd half of the 32-point idct, full version.
     // In:  v16-v31 = the sixteen input vectors (the 8x32 callers load the
     //      odd input rows IN(1), IN(3), ..., IN(31) into them),
     //      coefficients in v8, v9 and v0 (set up outside this function).
     // Out: v16-v31, as produced by idct32_end.
     // Clobbers: v2-v7.
     // There is no explicit ret: idct32_end ends in one.
1126         dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1127         dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1128         dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1129         dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1130         dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1131         dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1132         dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1133         dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1134
1135         butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1136         butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1137         butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1138         butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1139         butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
1140         butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
1141         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
1142         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
1143
1144         dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1145         dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1146         dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1147         dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1148         idct32_end
1149 endfunc
1150
1151 function idct32_odd_half
     // Odd half of the 32-point idct, variant used by the _half 8x32
     // functions, which load only v16-v23 before calling it.
     // It differs from idct32_odd only in the first stage, which uses the
     // dmbutterfly_h1 / dmbutterfly_h2 forms instead of dmbutterfly
     // (presumably these exploit one input of each pair being zero;
     // confirm against the macro definitions).
     // Coefficients are expected in v8, v9 and v0, as for idct32_odd.
     // Out: v16-v31, as produced by idct32_end.
     // Clobbers: v2-v7.
     // There is no explicit ret: idct32_end ends in one.
1152         dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1153         dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1154         dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1155         dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1156         dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1157         dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1158         dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1159         dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1160
1161         butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1162         butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1163         butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1164         butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1165         butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
1166         butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
1167         butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
1168         butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
1169
1170         dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1171         dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1172         dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1173         dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1174         idct32_end
1175 endfunc
1176
1177 function idct32_odd_quarter
     // Odd half of the 32-point idct, variant used by the _quarter 8x32
     // functions. Only v16-v19 are read as inputs; the first stage is
     // done with single multiplies (dsmull_h) of those four vectors.
     // Coefficients are expected in v8, v9 and v0, as for idct32_odd.
     // Out: v16-v31, as produced by idct32_end.
     // Clobbers: v2-v7.
     // There is no explicit ret: idct32_end ends in one.
1178         dsmull_h        v4,  v5,  v16, v8.h[0]
1179         dsmull_h        v28, v29, v19, v8.h[7]
1180         dsmull_h        v30, v31, v16, v8.h[1]
1181         dsmull_h        v22, v23, v17, v9.h[6]
1182         dsmull_h        v7,  v6,  v17, v9.h[7]
1183         dsmull_h        v26, v27, v19, v8.h[6]
1184         dsmull_h        v20, v21, v18, v9.h[0]
1185         dsmull_h        v24, v25, v18, v9.h[1]
1186
             // Two of the products enter with a negative sign; negate them
             // while they are still 32 bit.
1187         neg             v28.4s, v28.4s
1188         neg             v29.4s, v29.4s
1189         neg             v7.4s,  v7.4s
1190         neg             v6.4s,  v6.4s
1191
             // Narrow all eight products back to .8h with a rounding shift
             // by 14.
1192         drshrn_h        v4,  v4,  v5,  #14
1193         drshrn_h        v5,  v28, v29, #14
1194         drshrn_h        v29, v30, v31, #14
1195         drshrn_h        v28, v22, v23, #14
1196         drshrn_h        v7,  v7,  v6,  #14
1197         drshrn_h        v31, v26, v27, #14
1198         drshrn_h        v6,  v20, v21, #14
1199         drshrn_h        v30, v24, v25, #14
1200
             // Second stage: widening butterflies, each narrowed with
             // drshrn_h; one output of the second and fourth pair is
             // negated before narrowing.
1201         dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
1202         dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
1203         drshrn_h        v23, v16, v17, #14
1204         drshrn_h        v24, v18, v19, #14
1205         neg             v20.4s, v20.4s
1206         neg             v21.4s, v21.4s
1207         drshrn_h        v27, v27, v26, #14
1208         drshrn_h        v20, v20, v21, #14
1209         dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
1210         drshrn_h        v21, v16, v17, #14
1211         drshrn_h        v26, v18, v19, #14
1212         dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
1213         drshrn_h        v25, v16, v17, #14
1214         neg             v18.4s, v18.4s
1215         neg             v19.4s, v19.4s
1216         drshrn_h        v22, v18, v19, #14
1217
1218         idct32_end
1219 endfunc
1220
1221 .macro idct32_funcs suffix
1222 // Do a 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
1223 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1224 // a normal IDCT16 with every other input component (the even ones, with
1225 // each output written twice), followed by a separate 16-point IDCT
1226 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
1227 // x0 = dst (temp buffer)
1228 // x1 = unused
1229 // x2 = src
1230 // x9 = double input stride
     // The source coefficients are overwritten with zero as they are read.
     // Depending on the suffix, 16 (none), 8 (_half) or 4 (_quarter) rows
     // are read for each of the two sub-transforms.
     // Clobbers: x0, x2, x14 (saved return address), v2-v7, plus whatever
     // the called transforms use.
1231 function idct32_1d_8x32_pass1\suffix\()_neon
1232         mov             x14, x30
1233         movi            v2.8h,  #0
1234
1235         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1236 .ifb \suffix
1237 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1238         load_clear      \i, x2, x9
1239 .endr
1240 .endif
1241 .ifc \suffix,_quarter
1242 .irp i, 16, 17, 18, 19
1243         load_clear      \i, x2, x9
1244 .endr
1245 .endif
1246 .ifc \suffix,_half
1247 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1248         load_clear      \i, x2, x9
1249 .endr
1250 .endif
1251
1252         bl              idct16\suffix
1253
1254         // Do two 8x8 transposes. Originally, v16-v31 contain the
1255         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
1256         // two transposed 8x8 blocks.
1257         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
1258         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
1259
1260         // Store the registers a, b horizontally, followed by the
1261         // same registers b, a mirrored.
1262 .macro store_rev a, b
1263         // There's no rev128 instruction, but we reverse each 64 bit
1264         // half, and then flip them using an ext with 8 bytes offset.
1265         rev64           v3.8h, \b
1266         st1             {\a},  [x0], #16
1267         rev64           v2.8h, \a
1268         ext             v3.16b, v3.16b, v3.16b, #8
1269         st1             {\b},  [x0], #16
1270         ext             v2.16b, v2.16b, v2.16b, #8
1271         st1             {v3.8h},  [x0], #16
1272         st1             {v2.8h},  [x0], #16
1273 .endm
1274         store_rev       v16.8h, v24.8h
1275         store_rev       v17.8h, v25.8h
1276         store_rev       v18.8h, v26.8h
1277         store_rev       v19.8h, v27.8h
1278         store_rev       v20.8h, v28.8h
1279         store_rev       v21.8h, v29.8h
1280         store_rev       v22.8h, v30.8h
1281         store_rev       v23.8h, v31.8h
             // Rewind x0 over the 8 rows of 64 bytes just written, so the
             // odd half below can accumulate into the same rows.
1282         sub             x0,  x0,  #512
1283 .purgem store_rev
1284
1285         // Move x2 back to the start of the input, and move
1286         // to the first odd row
1287 .ifb \suffix
1288         sub             x2,  x2,  x9, lsl #4
1289 .endif
1290 .ifc \suffix,_quarter
1291         sub             x2,  x2,  x9, lsl #2
1292 .endif
1293 .ifc \suffix,_half
1294         sub             x2,  x2,  x9, lsl #3
1295 .endif
1296         add             x2,  x2,  #64
1297
             // v2 was used as scratch by the transposes and store_rev, so
             // it has to be zeroed again before load_clear.
1298         movi            v2.8h,  #0
1299         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1300 .ifb \suffix
1301 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1302         load_clear      \i, x2, x9
1303 .endr
1304 .endif
1305 .ifc \suffix,_quarter
1306 .irp i, 16, 17, 18, 19
1307         load_clear      \i, x2, x9
1308 .endr
1309 .endif
1310 .ifc \suffix,_half
1311 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1312         load_clear      \i, x2, x9
1313 .endr
1314 .endif
1315
1316         bl              idct32_odd\suffix
1317
1318         transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
1319         transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
1320
1321         // Store the registers a, b horizontally,
1322         // adding into the output first, and the mirrored,
1323         // subtracted from the output.
1324 .macro store_rev a, b
1325         ld1             {v4.8h},  [x0]
1326         rev64           v3.8h, \b
1327         add             v4.8h, v4.8h, \a
1328         rev64           v2.8h, \a
1329         st1             {v4.8h},  [x0], #16
1330         ext             v3.16b, v3.16b, v3.16b, #8
1331         ld1             {v5.8h},  [x0]
1332         ext             v2.16b, v2.16b, v2.16b, #8
1333         add             v5.8h, v5.8h, \b
1334         st1             {v5.8h},  [x0], #16
1335         ld1             {v6.8h},  [x0]
1336         sub             v6.8h, v6.8h, v3.8h
1337         st1             {v6.8h},  [x0], #16
1338         ld1             {v7.8h},  [x0]
1339         sub             v7.8h, v7.8h, v2.8h
1340         st1             {v7.8h},  [x0], #16
1341 .endm
1342
1343         store_rev       v31.8h, v23.8h
1344         store_rev       v30.8h, v22.8h
1345         store_rev       v29.8h, v21.8h
1346         store_rev       v28.8h, v20.8h
1347         store_rev       v27.8h, v19.8h
1348         store_rev       v26.8h, v18.8h
1349         store_rev       v25.8h, v17.8h
1350         store_rev       v24.8h, v16.8h
1351 .purgem store_rev
             // Return through ret rather than br: the address in x14 was
             // pushed by a bl, and ret carries the subroutine-return hint.
1352         ret             x14
1353 endfunc
1354
1355 // Second pass for one 8 pixel wide slice. This is mostly the same as
1356 // 8x32_pass1, but without the transpose; it uses the source as temp buffer
1357 // between the idct16 and idct32_odd calls, and adds the result into the destination.
1358 // x0 = dst
1359 // x1 = dst stride
1360 // x2 = src (temp buffer)
1361 // x7 = negative double temp buffer stride
1362 // x9 = double temp buffer stride
     // x14 is used for the return address. v2-v7, v10-v11 and v16-v31 are
     // overwritten here; x0 ends up 32 dst rows further on.
     // NOTE(review): load/store/idct16/idct32_odd are defined elsewhere in the
     // file; the pointer rewinds below imply that load and store advance x2 by
     // x9 per register - confirm against their definitions.
1363 function idct32_1d_8x32_pass2\suffix\()_neon
1364         mov             x14, x30                    // keep the return address, bl below overwrites x30
1365         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
             // (the _quarter and _half variants only load the first 4 and 8 of
             // these, respectively); x2 is rewound to its start afterwards.
1366 .ifb \suffix
1367 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1368         load            \i, x2, x9
1369 .endr
1370         sub             x2,  x2,  x9, lsl #4        // rewind by 16 strides
1371 .endif
1372 .ifc \suffix,_quarter
1373 .irp i, 16, 17, 18, 19
1374         load            \i, x2, x9
1375 .endr
1376         sub             x2,  x2,  x9, lsl #2        // rewind by 4 strides
1377 .endif
1378 .ifc \suffix,_half
1379 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1380         load            \i, x2, x9
1381 .endr
1382         sub             x2,  x2,  x9, lsl #3        // rewind by 8 strides
1383 .endif
1384
1385         bl              idct16\suffix
1386
             // Park all 16 registers in the temp buffer, in the slots the
             // inputs were read from; they are read back by load_acc_store.
1387 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1388         store           \i, x2, x9
1389 .endr
1390
1391         sub             x2,  x2,  x9, lsl #4        // rewind by 16 strides
1392         add             x2,  x2,  #64               // move to the interleaved (odd) lines
1393
1394         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
             // (again only the first 4 or 8 for _quarter and _half)
1395 .ifb \suffix
1396 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1397         load            \i, x2, x9
1398 .endr
1399         sub             x2,  x2,  x9, lsl #4
1400 .endif
1401 .ifc \suffix,_quarter
1402 .irp i, 16, 17, 18, 19
1403         load            \i, x2, x9
1404 .endr
1405         sub             x2,  x2,  x9, lsl #2
1406 .endif
1407 .ifc \suffix,_half
1408 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1409         load            \i, x2, x9
1410 .endr
1411         sub             x2,  x2,  x9, lsl #3
1412 .endif
1413         sub             x2,  x2,  #64               // back to the lines holding the parked idct16 values
1414
1415         bl              idct32_odd\suffix
1416
             // Combine four registers a-d with four parked lines from the
             // temp buffer and add the result into four rows of dst.
             // neg == 0: walk the temp buffer forwards (x9) and add a-d;
             // otherwise walk it backwards (x7) and subtract a-d from it.
             // x0 ends up four dst rows further on; v2-v7, v10, v11 are scratch.
1417 .macro load_acc_store a, b, c, d, neg=0
1418 .if \neg == 0
1419         ld1             {v4.8h},  [x2], x9
1420         ld1             {v5.8h},  [x2], x9
1421         add             v4.8h, v4.8h, \a
1422         ld1             {v6.8h},  [x2], x9
1423         add             v5.8h, v5.8h, \b
1424         ld1             {v7.8h},  [x2], x9
1425         add             v6.8h, v6.8h, \c
1426         add             v7.8h, v7.8h, \d
1427 .else
1428         ld1             {v4.8h},  [x2], x7
1429         ld1             {v5.8h},  [x2], x7
1430         sub             v4.8h, v4.8h, \a
1431         ld1             {v6.8h},  [x2], x7
1432         sub             v5.8h, v5.8h, \b
1433         ld1             {v7.8h},  [x2], x7
1434         sub             v6.8h, v6.8h, \c
1435         sub             v7.8h, v7.8h, \d
1436 .endif
1437         ld1             {v10.8b}, [x0], x1          // four rows of 8 dst pixels: v10, v11, v2, v3
1438         ld1             {v11.8b}, [x0], x1
1439         srshr           v4.8h, v4.8h, #6            // rounding shift right by 6
1440         ld1             {v2.8b}, [x0], x1
1441         srshr           v5.8h, v5.8h, #6
1442         uaddw           v4.8h, v4.8h, v10.8b        // add the dst pixels, widened to 16 bit
1443         ld1             {v3.8b}, [x0], x1
1444         srshr           v6.8h, v6.8h, #6
1445         uaddw           v5.8h, v5.8h, v11.8b
1446         srshr           v7.8h, v7.8h, #6
1447         sub             x0,  x0,  x1, lsl #2        // back to the first of the four rows, for the stores
1448         uaddw           v6.8h, v6.8h, v2.8b
1449         sqxtun          v4.8b, v4.8h                // narrow with saturation to unsigned 8 bit
1450         uaddw           v7.8h, v7.8h, v3.8b
1451         sqxtun          v5.8b, v5.8h
1452         st1             {v4.8b}, [x0], x1
1453         sqxtun          v6.8b, v6.8h
1454         st1             {v5.8b}, [x0], x1
1455         sqxtun          v7.8b, v7.8h
1456         st1             {v6.8b}, [x0], x1
1457         st1             {v7.8b}, [x0], x1
1458 .endm
             // First 16 dst rows: parked lines in ascending order plus
             // v31 down to v16.
1459         load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
1460         load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
1461         load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
1462         load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
1463         sub             x2,  x2,  x9                // x2 is one stride past the last parked line; step back onto it
             // Last 16 dst rows: parked lines in descending order minus
             // v16 up to v31.
1464         load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
1465         load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
1466         load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
1467         load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
1468 .purgem load_acc_store
1469         br              x14                         // return via the address saved on entry
1470 endfunc
1471 .endm
1472
1473 idct32_funcs                    // empty suffix: the variants that load all 16 registers per input set
1474 idct32_funcs _quarter           // _quarter variants: pass 2 loads only 4 registers per input set
1475 idct32_funcs _half              // _half variants: pass 2 loads only 8 registers per input set
1476
1477 const min_eob_idct_idct_32, align=4
             // Thresholds that ff_vp9_idct_idct_32x32_add_neon compares w3
             // against before the pass 1 calls for i = 8, 16 and 24: when w3
             // is <= the entry, that call and the following ones are replaced
             // by zero filling the temp buffer. The leading 0 is never read
             // there, since the table is addressed with an offset of 2.
1478         .short  0, 34, 135, 336
1479 endconst
1480
1481 function ff_vp9_idct_idct_32x32_add_neon, export=1
             // Register use as visible below: x0 and x1 are handed on to pass 2
             // as dst and dst stride, x2 is the source for pass 1, and w3 is
             // compared against the min_eob thresholds (presumably the eob of
             // the block - confirm against the C prototype).
             // A 2048 byte temp buffer is allocated on the stack, and d8-d11
             // are saved and restored around the transform.
1482         cmp             w3,  #1
1483         b.eq            idct32x32_dc_add_neon       // w3 == 1: tail-branch to a routine defined elsewhere
1484
1485         movrel          x10, idct_coeffs
1486
1487         mov             x15, x30                    // return address for the final br; bl overwrites x30
1488
1489         stp             d10, d11, [sp, #-0x10]!     // v10/v11 are scratch registers in pass 2
1490         stp             d8,  d9,  [sp, #-0x10]!     // v8/v9 get coefficients loaded below
1491
1492         sub             sp,  sp,  #2048             // temp buffer between the passes (2048 = 32*32*2 bytes)
1493
1494         mov             x4,  x0                     // x0-x2 are reused as arguments for the pass functions
1495         mov             x5,  x1
1496         mov             x6,  x2
1497
1498         // Double stride of the input, since we only read every other line
1499         mov             x9,  #128
1500         neg             x7,  x9
1501
1502         ld1             {v0.8h,v1.8h}, [x10], #32   // first 16 halfwords of idct_coeffs
1503         ld1             {v8.8h,v9.8h}, [x10]        // the following 16 halfwords
1504
             // Small w3: continue in the reduced variants. These are branches,
             // not calls; the targets release this stack frame and return via
             // x15 themselves.
1505         cmp             w3,  #34
1506         b.le            idct32x32_quarter_add_neon
1507         cmp             w3,  #135
1508         b.le            idct32x32_half_add_neon
1509
             // The third argument is presumably a byte offset, so that x12
             // starts at the second table entry - confirm against movrel.
1510         movrel          x12, min_eob_idct_idct_32, 2
1511
             // Pass 1, one call per group of 8 (i = 0, 8, 16, 24). For i > 0
             // the next threshold is read first; if w3 <= threshold, the rest
             // of the temp buffer is zero filled at 1: instead.
1512 .irp i, 0, 8, 16, 24
1513         add             x0,  sp,  #(\i*64)          // output position in the temp buffer
1514 .if \i > 0
1515         ldrh            w1,  [x12], #2
1516         cmp             w3,  w1
1517         mov             x1,  #(32 - \i)/4           // iteration count for the loop at 2: (mov leaves the flags intact)
1518         b.le            1f
1519 .endif
1520         add             x2,  x6,  #(\i*2)           // input position, i halfwords into the source
1521         bl              idct32_1d_8x32_pass1_neon
1522 .endr
1523         b               3f
1524
1525 1:
1526         // Write zeros to the temp buffer for pass 2
             // x0 = first byte to clear, x1 = number of 256 byte iterations
1527         movi            v16.8h,  #0
1528         movi            v17.8h,  #0
1529         movi            v18.8h,  #0
1530         movi            v19.8h,  #0
1531 2:
1532         subs            x1,  x1,  #1                // sets the flags tested by b.ne below
1533 .rept 4
1534         st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
1535 .endr
1536         b.ne            2b
1537 3:
             // Pass 2, one call per 8 pixel wide slice of dst
1538 .irp i, 0, 8, 16, 24
1539         add             x0,  x4,  #(\i)             // dst + i
1540         mov             x1,  x5                     // dst stride
1541         add             x2,  sp,  #(\i*2)           // i halfwords into the temp buffer
1542         bl              idct32_1d_8x32_pass2_neon
1543 .endr
1544
1545         add             sp,  sp,  #2048             // release the temp buffer
1546
1547         ldp             d8,  d9,  [sp], 0x10        // restore in the reverse order of the stp pair above
1548         ldp             d10, d11, [sp], 0x10
1549
1550         br              x15
1551 endfunc
1552
1553 .macro idct32_partial size
     // Reduced variants of the 32x32 transform, reached by a branch (not a
     // call) from ff_vp9_idct_idct_32x32_add_neon. They rely on the state set
     // up there: the 2048 byte temp buffer at sp, d8-d11 saved above it,
     // x4/x5/x6 = the original x0/x1/x2, x7/x9 = the strides, x15 = return
     // address. The stack frame is released here before returning via x15.
1554 function idct32x32_\size\()_add_neon
             // Pass 1 is only run for the first group of 8 (quarter) or the
             // first two groups (half); no zero fill is done here.
1555         add             x0,  sp,  #(0*64)
1556         add             x2,  x6,  #(0*2)
1557         bl              idct32_1d_8x32_pass1_\size\()_neon
1558 .ifc \size,half
1559         add             x0,  sp,  #(8*64)
1560         add             x2,  x6,  #(8*2)
1561         bl              idct32_1d_8x32_pass1_\size\()_neon
1562 .endif
             // Pass 2 still covers all four 8 pixel wide slices of dst
1563 .irp i, 0, 8, 16, 24
1564         add             x0,  x4,  #(\i)             // dst + i
1565         mov             x1,  x5                     // dst stride
1566         add             x2,  sp,  #(\i*2)           // i halfwords into the temp buffer
1567         bl              idct32_1d_8x32_pass2_\size\()_neon
1568 .endr
1569
1570         add             sp,  sp,  #2048             // release the temp buffer allocated by the caller
1571
1572         ldp             d8,  d9,  [sp], 0x10        // undo the register saves done by the caller
1573         ldp             d10, d11, [sp], 0x10
1574
1575         br              x15                         // return to the caller of ff_vp9_idct_idct_32x32_add_neon
1576 endfunc
1577 .endm
1578
1579 idct32_partial quarter          // defines idct32x32_quarter_add_neon
1580 idct32_partial half             // defines idct32x32_half_add_neon