 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#include "libavutil/aarch64/asm.S"
const itxfm4_coeffs, align=4
        .short  11585, 6270, 15137, 0
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207

// out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can be done with 4 temp registers, but is
// more efficient if 6 temp registers are available.
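// As a rough scalar model (an illustrative sketch, not from the original
// source), the non-negated case computes, per 16 bit lane:
//     for (int i = 0; i < 8; i++) {
//         out1[i] = ((in1[i] + in2[i]) * 11585 + (1 << 13)) >> 14;
//         out2[i] = ((in1[i] - in2[i]) * 11585 + (1 << 13)) >> 14;
//     }
// where 11585 ~= 16384/sqrt(2) is the cos(pi/4) constant kept in v0.h[0];
// with neg=1, the coefficient for the first output is negated.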
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4h, v0.4h
.endif
        add             \tmp1\().8h, \in1\().8h, \in2\().8h
        sub             \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.else
        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        rshrn           \out2\().4h, \tmp5\().4s, #14
        rshrn2          \out2\().8h, \tmp6\().4s, #14
.endif
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
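// Roughly, in scalar form (sketch only; out12/out34 stand for the 32 bit
// register pairs out1:out2 and out3:out4):
//     for (int i = 0; i < 8; i++) {
//         out12[i] = in1[i] * coef1 - in2[i] * coef2;
//         out34[i] = in1[i] * coef2 + in2[i] * coef1;
//     }
// No rounding or narrowing happens here; the results stay at 32 bit.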
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().4s, \in1\().4h, \coef1
        smull2          \out2\().4s, \in1\().8h, \coef1
        smull           \out3\().4s, \in1\().4h, \coef2
        smull2          \out4\().4s, \in1\().8h, \coef2
        smlsl           \out1\().4s, \in2\().4h, \coef2
        smlsl2          \out2\().4s, \in2\().8h, \coef2
        smlal           \out3\().4s, \in2\().4h, \coef1
        smlal2          \out4\().4s, \in2\().8h, \coef1
// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
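// This is dmbutterfly_l plus a rounding narrow back to 16 bit; as a scalar
// sketch (temporary names are ours):
//     t1 = inout1[i] * coef1 - inout2[i] * coef2;
//     t2 = inout1[i] * coef2 + inout2[i] * coef1;
//     inout1[i] = (t1 + (1 << 13)) >> 14;
//     inout2[i] = ((neg ? -t2 : t2) + (1 << 13)) >> 14;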
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().4s, \tmp3\().4s
        neg             \tmp4\().4s, \tmp4\().4s
.endif
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.macro butterfly_8h out1, out2, in1, in2
        add             \out1\().8h, \in1\().8h, \in2\().8h
        sub             \out2\().8h, \in1\().8h, \in2\().8h

.macro butterfly_8h_r out1, out2, in1, in2
        sub             \out1\().8h, \in1\().8h, \in2\().8h
        add             \out2\().8h, \in1\().8h, \in2\().8h
// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
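// As a scalar sketch (in12/in34 stand for the 32 bit input pairs
// in1:in2 and in3:in4):
//     out1[i] = (in12[i] + in34[i] + (1 << 13)) >> 14;
//     out2[i] = (in12[i] - in34[i] + (1 << 13)) >> 14;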
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().4s, \in1\().4s, \in3\().4s
        add             \tmp2\().4s, \in2\().4s, \in4\().4s
        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
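// A scalar model of the inverse Walsh-Hadamard butterfly below (a sketch;
// the temporary names e and f are ours, not from the original source):
//     in0 += in1;
//     e    = in2 - in3;
//     f    = (in0 - e) >> 1;
//     out2 = f - in1;
//     out1 = f - in3;
//     out3 = e + out2;
//     out0 = in0 - out1;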
.macro iwht4 c0, c1, c2, c3
        add             \c0\().4h, \c0\().4h, \c1\().4h
        sub             v17.4h, \c2\().4h, \c3\().4h
        sub             v16.4h, \c0\().4h, v17.4h
        sshr            v16.4h, v16.4h, #1
        sub             \c2\().4h, v16.4h, \c1\().4h
        sub             \c1\().4h, v16.4h, \c3\().4h
        add             \c3\().4h, v17.4h, \c2\().4h
        sub             \c0\().4h, \c0\().4h, \c1\().4h
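// A scalar sketch of the 4-point IDCT below, using the Q14 cosines from
// itxfm4_coeffs: c16 = 11585 (v0.h[0]), c24 = 6270 (v0.h[1]),
// c8 = 15137 (v0.h[2]), with round14(x) = (x + (1 << 13)) >> 14:
//     t0 = round14((in0 + in2) * c16);
//     t1 = round14((in0 - in2) * c16);
//     t2 = round14(in1 * c24 - in3 * c8);
//     t3 = round14(in1 * c8  + in3 * c24);
//     out0 = t0 + t3;  out3 = t0 - t3;
//     out1 = t1 + t2;  out2 = t1 - t2;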
.macro idct4 c0, c1, c2, c3
        smull           v22.4s, \c1\().4h, v0.h[2]
        smull           v20.4s, \c1\().4h, v0.h[1]
        add             v16.4h, \c0\().4h, \c2\().4h
        sub             v17.4h, \c0\().4h, \c2\().4h
        smlal           v22.4s, \c3\().4h, v0.h[1]
        smull           v18.4s, v16.4h, v0.h[0]
        smull           v19.4s, v17.4h, v0.h[0]
        smlsl           v20.4s, \c3\().4h, v0.h[2]
        rshrn           v22.4h, v22.4s, #14
        rshrn           v18.4h, v18.4s, #14
        rshrn           v19.4h, v19.4s, #14
        rshrn           v20.4h, v20.4s, #14
        add             \c0\().4h, v18.4h, v22.4h
        sub             \c3\().4h, v18.4h, v22.4h
        add             \c1\().4h, v19.4h, v20.4h
        sub             \c2\().4h, v19.4h, v20.4h
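// A scalar sketch of the 4-point IADST below, using the Q14 sines from
// iadst4_coeffs, loaded as v0.h[4..7] = 5283, 15212, 9929, 13377
// (the sinpi_1_9, sinpi_4_9, sinpi_2_9 and sinpi_3_9 constants):
//     t0 = x0 * 5283 + x2 * 15212 + x3 * 9929;
//     t1 = x0 * 9929 - x2 * 5283  - x3 * 15212;
//     t2 = (x0 - x2 + x3) * 13377;
//     t3 = x1 * 13377;
//     out0 = round14(t0 + t3);
//     out1 = round14(t1 + t3);
//     out2 = round14(t2);
//     out3 = round14(t0 + t1 - t3);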
.macro iadst4 c0, c1, c2, c3
        smull           v16.4s, \c0\().4h, v0.h[4]
        smlal           v16.4s, \c2\().4h, v0.h[5]
        smlal           v16.4s, \c3\().4h, v0.h[6]
        smull           v17.4s, \c0\().4h, v0.h[6]
        smlsl           v17.4s, \c2\().4h, v0.h[4]
        sub             \c0\().4h, \c0\().4h, \c2\().4h
        smlsl           v17.4s, \c3\().4h, v0.h[5]
        add             \c0\().4h, \c0\().4h, \c3\().4h
        smull           v19.4s, \c1\().4h, v0.h[7]
        smull           v18.4s, \c0\().4h, v0.h[7]
        add             v20.4s, v16.4s, v19.4s
        add             v21.4s, v17.4s, v19.4s
        rshrn           \c0\().4h, v20.4s, #14
        add             v16.4s, v16.4s, v17.4s
        rshrn           \c1\().4h, v21.4s, #14
        sub             v16.4s, v16.4s, v19.4s
        rshrn           \c2\().4h, v18.4s, #14
        rshrn           \c3\().4h, v16.4s, #14
// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
        movrel          x4, itxfm4_coeffs
        movrel          x4, iadst4_coeffs
        movrel          x4, itxfm4_coeffs
.ifc \txfm1\()_\txfm2,idct_idct
        // DC-only for idct/idct
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
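        // In scalar terms, this DC-only shortcut amounts to (sketch only):
        //     int dc = (block[0] * 11585 + (1 << 13)) >> 14;
        //     dc     = (dc * 11585 + (1 << 13)) >> 14;
        // after which ((dc + 8) >> 4) is added to every pixel of the block.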
        ld1             {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
        st1             {v31.8h}, [x2], #16

        sshr            v4.4h, v4.4h, #2
        sshr            v5.4h, v5.4h, #2
        sshr            v6.4h, v6.4h, #2
        sshr            v7.4h, v7.4h, #2

        \txfm1\()4      v4, v5, v6, v7
        st1             {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H  v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4      v4, v5, v6, v7
        ld1r            {v0.2s}, [x0], x1
        ld1r            {v1.2s}, [x0], x1
        srshr           v4.4h, v4.4h, #4
        srshr           v5.4h, v5.4h, #4
        srshr           v6.4h, v6.4h, #4
        srshr           v7.4h, v7.4h, #4
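        // The final reconstruction per pixel is, in scalar form (sketch):
        //     dst[i] = av_clip_uint8(dst[i] + ((coef[i] + 8) >> 4));
        // the widening adds below do the sum, and a saturating narrow
        // clamps back to 8 bit before the stores.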
        uaddw           v4.8h, v4.8h, v0.8b
        uaddw           v5.8h, v5.8h, v1.8b
        ld1r            {v2.2s}, [x0], x1
        ld1r            {v3.2s}, [x0], x1

        sub             x0, x0, x1, lsl #2

        uaddw           v6.8h, v6.8h, v2.8b
        uaddw           v7.8h, v7.8h, v3.8b
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1

itxfm_func4x4 idct, idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct, iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht, iwht
        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly     v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly     v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly     v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l   v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4]     // v2,v3 = t5a, v4,v5 = t4a
        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n    v4, v5, v26, v27, v4, v5, v6, v7, v26, v27     // v4 = t0, v5 = t4
        dbutterfly_n    v2, v3, v24, v25, v2, v3, v6, v7, v26, v27     // v2 = t1, v3 = t5
        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7

        butterfly_8h    v16, v6, v4, v24 // v16 = out[0], v6 = t2
        butterfly_8h    v23, v7, v2, v30 // v23 = -out[7], v7 = t3
        neg             v23.8h, v23.8h   // v23 = out[7]

        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
        neg             v19.8h, v19.8h   // v19 = out[3]

        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2]   // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1]     // v2,v3 = t6a, v4,v5 = t7a

        dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25   // v17 = -out[1], v30 = t6
        dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25   // v22 = out[6], v31 = t7
        neg             v17.8h, v17.8h   // v17 = out[1]

        dmbutterfly0    v18, v21, v30, v31, v2, v3, v4, v5, v6, v7     // v18 = out[2], v21 = -out[5]
        neg             v21.8h, v21.8h   // v21 = out[5]
.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs
        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16

.ifc \txfm1\()_\txfm2,idct_idct
        // DC-only for idct/idct
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        ld1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64
        ld1             {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64
        st1             {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64
        st1             {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        // Add into the destination
        ld1             {v0.8b}, [x0], x1
        srshr           v16.8h, v16.8h, #5
        ld1             {v1.8b}, [x0], x1
        srshr           v17.8h, v17.8h, #5
        ld1             {v2.8b}, [x0], x1
        srshr           v18.8h, v18.8h, #5
        uaddw           v16.8h, v16.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v19.8h, v19.8h, #5
        uaddw           v17.8h, v17.8h, v1.8b
        ld1             {v4.8b}, [x0], x1
        srshr           v20.8h, v20.8h, #5
        uaddw           v18.8h, v18.8h, v2.8b
        ld1             {v5.8b}, [x0], x1
        srshr           v21.8h, v21.8h, #5
        uaddw           v19.8h, v19.8h, v3.8b
        ld1             {v6.8b}, [x0], x1
        srshr           v22.8h, v22.8h, #5
        uaddw           v20.8h, v20.8h, v4.8b
        ld1             {v7.8b}, [x0], x1
        srshr           v23.8h, v23.8h, #5
        uaddw           v21.8h, v21.8h, v5.8b
        st1             {v0.8b}, [x3], x1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        uaddw           v23.8h, v23.8h, v7.8b
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x3], x1
        st1             {v5.8b}, [x3], x1
        st1             {v6.8b}, [x3], x1
        st1             {v7.8b}, [x3], x1

itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst
function idct16x16_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14

        srshr           v2.8h, v2.8h, #6

        // Loop to add the constant from v2 into all 16x16 outputs
        uaddw           v4.8h, v2.8h, v3.8b
        uaddw2          v5.8h, v2.8h, v3.16b
        sqxtun2         v4.16b, v5.8h
        st1             {v4.16b}, [x0], x1
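        // In scalar terms this DC-only function computes (sketch only):
        //     int dc = (block[0] * 11585 + (1 << 13)) >> 14;
        //     dc     = (dc * 11585 + (1 << 13)) >> 14;
        //     dc     = (dc + (1 << 5)) >> 6;
        //     for every pixel of the 16x16 block:
        //         dst[x] = av_clip_uint8(dst[x] + dc);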
        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly     v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly     v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly     v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly     v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4, v28, v16, v28  // v4 = t0, v28 = t3
        butterfly_8h    v5, v20, v24, v20  // v5 = t1, v20 = t2
        butterfly_8h    v6, v26, v18, v26  // v6 = t4, v26 = t5
        butterfly_8h    v7, v22, v30, v22  // v7 = t7, v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        butterfly_8h    v18, v7, v4, v7    // v18 = t0a, v7 = t7a
        butterfly_8h    v19, v22, v5, v22  // v19 = t1a, v22 = t6
        butterfly_8h    v4, v26, v20, v26  // v4 = t2a, v26 = t5
        butterfly_8h    v5, v6, v28, v6    // v5 = t3a, v6 = t4
        butterfly_8h    v20, v28, v16, v24 // v20 = t8a, v28 = t11a
        butterfly_8h    v24, v21, v23, v21 // v24 = t9, v21 = t10
        butterfly_8h    v23, v27, v25, v27 // v23 = t14, v27 = t13
        butterfly_8h    v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0    v2, v3, v27, v21, v2, v3, v16, v17, v30, v31     // v2 = t13a, v3 = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11

        butterfly_8h    v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_8h    v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_8h_r  v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_8h    v23, v24, v7, v20  // v23 = out[7], v24 = out[8]
        butterfly_8h    v18, v29, v4, v2   // v18 = out[2], v29 = out[13]
        butterfly_8h    v19, v28, v5, v28  // v19 = out[3], v28 = out[12]
        butterfly_8h    v20, v27, v6, v27  // v20 = out[4], v27 = out[11]
        butterfly_8h    v21, v26, v26, v3  // v21 = out[5], v26 = out[10]
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0]       // v6,v7 = t1, v4,v5 = t0
        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0]     // v10,v11 = t9, v8,v9 = t8
        dbutterfly_n    v31, v24, v6, v7, v10, v11, v12, v13, v10, v11   // v31 = t1a, v24 = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3, v12,v13 = t2
        dbutterfly_n    v16, v23, v4, v5, v8, v9, v6, v7, v8, v9         // v16 = t0a, v23 = t8a

        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2]       // v6,v7 = t11, v4,v5 = t10
        dbutterfly_n    v29, v26, v14, v15, v6, v7, v8, v9, v6, v7       // v29 = t3a, v26 = t11a
        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4]     // v10,v11 = t5, v8,v9 = t4
        dbutterfly_n    v18, v21, v12, v13, v4, v5, v6, v7, v4, v5       // v18 = t2a, v21 = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4, v5, v14, v15   // v20 = t5a, v28 = t13a
        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6]       // v6,v7 = t7, v4,v5 = t6
        dbutterfly_n    v27, v19, v8, v9, v12, v13, v10, v11, v12, v13   // v27 = t4a, v19 = t12a

        dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6]     // v10,v11 = t15, v8,v9 = t14
        dbutterfly_n    v22, v30, v6, v7, v10, v11, v12, v13, v10, v11   // v22 = t7a, v30 = t15a
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4]   // v14,v15 = t9, v12,v13 = t8
        dbutterfly_n    v25, v17, v4, v5, v8, v9, v6, v7, v8, v9         // v25 = t6a, v17 = t14a

        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3]       // v4,v5 = t12, v6,v7 = t13
        dbutterfly_n    v23, v19, v12, v13, v4, v5, v8, v9, v4, v5       // v23 = t8a, v19 = t12a
        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6]     // v10,v11 = t11, v8,v9 = t10
        butterfly_8h_r  v4, v27, v16, v27 // v4 = t4, v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6, v7, v12, v13, v6, v7     // v24 = t9a, v28 = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5]   // v12,v13 = t14, v14,v15 = t15
        butterfly_8h_r  v5, v20, v31, v20 // v5 = t5, v20 = t1
        dbutterfly_n    v21, v17, v8, v9, v12, v13, v6, v7, v12, v13     // v21 = t10a, v17 = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8, v9, v14, v15   // v26 = t11a, v30 = t15a

        butterfly_8h_r  v6, v25, v18, v25 // v6 = t6, v25 = t2
        butterfly_8h_r  v7, v22, v29, v22 // v7 = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2]     // v10,v11 = t13, v8,v9 = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1]   // v12,v13 = t14, v14,v15 = t15

        dbutterfly_n    v18, v30, v8, v9, v12, v13, v16, v17, v12, v13   // v18 = out[2], v30 = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
        neg             v29.8h, v29.8h // v29 = out[13]

        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2]       // v10,v11 = t5a, v8,v9 = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1]     // v12,v13 = t6a, v14,v15 = t7a

        butterfly_8h    v2, v6, v27, v25 // v2 = out[0], v6 = t2a
        butterfly_8h    v3, v7, v23, v21 // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8, v9, v12, v13, v4, v5, v8, v9       // v19 = -out[3], v31 = t6
        neg             v19.8h, v19.8h // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4, v5, v10, v11   // v28 = out[12], v16 = t7

        butterfly_8h    v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
        butterfly_8h    v4, v9, v24, v26 // v4 = out[14], v9 = t11

        dmbutterfly0    v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1   // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9, v7, v10, v11, v12, v13, v14, v15      // v22 = out[6], v25 = out[9]

        neg             v31.8h, v5.8h // v31 = out[15]
        neg             v17.8h, v3.8h // v17 = out[1]
// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().8h}, [\src], \inc

.macro store i, dst, inc
        st1             {v\i\().8h}, [\dst], \inc

.macro movi_v i, size, imm
        movi            v\i\()\size, \imm

.macro load_clear i, src, inc
        ld1             {v\i\().8h}, [\src]
        st1             {v2.8h}, [\src], \inc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31

        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x2 = src (temp buffer)
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b}, [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b}, [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b}, [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b}, [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b}, [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b}, [x3], x1
        srshr           \coef6, \coef6, #6
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1}, [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2}, [x3], x1

        sub             x0, x0, x1, lsl #2
        sub             x3, x3, x1, lsl #2

        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b}, [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b}, [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b}, [x3], x1
        st1             {\tmp1}, [x0], x1
        st1             {\tmp2}, [x3], x1

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
.purgem load_add_store

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        b.eq            idct16x16_dc_add_neon

        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
        ld1             {v0.8h,v1.8h}, [x10]

.ifc \txfm1\()_\txfm2,idct_idct
        bl              \txfm1\()16_1d_8x16_pass1_neon
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
.ifc \txfm1\()_\txfm2,idct_idct
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v24.8h}, [x0], x9
        bl              \txfm2\()16_1d_8x16_pass2_neon

.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
function idct32x32_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14

        srshr           v0.8h, v2.8h, #6

        // Loop to add the constant v0 into all 32x32 outputs
        ld1             {v1.16b,v2.16b}, [x0]
        uaddw           v3.8h, v0.8h, v1.8b
        uaddw2          v4.8h, v0.8h, v1.16b
        uaddw           v5.8h, v0.8h, v2.8b
        uaddw2          v6.8h, v0.8h, v2.16b
        sqxtun2         v3.16b, v4.8h
        sqxtun2         v4.16b, v6.8h
        st1             {v3.16b,v4.16b}, [x0], x1
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4, v24, v16, v24  // v4 = t16, v24 = t17
        butterfly_8h    v5, v20, v28, v20  // v5 = t19, v20 = t18
        butterfly_8h    v6, v26, v18, v26  // v6 = t20, v26 = t21
        butterfly_8h    v7, v22, v30, v22  // v7 = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        butterfly_8h    v16, v5, v4, v5    // v16 = t16a, v5 = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17, v20 = t18
        butterfly_8h    v18, v6, v7, v6    // v18 = t23a, v6 = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22, v21 = t21
        butterfly_8h    v4, v28, v28, v30  // v4 = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25, v26 = t26
        butterfly_8h    v7, v3, v29, v31   // v7 = t31a, v3 = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30, v27 = t29

        dmbutterfly     v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31          // v3 = t19, v5 = t28
        dmbutterfly     v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1  // v28 = t27, v6 = t20
        dmbutterfly     v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7, v4   // v31 = t31, v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23, v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18, v21 = t21
        butterfly_8h_r  v27, v28, v5, v28  // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29, v26 = t26
        butterfly_8h    v19, v20, v3, v6   // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
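// In scalar form the decomposition is roughly (a sketch; idct16_even/_odd
// are our names for the two half transforms, not from the original source):
//     idct16_even(in_even, even); // 16-point IDCT of inputs 0, 2, 4, ...
//     idct16_odd(in_odd, odd);    // 16-point transform of inputs 1, 3, ...
//     for (int i = 0; i < 16; i++) {
//         out[i]      = even[i] + odd[i];
//         out[31 - i] = even[i] - odd[i];
//     }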
// x0 = dst (temp buffer)
// x9 = double input stride
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1_neon
        ld1             {v0.8h,v1.8h}, [x10]

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2]
        st1             {v4.8h}, [x2], x9

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
        // two transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the registers a, b horizontally, followed by the
        // same registers b, a mirrored.
.macro store_rev a, b
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with an 8-byte offset.
        rev64           v1.8h, v\b\().8h
        st1             {v\a\().8h}, [x0], #16
        rev64           v0.8h, v\a\().8h
        ext             v1.16b, v1.16b, v1.16b, #8
        st1             {v\b\().8h}, [x0], #16
        ext             v0.16b, v0.16b, v0.16b, #8
        st1             {v1.8h}, [x0], #16
        st1             {v0.8h}, [x0], #16
        // Move x2 back to the start of the input, and move
        // to the first odd row.
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2]
        st1             {v4.8h}, [x2], x9

        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // Store the registers a, b horizontally, adding into the output
        // first, and then the mirrored registers b, a, subtracted from
        // the output.
.macro store_rev a, b
        rev64           v1.8h, v\b\().8h
        add             v4.8h, v4.8h, v\a\().8h
        rev64           v0.8h, v\a\().8h
        st1             {v4.8h}, [x0], #16
        ext             v1.16b, v1.16b, v1.16b, #8
        ext             v0.16b, v0.16b, v0.16b, #8
        add             v5.8h, v5.8h, v\b\().8h
        st1             {v5.8h}, [x0], #16
        sub             v6.8h, v6.8h, v1.8h
        st1             {v6.8h}, [x0], #16
        sub             v7.8h, v7.8h, v0.8h
        st1             {v7.8h}, [x0], #16
// This is mostly the same as 8x32_pass1, but without the transpose; it uses
// the source as a temp buffer between the two idct passes, and adds into
// the destination.
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass2_neon
        ld1             {v0.8h,v1.8h}, [x10]

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2], x9
        sub             x2, x2, x9, lsl #4

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x2], x9
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2], x9
        sub             x2, x2, x9, lsl #4
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.8h}, [x2], x9
        ld1             {v5.8h}, [x2], x9
        add             v4.8h, v4.8h, v\a\().8h
        ld1             {v6.8h}, [x2], x9
        add             v5.8h, v5.8h, v\b\().8h
        ld1             {v7.8h}, [x2], x9
        add             v6.8h, v6.8h, v\c\().8h
        add             v7.8h, v7.8h, v\d\().8h
.else
        ld1             {v4.8h}, [x2], x7
        ld1             {v5.8h}, [x2], x7
        sub             v4.8h, v4.8h, v\a\().8h
        ld1             {v6.8h}, [x2], x7
        sub             v5.8h, v5.8h, v\b\().8h
        ld1             {v7.8h}, [x2], x7
        sub             v6.8h, v6.8h, v\c\().8h
        sub             v7.8h, v7.8h, v\d\().8h
.endif
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        srshr           v4.8h, v4.8h, #6
        ld1             {v2.8b}, [x0], x1
        srshr           v5.8h, v5.8h, #6
        uaddw           v4.8h, v4.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v6.8h, v6.8h, #6
        uaddw           v5.8h, v5.8h, v1.8b
        srshr           v7.8h, v7.8h, #6
        sub             x0, x0, x1, lsl #2
        uaddw           v6.8h, v6.8h, v2.8b
        uaddw           v7.8h, v7.8h, v3.8b

        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x0], x1
        st1             {v6.8b}, [x0], x1
        st1             {v7.8b}, [x0], x1

        load_acc_store  31, 30, 29, 28
        load_acc_store  27, 26, 25, 24
        load_acc_store  23, 22, 21, 20
        load_acc_store  19, 18, 17, 16
        load_acc_store  16, 17, 18, 19, 1
        load_acc_store  20, 21, 22, 23, 1
        load_acc_store  24, 25, 26, 27, 1
        load_acc_store  28, 29, 30, 31, 1
.purgem load_acc_store
const min_eob_idct_idct_32, align=4
        .short  0, 34, 135, 336
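// Our reading (an annotation, not a comment from the original source):
// entry i holds the smallest eob for which the i'th 8-line slice of pass 1
// can still contain nonzero coefficients; below that threshold the slice is
// known to be all zeros and its transform can be skipped.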
function ff_vp9_idct_idct_32x32_add_neon, export=1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs
        movrel          x12, min_eob_idct_idct_32, 2

        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!

        // Double stride of the input, since we only read every other line
.irp i, 0, 8, 16, 24
        add             x0, sp, #(\i*64)
        mov             x1, #(32 - \i)/4
        bl              idct32_1d_8x32_pass1_neon

        // Write zeros to the temp buffer for pass 2
        st1             {v16.8h-v19.8h}, [x0], #64
.irp i, 0, 8, 16, 24
        bl              idct32_1d_8x32_pass2_neon

        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10