/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

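// The idct coefficient tables below hold round(16384 * cos(k*pi/64)) values
// in Q14 (e.g. 11585 ~ 16384*cos(pi/4), 15137 ~ 16384*cos(pi/8)); the iadst
// tables hold the corresponding sinpi/cospi constants from the VP9 C code.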
const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

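// Transpose a 4x4 matrix of 32 bit elements in r0-r3, using r4-r7 as scratch.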
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
.endm

// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
        trn2            \r7\().4s,  \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
        trn2            \r14\().2d, \t1\().2d,  \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b,  \t0\().16b
        mov             \r3\().16b,  \t1\().16b
.endm

// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
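// If neg is nonzero, out1 is negated (it is multiplied by -v0.s[0]).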
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4s, v0.4s
.endif
        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.else
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.endif
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .2d registers, in are 2 x .4s registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().2d, \in1\().2s, \coef1
        smull2          \out2\().2d, \in1\().4s, \coef1
        smull           \out3\().2d, \in1\().2s, \coef2
        smull2          \out4\().2d, \in1\().4s, \coef2
        smlsl           \out1\().2d, \in2\().2s, \coef2
        smlsl2          \out2\().2d, \in2\().4s, \coef2
        smlal           \out3\().2d, \in2\().2s, \coef1
        smlal2          \out4\().2d, \in2\().4s, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .4s registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().2d, \tmp3\().2d
        neg             \tmp4\().2d, \tmp4\().2d
.endif
        rshrn           \inout1\().2s, \tmp1\().2d,  #14
        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
        rshrn           \inout2\().2s, \tmp3\().2d,  #14
        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
.endm

// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_4s out1, out2, in1, in2
        add             \out1\().4s, \in1\().4s, \in2\().4s
        sub             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_4s_r out1, out2, in1, in2
        sub             \out1\().4s, \in1\().4s, \in2\().4s
        add             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .4s registers, in are 4 x .2d registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().2d, \in1\().2d, \in3\().2d
        add             \tmp2\().2d, \in2\().2d, \in4\().2d
        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
        rshrn           \out1\().2s, \tmp1\().2d,  #14
        rshrn2          \out1\().4s, \tmp2\().2d,  #14
        rshrn           \out2\().2s, \tmp3\().2d,  #14
        rshrn2          \out2\().4s, \tmp4\().2d,  #14
.endm

.macro iwht4_10 c0, c1, c2, c3
        add             \c0\().4s, \c0\().4s, \c1\().4s
        sub             v17.4s,    \c2\().4s, \c3\().4s
        sub             v16.4s,    \c0\().4s, v17.4s
        sshr            v16.4s,    v16.4s,    #1
        sub             \c2\().4s, v16.4s,    \c1\().4s
        sub             \c1\().4s, v16.4s,    \c3\().4s
        add             \c3\().4s, v17.4s,    \c2\().4s
        sub             \c0\().4s, \c0\().4s, \c1\().4s
.endm

.macro iwht4_12 c0, c1, c2, c3
        iwht4_10        \c0, \c1, \c2, \c3
.endm

.macro idct4_10 c0, c1, c2, c3
        mul             v22.4s,    \c1\().4s, v0.s[3]
        mul             v20.4s,    \c1\().4s, v0.s[2]
        add             v16.4s,    \c0\().4s, \c2\().4s
        sub             v17.4s,    \c0\().4s, \c2\().4s
        mla             v22.4s,    \c3\().4s, v0.s[2]
        mul             v18.4s,    v16.4s,    v0.s[0]
        mul             v24.4s,    v17.4s,    v0.s[0]
        mls             v20.4s,    \c3\().4s, v0.s[3]
        srshr           v22.4s,    v22.4s,    #14
        srshr           v18.4s,    v18.4s,    #14
        srshr           v24.4s,    v24.4s,    #14
        srshr           v20.4s,    v20.4s,    #14
        add             \c0\().4s, v18.4s,    v22.4s
        sub             \c3\().4s, v18.4s,    v22.4s
        add             \c1\().4s, v24.4s,    v20.4s
        sub             \c2\().4s, v24.4s,    v20.4s
.endm

.macro idct4_12 c0, c1, c2, c3
        smull           v22.2d,    \c1\().2s, v0.s[3]
        smull2          v23.2d,    \c1\().4s, v0.s[3]
        smull           v20.2d,    \c1\().2s, v0.s[2]
        smull2          v21.2d,    \c1\().4s, v0.s[2]
        add             v16.4s,    \c0\().4s, \c2\().4s
        sub             v17.4s,    \c0\().4s, \c2\().4s
        smlal           v22.2d,    \c3\().2s, v0.s[2]
        smlal2          v23.2d,    \c3\().4s, v0.s[2]
        smull           v18.2d,    v16.2s,    v0.s[0]
        smull2          v19.2d,    v16.4s,    v0.s[0]
        smull           v24.2d,    v17.2s,    v0.s[0]
        smull2          v25.2d,    v17.4s,    v0.s[0]
        smlsl           v20.2d,    \c3\().2s, v0.s[3]
        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
        rshrn           v22.2s,    v22.2d,    #14
        rshrn2          v22.4s,    v23.2d,    #14
        rshrn           v18.2s,    v18.2d,    #14
        rshrn2          v18.4s,    v19.2d,    #14
        rshrn           v24.2s,    v24.2d,    #14
        rshrn2          v24.4s,    v25.2d,    #14
        rshrn           v20.2s,    v20.2d,    #14
        rshrn2          v20.4s,    v21.2d,    #14
        add             \c0\().4s, v18.4s,    v22.4s
        sub             \c3\().4s, v18.4s,    v22.4s
        add             \c1\().4s, v24.4s,    v20.4s
        sub             \c2\().4s, v24.4s,    v20.4s
.endm

.macro iadst4_10 c0, c1, c2, c3
        mul             v16.4s,    \c0\().4s, v1.s[0]
        mla             v16.4s,    \c2\().4s, v1.s[1]
        mla             v16.4s,    \c3\().4s, v1.s[2]
        mul             v18.4s,    \c0\().4s, v1.s[2]
        mls             v18.4s,    \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        mls             v18.4s,    \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        mul             v22.4s,    \c1\().4s, v1.s[3]
        mul             v20.4s,    \c0\().4s, v1.s[3]
        add             v24.4s,    v16.4s,    v22.4s
        add             v26.4s,    v18.4s,    v22.4s
        srshr           \c0\().4s, v24.4s,    #14
        add             v16.4s,    v16.4s,    v18.4s
        srshr           \c1\().4s, v26.4s,    #14
        sub             v16.4s,    v16.4s,    v22.4s
        srshr           \c2\().4s, v20.4s,    #14
        srshr           \c3\().4s, v16.4s,    #14
.endm

.macro iadst4_12 c0, c1, c2, c3
        smull           v16.2d,    \c0\().2s, v1.s[0]
        smull2          v17.2d,    \c0\().4s, v1.s[0]
        smlal           v16.2d,    \c2\().2s, v1.s[1]
        smlal2          v17.2d,    \c2\().4s, v1.s[1]
        smlal           v16.2d,    \c3\().2s, v1.s[2]
        smlal2          v17.2d,    \c3\().4s, v1.s[2]
        smull           v18.2d,    \c0\().2s, v1.s[2]
        smull2          v19.2d,    \c0\().4s, v1.s[2]
        smlsl           v18.2d,    \c2\().2s, v1.s[0]
        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        smlsl           v18.2d,    \c3\().2s, v1.s[1]
        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        smull           v22.2d,    \c1\().2s, v1.s[3]
        smull2          v23.2d,    \c1\().4s, v1.s[3]
        smull           v20.2d,    \c0\().2s, v1.s[3]
        smull2          v21.2d,    \c0\().4s, v1.s[3]
        add             v24.2d,    v16.2d,    v22.2d
        add             v25.2d,    v17.2d,    v23.2d
        add             v26.2d,    v18.2d,    v22.2d
        add             v27.2d,    v19.2d,    v23.2d
        rshrn           \c0\().2s, v24.2d,    #14
        rshrn2          \c0\().4s, v25.2d,    #14
        add             v16.2d,    v16.2d,    v18.2d
        add             v17.2d,    v17.2d,    v19.2d
        rshrn           \c1\().2s, v26.2d,    #14
        rshrn2          \c1\().4s, v27.2d,    #14
        sub             v16.2d,    v16.2d,    v22.2d
        sub             v17.2d,    v17.2d,    v23.2d
        rshrn           \c2\().2s, v20.2d,    #14
        rshrn2          \c2\().4s, v21.2d,    #14
        rshrn           \c3\().2s, v16.2d,    #14
        rshrn2          \c3\().4s, v17.2d,    #14
.endm

// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
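// Note that for the 10/12 bpp functions below, dst actually points to
// 16 bit pixels and block to 32 bit coefficients; the prototype is kept
// in the same form as for the 8 bpp functions.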

.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h
.endif
.ifc \txfm1,iadst
        movrel          x4,  iadst4_coeffs
        ld1             {v0.d}[1], [x4]
        sxtl2           v1.4s,  v0.8h
.endif
.else
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

        movi            v30.4s, #0
        movi            v31.4s, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // DC-only for idct/idct
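        // out = (((dc * 11585 + (1 << 13)) >> 14) * 11585 + (1 << 13)) >> 14,
        // i.e. the 1D DC transform applied once for rows and once for columns.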
        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s, v0.s[0]
        rshrn           v2.2s,  v2.2d, #14
        smull           v2.2d,  v2.2s, v0.s[0]
        rshrn           v2.2s,  v2.2d, #14
        st1             {v31.s}[0], [x2]
        dup             v4.4s,  v2.s[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
        st1             {v30.4s,v31.4s}, [x2], #32

.ifc \txfm1,iwht
        sshr            v4.4s,  v4.4s,  #2
        sshr            v5.4s,  v5.4s,  #2
        sshr            v6.4s,  v6.4s,  #2
        sshr            v7.4s,  v7.4s,  #2
.endif

        \txfm1\()4_\bpp v4,  v5,  v6,  v7

        st1             {v30.4s,v31.4s}, [x2], #32
        // Transpose 4x4 with 32 bit elements
        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19

        \txfm2\()4_\bpp v4,  v5,  v6,  v7
2:
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        ld1             {v0.4h},   [x0], x1
        ld1             {v1.4h},   [x0], x1
.ifnc \txfm1,iwht
        srshr           v4.4s,  v4.4s,  #4
        srshr           v5.4s,  v5.4s,  #4
        srshr           v6.4s,  v6.4s,  #4
        srshr           v7.4s,  v7.4s,  #4
.endif
        uaddw           v4.4s,  v4.4s,  v0.4h
        uaddw           v5.4s,  v5.4s,  v1.4h
        ld1             {v2.4h},   [x0], x1
        ld1             {v3.4h},   [x0], x1
        sqxtun          v0.4h,  v4.4s
        sqxtun2         v0.8h,  v5.4s
        sub             x0,  x0,  x1, lsl #2

        uaddw           v6.4s,  v6.4s,  v2.4h
        umin            v0.8h,  v0.8h,  v31.8h
        uaddw           v7.4s,  v7.4s,  v3.4h
        st1             {v0.4h},   [x0], x1
        sqxtun          v2.4h,  v6.4s
        sqxtun2         v2.8h,  v7.4s
        umin            v2.8h,  v2.8h,  v31.8h

        st1             {v0.d}[1], [x0], x1
        st1             {v2.4h},   [x0], x1
        st1             {v2.d}[1], [x0], x1

        ret
endfunc
.endm

.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

itxfm_funcs4x4 10
itxfm_funcs4x4 12

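// DC-only 8x8 path: x0 = dst, x1 = stride, x2 = block, w5 = pixel max.
// The single DC coefficient is cleared after having been read.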
function idct8x8_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0
        sxtl            v0.4s,  v0.4h

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v2.4s,  v2.4s,  #5

        mov             x4,  #8
        mov             x3,  x0
        dup             v31.8h, w5
1:
        // Loop to add the constant from v2 into all 8x8 outputs
        subs            x4,  x4,  #2
        ld1             {v3.8h},  [x0], x1
        ld1             {v4.8h},  [x0], x1
        uaddw           v16.4s, v2.4s,  v3.4h
        uaddw2          v17.4s, v2.4s,  v3.8h
        uaddw           v18.4s, v2.4s,  v4.4h
        uaddw2          v19.4s, v2.4s,  v4.8h
        sqxtun          v3.4h,  v16.4s
        sqxtun2         v3.8h,  v17.4s
        sqxtun          v4.4h,  v18.4s
        sqxtun2         v4.8h,  v19.4s
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h},  [x3], x1
        st1             {v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

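// 8-point IDCT of four columns at a time; data in r0-r7 (.4s), t0-t5 are
// scratch registers, coefficients are expected in v0-v1 (from idct_coeffs).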
.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a

        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2

        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5

        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
.endm

.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a

        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5

        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a

        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7

        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
        neg             \r7\().4s, \r7\().4s // r7 = out[7]
        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2

        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a

        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7

        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
        neg             \r3\().4s, \r3\().4s  // r3 = out[3]

        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
        neg             \r1\().4s, \r1\().4s  // r1 = out[1]

        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
.endm


.macro itxfm_func8x8 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct8x8_dc_add_neon
.endif
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4,  idct_coeffs
.else
        movrel          x4,  iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
        stp             d8,  d9,  [sp, #-0x10]!
        sxtl2           v3.4s,  v1.8h
        sxtl            v2.4s,  v1.4h
.endif
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h

        movi            v4.4s, #0
        movi            v5.4s, #0
        movi            v6.4s, #0
        movi            v7.4s, #0

1:
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
        sub             x2,  x2,  #256
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
.else
        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
.endif

        // Transpose 8x8 with 32 bit elements
        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
.else
        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
.endif
2:
        mov             x3,  x0
        // Add into the destination
        ld1             {v0.8h},  [x0], x1
        srshr           v16.4s, v16.4s, #5
        srshr           v17.4s, v17.4s, #5
        ld1             {v1.8h},  [x0], x1
        srshr           v18.4s, v18.4s, #5
        srshr           v19.4s, v19.4s, #5
        ld1             {v2.8h},  [x0], x1
        srshr           v20.4s, v20.4s, #5
        srshr           v21.4s, v21.4s, #5
        uaddw           v16.4s, v16.4s, v0.4h
        uaddw2          v17.4s, v17.4s, v0.8h
        ld1             {v3.8h},  [x0], x1
        srshr           v22.4s, v22.4s, #5
        srshr           v23.4s, v23.4s, #5
        uaddw           v18.4s, v18.4s, v1.4h
        uaddw2          v19.4s, v19.4s, v1.8h
        ld1             {v4.8h},  [x0], x1
        srshr           v24.4s, v24.4s, #5
        srshr           v25.4s, v25.4s, #5
        uaddw           v20.4s, v20.4s, v2.4h
        uaddw2          v21.4s, v21.4s, v2.8h
        sqxtun          v0.4h,  v16.4s
        sqxtun2         v0.8h,  v17.4s
        dup             v16.8h, w5
        ld1             {v5.8h},  [x0], x1
        srshr           v26.4s, v26.4s, #5
        srshr           v27.4s, v27.4s, #5
        uaddw           v22.4s, v22.4s, v3.4h
        uaddw2          v23.4s, v23.4s, v3.8h
        sqxtun          v1.4h,  v18.4s
        sqxtun2         v1.8h,  v19.4s
        umin            v0.8h,  v0.8h,  v16.8h
        ld1             {v6.8h},  [x0], x1
        srshr           v28.4s, v28.4s, #5
        srshr           v29.4s, v29.4s, #5
        uaddw           v24.4s, v24.4s, v4.4h
        uaddw2          v25.4s, v25.4s, v4.8h
        sqxtun          v2.4h,  v20.4s
        sqxtun2         v2.8h,  v21.4s
        umin            v1.8h,  v1.8h,  v16.8h
        ld1             {v7.8h},  [x0], x1
        srshr           v30.4s, v30.4s, #5
        srshr           v31.4s, v31.4s, #5
        uaddw           v26.4s, v26.4s, v5.4h
        uaddw2          v27.4s, v27.4s, v5.8h
        sqxtun          v3.4h,  v22.4s
        sqxtun2         v3.8h,  v23.4s
        umin            v2.8h,  v2.8h,  v16.8h

        st1             {v0.8h},  [x3], x1
        uaddw           v28.4s, v28.4s, v6.4h
        uaddw2          v29.4s, v29.4s, v6.8h
        st1             {v1.8h},  [x3], x1
        sqxtun          v4.4h,  v24.4s
        sqxtun2         v4.8h,  v25.4s
        umin            v3.8h,  v3.8h,  v16.8h
        st1             {v2.8h},  [x3], x1
        uaddw           v30.4s, v30.4s, v7.4h
        uaddw2          v31.4s, v31.4s, v7.8h
        st1             {v3.8h},  [x3], x1
        sqxtun          v5.4h,  v26.4s
        sqxtun2         v5.8h,  v27.4s
        umin            v4.8h,  v4.8h,  v16.8h
        st1             {v4.8h},  [x3], x1
        sqxtun          v6.4h,  v28.4s
        sqxtun2         v6.8h,  v29.4s
        umin            v5.8h,  v5.8h,  v16.8h
        st1             {v5.8h},  [x3], x1
        sqxtun          v7.4h,  v30.4s
        sqxtun2         v7.8h,  v31.4s
        umin            v6.8h,  v6.8h,  v16.8h

        st1             {v6.8h},  [x3], x1
        umin            v7.8h,  v7.8h,  v16.8h
        st1             {v7.8h},  [x3], x1

.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8,  d9,  [sp], 0x10
.endif
        ret
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
        mov             x5,  #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
        mov             x5,  #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc
.endm

itxfm_func8x8 idct,  idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct,  iadst
itxfm_func8x8 iadst, iadst


function idct16x16_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        movi            v1.4h,  #0

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v0.4s,  v2.4s,  #6

        mov             x3, x0
        mov             x4, #16
        dup             v31.8h, w13
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        subs            x4,  x4,  #2
        ld1             {v1.8h,v2.8h},  [x0], x1
        uaddw           v16.4s, v0.4s,  v1.4h
        uaddw2          v17.4s, v0.4s,  v1.8h
        ld1             {v3.8h,v4.8h},  [x0], x1
        uaddw           v18.4s, v0.4s,  v2.4h
        uaddw2          v19.4s, v0.4s,  v2.8h
        uaddw           v20.4s, v0.4s,  v3.4h
        uaddw2          v21.4s, v0.4s,  v3.8h
        uaddw           v22.4s, v0.4s,  v4.4h
        uaddw2          v23.4s, v0.4s,  v4.8h
        sqxtun          v1.4h,  v16.4s
        sqxtun2         v1.8h,  v17.4s
        sqxtun          v2.4h,  v18.4s
        sqxtun2         v2.8h,  v19.4s
        sqxtun          v3.4h,  v20.4s
        sqxtun2         v3.8h,  v21.4s
        sqxtun          v4.4h,  v22.4s
        sqxtun2         v4.8h,  v23.4s
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        st1             {v1.8h,v2.8h},  [x3], x1
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h,v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

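// 16-point IDCT of four columns at a time; data in v16-v31 (.4s),
// coefficients in v0-v3 (from idct_coeffs), v4-v9 are clobbered.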
.macro idct16
        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a

        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a

        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11

        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
.endm

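// 16-point IADST of four columns at a time; data in v16-v31 (.4s).
// The macro loads its own coefficients: x11 must point to iadst16_coeffs
// and x10 to idct_coeffs. v0-v15 are used as coefficient/scratch registers.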
.macro iadst16
        ld1             {v0.8h,v1.8h}, [x11]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
        ld1             {v0.8h}, [x10]
        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a

        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
        neg             v29.4s, v29.4s                   // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a

        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
        neg             v19.4s, v19.4s                   // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7

        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]

        neg             v31.4s,  v5.4s                    // v31 = out[15]
        neg             v17.4s,  v3.4s                    // v17 = out[1]

        mov             v16.16b, v2.16b
        mov             v30.16b, v4.16b
.endm

// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().4s},  [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().4s},  [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size,  \imm
.endm
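// Load one .4s vector and zero the source memory behind it (v4 must be zero).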
.macro load_clear i, src, inc
        ld1             {v\i\().4s}, [\src]
        st1             {v4.4s},  [\src], \inc
.endm

// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x4 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_4x16_pass1_neon
        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        \txfm\()16

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #12
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret
1:
        // Special case: For the last input column (x1 == 12),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 4x4 block).
        add             x0,  x0,  #16
.irp i, 20, 24, 28
        store           \i,  x0,  #16
.endr
        add             x0,  x0,  #16
.irp i, 21, 25, 29
        store           \i,  x0,  #16
.endr
        add             x0,  x0,  #16
.irp i, 22, 26, 30
        store           \i,  x0,  #16
.endr
        add             x0,  x0,  #16
.irp i, 23, 27, 31
        store           \i,  x0,  #16
.endr

        mov             v28.16b, v16.16b
        mov             v29.16b, v17.16b
        mov             v30.16b, v18.16b
        mov             v31.16b, v19.16b
        ret
endfunc

// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 4x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_4x16_pass2_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        \txfm\()16

        dup             v8.8h, w13
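// Add 8 rows (4 pixels each) of transform output in coef0-coef7 to the
// destination, clamp against the pixel max in v8 and store the result.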
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
        srshr           \coef0, \coef0, #6
        ld1             {v4.4h},   [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v4.d}[1], [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v5.4h},   [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v4.4h
        ld1             {v5.d}[1], [x3], x1
        srshr           \coef4, \coef4, #6
        uaddw2          \coef1, \coef1, v4.8h
        ld1             {v6.4h},   [x0], x1
        srshr           \coef5, \coef5, #6
        uaddw           \coef2, \coef2, v5.4h
        ld1             {v6.d}[1], [x3], x1
        sqxtun          v4.4h,  \coef0
        srshr           \coef6, \coef6, #6
        uaddw2          \coef3, \coef3, v5.8h
        ld1             {v7.4h},   [x0], x1
        sqxtun2         v4.8h,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef4, \coef4, v6.4h
        ld1             {v7.d}[1], [x3], x1
        umin            v4.8h,  v4.8h,  v8.8h
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.4h,  \coef2
        uaddw2          \coef5, \coef5, v6.8h
        st1             {v4.4h},   [x0], x1
        sqxtun2         v5.8h,  \coef3
        uaddw           \coef6, \coef6, v7.4h
        st1             {v4.d}[1], [x3], x1
        umin            v5.8h,  v5.8h,  v8.8h
        sqxtun          v6.4h,  \coef4
        uaddw2          \coef7, \coef7, v7.8h
        st1             {v5.4h},   [x0], x1
        sqxtun2         v6.8h,  \coef5
        st1             {v5.d}[1], [x3], x1
        umin            v6.8h,  v6.8h,  v8.8h
        sqxtun          v7.4h,  \coef6
        st1             {v6.4h},   [x0], x1
        sqxtun2         v7.8h,  \coef7
        st1             {v6.d}[1], [x3], x1
        umin            v7.8h,  v7.8h,  v8.8h
        st1             {v7.4h},   [x0], x1
        st1             {v7.d}[1], [x3], x1
.endm
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
.purgem load_add_store

        ret
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

// This is the minimum eob value for each subpartition, in increments of 4
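// If the eob is at or below the threshold for a slice, that slice and all
// following ones only contain zero coefficients, so the first pass can be
// skipped for them (the temp buffer is just zero-filled instead).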
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst

.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
.endif
        stp             d8,  d9,  [sp, #-0x10]!

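        // Allocate a temp buffer for the intermediate 16x16 coefficients
        // (16*16 32 bit elements = 1024 bytes).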
        sub             sp,  sp,  #1024

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
        movrel          x12, min_eob_idct_idct_16, 2
.ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif
        mov             x9,  #64

.irp i, 0, 4, 8, 12
        add             x0,  sp,  #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(16 - \i)/4
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*4)
        bl              \txfm1\()16_1d_4x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v28-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2.
        movi            v28.4s,  #0
        movi            v29.4s,  #0
        movi            v30.4s,  #0
        movi            v31.4s,  #0
2:
        subs            x1,  x1,  #1
.rept 4
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
.endr
        b.ne            2b
3:
.endif

.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_4x16_pass2_neon
.endr

        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], 0x10
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x15
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
        mov             x13, #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
        mov             x13, #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst


function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        movi            v1.4h,  #0

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v0.4s,  v2.4s,  #6

        mov             x3,  x0
        mov             x4,  #32
        sub             x1,  x1,  #32
        dup             v31.8h, w13
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #1
        ld1             {v1.8h,v2.8h},  [x0], #32
        uaddw           v16.4s, v0.4s,  v1.4h
        uaddw2          v17.4s, v0.4s,  v1.8h
        ld1             {v3.8h,v4.8h},  [x0], x1
        uaddw           v18.4s, v0.4s,  v2.4h
        uaddw2          v19.4s, v0.4s,  v2.8h
        uaddw           v20.4s, v0.4s,  v3.4h
        uaddw2          v21.4s, v0.4s,  v3.8h
        uaddw           v22.4s, v0.4s,  v4.4h
        uaddw2          v23.4s, v0.4s,  v4.8h
        sqxtun          v1.4h,  v16.4s
        sqxtun2         v1.8h,  v17.4s
        sqxtun          v2.4h,  v18.4s
        sqxtun2         v2.8h,  v19.4s
        sqxtun          v3.4h,  v20.4s
        sqxtun2         v3.8h,  v21.4s
        sqxtun          v4.4h,  v22.4s
        sqxtun2         v4.8h,  v23.4s
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        st1             {v1.8h,v2.8h},  [x3], #32
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h,v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

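// 16-point transform of the odd-indexed inputs of the 32-point IDCT;
// data in v16-v31 (.4s). Expects the extra coefficients in v10-v13 and
// the first idct_coeffs entries in v0-v1, and clobbers v4-v9.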
.macro idct32_odd
        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
.endm

1214 // Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1215 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1216 // a normal IDCT16 with every other input component (the even ones, with
1217 // each output written twice), followed by a separate 16-point IDCT
1218 // of the odd inputs, whose outputs are added to/subtracted from those of the first idct16.
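     // As an illustrative sketch (pseudocode, not actual code), the
     // decomposition amounts to:
     //   even[0..15] = idct16(in[0], in[2], ..., in[30])
     //   odd[0..15]  = idct32_odd(in[1], in[3], ..., in[31])
     //   out[i]      = even[i] + odd[i]      for i = 0..15
     //   out[31 - i] = even[i] - odd[i]      for i = 0..15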
1219 // x0 = dst (temp buffer)
1220 // x1 = unused
1221 // x2 = src
1222 // x9 = double input stride
1223 function idct32_1d_4x32_pass1_neon
1224         movi            v4.4s,  #0
1225
1226         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1227 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1228         ld1             {v\i\().4s}, [x2]
1229         st1             {v4.4s},  [x2], x9
1230 .endr
1231
1232         idct16
1233
1234         // Do four 4x4 transposes. Originally, v16-v31 contain the
1235         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1236         // contain the four transposed 4x4 blocks.
1237         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1238         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1239         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1240         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1241
1242         // Store the registers a, b, c, d horizontally, followed by the
1243         // same registers d, c, b, a mirrored.
1244 .macro store_rev a, b, c, d
1245         // There's no rev128 instruction, but we reverse each 64-bit
1246         // half and then flip them using an ext with an 8-byte offset.
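             // For example: [a0 a1 a2 a3] --rev64--> [a1 a0 a3 a2]
             //                            --ext #8--> [a3 a2 a1 a0]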
1247         rev64           v7.4s, v\d\().4s
1248         st1             {v\a\().4s},  [x0], #16
1249         ext             v7.16b, v7.16b, v7.16b, #8
1250         st1             {v\b\().4s},  [x0], #16
1251         rev64           v6.4s, v\c\().4s
1252         st1             {v\c\().4s},  [x0], #16
1253         ext             v6.16b, v6.16b, v6.16b, #8
1254         st1             {v\d\().4s},  [x0], #16
1255         rev64           v5.4s, v\b\().4s
1256         st1             {v7.4s},  [x0], #16
1257         ext             v5.16b, v5.16b, v5.16b, #8
1258         st1             {v6.4s},  [x0], #16
1259         rev64           v4.4s, v\a\().4s
1260         st1             {v5.4s},  [x0], #16
1261         ext             v4.16b, v4.16b, v4.16b, #8
1262         st1             {v4.4s},  [x0], #16
1263 .endm
1264         store_rev       16, 20, 24, 28
1265         store_rev       17, 21, 25, 29
1266         store_rev       18, 22, 26, 30
1267         store_rev       19, 23, 27, 31
1268         sub             x0,  x0,  #512
1269 .purgem store_rev
1270
1271         // Move x2 back to the start of the input, and move
1272         // to the first odd row
1273         sub             x2,  x2,  x9, lsl #4
1274         add             x2,  x2,  #128
1275
1276         movi            v4.4s,  #0
1277         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1278 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1279         ld1             {v\i\().4s}, [x2]
1280         st1             {v4.4s},  [x2], x9
1281 .endr
1282
1283         idct32_odd
1284
1285         transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
1286         transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
1287         transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
1288         transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
1289
1290         // Store the registers a, b, c, d horizontally,
1291         // adding into the output first, and then the mirrored
1292         // registers d, c, b, a, subtracted from the output.
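             // Combined with the mirrored even-pass values already in the
             // buffer, this gives out[i] = even[i] + odd[i] for the first 16
             // outputs and out[31-i] = even[i] - odd[i] for the last 16.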
1293 .macro store_rev a, b, c, d
1294         ld1             {v4.4s},  [x0]
1295         rev64           v9.4s, v\d\().4s
1296         add             v4.4s, v4.4s, v\a\().4s
1297         st1             {v4.4s},  [x0], #16
1298         rev64           v8.4s, v\c\().4s
1299         ld1             {v4.4s},  [x0]
1300         ext             v9.16b, v9.16b, v9.16b, #8
1301         add             v4.4s, v4.4s, v\b\().4s
1302         st1             {v4.4s},  [x0], #16
1303         ext             v8.16b, v8.16b, v8.16b, #8
1304         ld1             {v4.4s},  [x0]
1305         rev64           v\b\().4s, v\b\().4s
1306         add             v4.4s, v4.4s, v\c\().4s
1307         st1             {v4.4s},  [x0], #16
1308         rev64           v\a\().4s, v\a\().4s
1309         ld1             {v4.4s},  [x0]
1310         ext             v\b\().16b, v\b\().16b, v\b\().16b, #8
1311         add             v4.4s, v4.4s, v\d\().4s
1312         st1             {v4.4s},  [x0], #16
1313         ext             v\a\().16b, v\a\().16b, v\a\().16b, #8
1314         ld1             {v4.4s},  [x0]
1315         sub             v4.4s, v4.4s, v9.4s
1316         st1             {v4.4s},  [x0], #16
1317         ld1             {v4.4s},  [x0]
1318         sub             v4.4s, v4.4s, v8.4s
1319         st1             {v4.4s},  [x0], #16
1320         ld1             {v4.4s},  [x0]
1321         sub             v4.4s, v4.4s, v\b\().4s
1322         st1             {v4.4s},  [x0], #16
1323         ld1             {v4.4s},  [x0]
1324         sub             v4.4s, v4.4s, v\a\().4s
1325         st1             {v4.4s},  [x0], #16
1326 .endm
1327
1328         store_rev       31, 27, 23, 19
1329         store_rev       30, 26, 22, 18
1330         store_rev       29, 25, 21, 17
1331         store_rev       28, 24, 20, 16
1332 .purgem store_rev
1333         ret
1334 endfunc
1335
1336 // This is mostly the same as 4x32_pass1, but without the transpose;
1337 // it uses the source as a temp buffer between the two idct passes
1338 // and adds into the destination.
1339 // x0 = dst
1340 // x1 = dst stride
1341 // x2 = src (temp buffer)
1342 // x7 = negative double temp buffer stride
1343 // x9 = double temp buffer stride
1344 function idct32_1d_4x32_pass2_neon
1345         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1346 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1347         ld1             {v\i\().4s}, [x2], x9
1348 .endr
1349         sub             x2,  x2,  x9, lsl #4
1350
1351         idct16
1352
1353 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1354         st1             {v\i\().4s}, [x2], x9
1355 .endr
1356
1357         sub             x2,  x2,  x9, lsl #4
1358         add             x2,  x2,  #128
1359
1360         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1361 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1362         ld1             {v\i\().4s}, [x2], x9
1363 .endr
1364         sub             x2,  x2,  x9, lsl #4
1365         sub             x2,  x2,  #128
1366
1367         idct32_odd
1368
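     // Accumulate the idct32_odd outputs into the even-pass results in the
     // temp buffer (adding for the first half, subtracting for the mirrored
     // half), round by 6 bits, add the destination pixels and clamp against
     // the pixel maximum in v15 before storing.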
1369 .macro load_acc_store a, b, c, d, neg=0
1370 .if \neg == 0
1371         ld1             {v4.4s},  [x2], x9
1372         ld1             {v5.4s},  [x2], x9
1373         add             v4.4s, v4.4s, v\a\().4s
1374         ld1             {v6.4s},  [x2], x9
1375         add             v5.4s, v5.4s, v\b\().4s
1376         ld1             {v7.4s},  [x2], x9
1377         add             v6.4s, v6.4s, v\c\().4s
1378         add             v7.4s, v7.4s, v\d\().4s
1379 .else
1380         ld1             {v4.4s},  [x2], x7
1381         ld1             {v5.4s},  [x2], x7
1382         sub             v4.4s, v4.4s, v\a\().4s
1383         ld1             {v6.4s},  [x2], x7
1384         sub             v5.4s, v5.4s, v\b\().4s
1385         ld1             {v7.4s},  [x2], x7
1386         sub             v6.4s, v6.4s, v\c\().4s
1387         sub             v7.4s, v7.4s, v\d\().4s
1388 .endif
1389         ld1             {v8.4h},   [x0], x1
1390         ld1             {v8.d}[1], [x0], x1
1391         srshr           v4.4s, v4.4s, #6
1392         ld1             {v9.4h},   [x0], x1
1393         srshr           v5.4s, v5.4s, #6
1394         uaddw           v4.4s, v4.4s, v8.4h
1395         ld1             {v9.d}[1], [x0], x1
1396         srshr           v6.4s, v6.4s, #6
1397         uaddw2          v5.4s, v5.4s, v8.8h
1398         srshr           v7.4s, v7.4s, #6
1399         sub             x0,  x0,  x1, lsl #2
1400         uaddw           v6.4s, v6.4s, v9.4h
1401         sqxtun          v4.4h, v4.4s
1402         uaddw2          v7.4s, v7.4s, v9.8h
1403         sqxtun2         v4.8h, v5.4s
1404         umin            v4.8h, v4.8h, v15.8h
1405         st1             {v4.4h},   [x0], x1
1406         sqxtun          v5.4h, v6.4s
1407         st1             {v4.d}[1], [x0], x1
1408         sqxtun2         v5.8h, v7.4s
1409         umin            v5.8h, v5.8h, v15.8h
1410         st1             {v5.4h},   [x0], x1
1411         st1             {v5.d}[1], [x0], x1
1412 .endm
1413         load_acc_store  31, 30, 29, 28
1414         load_acc_store  27, 26, 25, 24
1415         load_acc_store  23, 22, 21, 20
1416         load_acc_store  19, 18, 17, 16
1417         sub             x2,  x2,  x9
1418         load_acc_store  16, 17, 18, 19, 1
1419         load_acc_store  20, 21, 22, 23, 1
1420         load_acc_store  24, 25, 26, 27, 1
1421         load_acc_store  28, 29, 30, 31, 1
1422 .purgem load_acc_store
1423         ret
1424 endfunc
1425
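     // Per-slice eob thresholds: once the eob is at or below the threshold
     // for a 4-column slice, that slice and all following ones contain only
     // zero coefficients, so pass 1 can be skipped for them.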
1426 const min_eob_idct_idct_32, align=4
1427         .short  0, 9, 34, 70, 135, 240, 336, 448
1428 endconst
1429
1430 function vp9_idct_idct_32x32_add_16_neon
1431         cmp             w3,  #1
1432         b.eq            idct32x32_dc_add_neon
1433
1434         movrel          x10, idct_coeffs
1435         movrel          x12, min_eob_idct_idct_32, 2
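             // (the offset of 2 skips the first min_eob entry, since the
             // first slice is always processed)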
1436
1437         mov             x15, x30
1438         stp             d8,  d9,  [sp, #-0x10]!
1439         stp             d10, d11, [sp, #-0x10]!
1440         stp             d12, d13, [sp, #-0x10]!
1441         stp             d14, d15, [sp, #-0x10]!
1442
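             // Allocate a temp buffer for the intermediate 32x32 coefficients
             // (32 * 32 * 4 bytes = 4096 bytes).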
1443         sub             sp,  sp,  #4096
1444
1445         mov             x4,  x0
1446         mov             x5,  x1
1447         mov             x6,  x2
1448
1449         // Double stride of the input, since we only read every other line
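             // (one input row is 32 coefficients * 4 bytes = 128 bytes)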
1450         mov             x9,  #256
1451         neg             x7,  x9
1452
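             // Sign-extend the 16-bit coefficients to 32 bits:
             // v0-v3 = idct_coeffs[0..15], v10-v13 = idct_coeffs[16..31].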
1453         ld1             {v0.8h,v1.8h},   [x10], #32
1454         sxtl            v2.4s,  v1.4h
1455         sxtl2           v3.4s,  v1.8h
1456         sxtl2           v1.4s,  v0.8h
1457         sxtl            v0.4s,  v0.4h
1458         ld1             {v10.8h,v11.8h}, [x10]
1459         sxtl            v12.4s, v11.4h
1460         sxtl2           v13.4s, v11.8h
1461         sxtl2           v11.4s, v10.8h
1462         sxtl            v10.4s, v10.4h
1463
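             // v15 = pixel maximum for clamping, passed in w13 by the
             // bitdepth-specific entry points.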
1464         dup             v15.8h, w13
1465
1466 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1467         add             x0,  sp,  #(\i*128)
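             // For all slices but the first, compare the eob (w3) against the
             // per-slice minimum; if no nonzero coefficients remain, clear the
             // rest of the temp buffer instead (branch to 1f below).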
1468 .if \i > 0
1469         ldrh            w1,  [x12], #2
1470         cmp             w3,  w1
1471         mov             x1,  #(32 - \i)/4
1472         b.le            1f
1473 .endif
1474         add             x2,  x6,  #(\i*4)
1475         bl              idct32_1d_4x32_pass1_neon
1476 .endr
1477         b               3f
1478
1479 1:
1480         // Write zeros to the temp buffer for pass 2
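             // x1 = number of remaining 4x32 slices; each iteration of the
             // loop below clears one 512-byte slice.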
1481         movi            v16.4s,  #0
1482         movi            v17.4s,  #0
1483         movi            v18.4s,  #0
1484         movi            v19.4s,  #0
1485 2:
1486         subs            x1,  x1,  #1
1487 .rept 4
1488         st1             {v16.4s-v19.4s},  [x0], #64
1489         st1             {v16.4s-v19.4s},  [x0], #64
1490 .endr
1491         b.ne            2b
1492 3:
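     // Second pass: process 4-column slices of the temp buffer and add the
     // result into the destination (2 bytes per output pixel, 4 bytes per
     // temp coefficient).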
1493 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1494         add             x0,  x4,  #(\i*2)
1495         mov             x1,  x5
1496         add             x2,  sp,  #(\i*4)
1497         bl              idct32_1d_4x32_pass2_neon
1498 .endr
1499
1500         add             sp,  sp,  #4096
1501         ldp             d14, d15, [sp], 0x10
1502         ldp             d12, d13, [sp], 0x10
1503         ldp             d10, d11, [sp], 0x10
1504         ldp             d8,  d9,  [sp], 0x10
1505
1506         br              x15
1507 endfunc
1508
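// The bitdepth-specific entry points below only differ in the pixel maximum
// they pass to the common function in w13.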
1509 function ff_vp9_idct_idct_32x32_add_10_neon, export=1
1510         mov             x13, #0x03ff
1511         b               vp9_idct_idct_32x32_add_16_neon
1512 endfunc
1513
1514 function ff_vp9_idct_idct_32x32_add_12_neon, export=1
1515         mov             x13, #0x0fff
1516         b               vp9_idct_idct_32x32_add_16_neon
1517 endfunc