]> git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9itxfm_16bpp_neon.S
aarch64: vp8: Remove superfluous includes
[ffmpeg] / libavcodec / aarch64 / vp9itxfm_16bpp_neon.S
1 /*
2  * Copyright (c) 2017 Google Inc.
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/aarch64/asm.S"
22 #include "neon.S"
23
24 const itxfm4_coeffs, align=4
25         .short  11585, 0, 6270, 15137
26 iadst4_coeffs:
27         .short  5283, 15212, 9929, 13377
28 endconst
29
30 const iadst8_coeffs, align=4
31         .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32 idct_coeffs:
33         .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
34         .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
35         .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36         .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37 endconst
38
39 const iadst16_coeffs, align=4
40         .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
41         .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
42 endconst
43
44 .macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
45         trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
46         trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
47         trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
48         trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
49         trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
50         trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
51         trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
52         trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
53 .endm
54
55 // Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
56 // over two registers.
57 .macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
58         transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
59         transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
60
61         // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
62         // while swapping the two 4x4 matrices between each other
63
64         // First step of the 4x4 transpose of r1-r7, into t0-t3
65         trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
66         trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
67         trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
68         trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
69
70         // First step of the 4x4 transpose of r8-r12, into r1-r7
71         trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
72         trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
73         trn1            \r5\().4s,  \r12\().4s, \r14\().4s
74         trn2            \r7\().4s,  \r12\().4s, \r14\().4s
75
76         // Second step of the 4x4 transpose of r1-r7 (now in t0-r3), into r8-r12
77         trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
78         trn2            \r12\().2d, \t0\().2d,  \t2\().2d
79         trn1            \r10\().2d, \t1\().2d,  \t3\().2d
80         trn2            \r14\().2d, \t1\().2d,  \t3\().2d
81
82         // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible
83         trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
84         trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
85         trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
86         trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
87
88         // Move the outputs of trn1 back in place
89         mov             \r1\().16b,  \t0\().16b
90         mov             \r3\().16b,  \t1\().16b
91 .endm
92
93 // out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
94 // out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
95 // in/out are .4s registers; this can do with 4 temp registers, but is
96 // more efficient if 6 temp registers are available.
97 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
98 .if \neg > 0
99         neg             \tmp4\().4s, v0.4s
100 .endif
101         add             \tmp1\().4s, \in1\().4s,  \in2\().4s
102         sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
103 .if \neg > 0
104         smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
105         smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
106 .else
107         smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
108         smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
109 .endif
110 .ifb \tmp5
111         rshrn           \out1\().2s, \tmp3\().2d, #14
112         rshrn2          \out1\().4s, \tmp4\().2d, #14
113         smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
114         smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
115         rshrn           \out2\().2s, \tmp3\().2d, #14
116         rshrn2          \out2\().4s, \tmp4\().2d, #14
117 .else
118         smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
119         smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
120         rshrn           \out1\().2s, \tmp3\().2d, #14
121         rshrn2          \out1\().4s, \tmp4\().2d, #14
122         rshrn           \out2\().2s, \tmp5\().2d, #14
123         rshrn2          \out2\().4s, \tmp6\().2d, #14
124 .endif
125 .endm
126
127 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
128 // writing the same output into both out1 and out2.
129 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
130         smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
131         smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
132         rshrn           \out1\().2s, \tmp1\().2d, #14
133         rshrn2          \out1\().4s, \tmp2\().2d, #14
134         rshrn           \out2\().2s, \tmp1\().2d, #14
135         rshrn2          \out2\().4s, \tmp2\().2d, #14
136 .endm
137
138 // out1,out2 = in1 * coef1 - in2 * coef2
139 // out3,out4 = in1 * coef2 + in2 * coef1
140 // out are 4 x .2d registers, in are 2 x .4s registers
141 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
142         smull           \out1\().2d, \in1\().2s, \coef1
143         smull2          \out2\().2d, \in1\().4s, \coef1
144         smull           \out3\().2d, \in1\().2s, \coef2
145         smull2          \out4\().2d, \in1\().4s, \coef2
146         smlsl           \out1\().2d, \in2\().2s, \coef2
147         smlsl2          \out2\().2d, \in2\().4s, \coef2
148         smlal           \out3\().2d, \in2\().2s, \coef1
149         smlal2          \out4\().2d, \in2\().4s, \coef1
150 .endm
151
152 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
153 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
154 // inout are 2 x .4s registers
155 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
156         dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
157 .if \neg > 0
158         neg             \tmp3\().2d, \tmp3\().2d
159         neg             \tmp4\().2d, \tmp4\().2d
160 .endif
161         rshrn           \inout1\().2s, \tmp1\().2d,  #14
162         rshrn2          \inout1\().4s, \tmp2\().2d,  #14
163         rshrn           \inout2\().2s, \tmp3\().2d,  #14
164         rshrn2          \inout2\().4s, \tmp4\().2d,  #14
165 .endm
166
167 // Same as dmbutterfly above, but treating the input in inout2 as zero
168 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
169         smull           \tmp1\().2d, \inout1\().2s, \coef1
170         smull2          \tmp2\().2d, \inout1\().4s, \coef1
171         smull           \tmp3\().2d, \inout1\().2s, \coef2
172         smull2          \tmp4\().2d, \inout1\().4s, \coef2
173         rshrn           \inout1\().2s, \tmp1\().2d, #14
174         rshrn2          \inout1\().4s, \tmp2\().2d, #14
175         rshrn           \inout2\().2s, \tmp3\().2d, #14
176         rshrn2          \inout2\().4s, \tmp4\().2d, #14
177 .endm
178
179 // Same as dmbutterfly above, but treating the input in inout1 as zero
180 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
181         smull           \tmp1\().2d, \inout2\().2s, \coef2
182         smull2          \tmp2\().2d, \inout2\().4s, \coef2
183         smull           \tmp3\().2d, \inout2\().2s, \coef1
184         smull2          \tmp4\().2d, \inout2\().4s, \coef1
185         neg             \tmp1\().2d, \tmp1\().2d
186         neg             \tmp2\().2d, \tmp2\().2d
187         rshrn           \inout2\().2s, \tmp3\().2d, #14
188         rshrn2          \inout2\().4s, \tmp4\().2d, #14
189         rshrn           \inout1\().2s, \tmp1\().2d, #14
190         rshrn2          \inout1\().4s, \tmp2\().2d, #14
191 .endm
192
193 .macro dsmull_h out1, out2, in, coef
194         smull           \out1\().2d, \in\().2s, \coef
195         smull2          \out2\().2d, \in\().4s, \coef
196 .endm
197
198 .macro drshrn_h out, in1, in2, shift
199         rshrn           \out\().2s, \in1\().2d, \shift
200         rshrn2          \out\().4s, \in2\().2d, \shift
201 .endm
202
203
204 // out1 = in1 + in2
205 // out2 = in1 - in2
206 .macro butterfly_4s out1, out2, in1, in2
207         add             \out1\().4s, \in1\().4s, \in2\().4s
208         sub             \out2\().4s, \in1\().4s, \in2\().4s
209 .endm
210
211 // out1 = in1 - in2
212 // out2 = in1 + in2
213 .macro butterfly_4s_r out1, out2, in1, in2
214         sub             \out1\().4s, \in1\().4s, \in2\().4s
215         add             \out2\().4s, \in1\().4s, \in2\().4s
216 .endm
217
218 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
219 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
220 // out are 2 x .4s registers, in are 4 x .2d registers
221 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
222         add             \tmp1\().2d, \in1\().2d, \in3\().2d
223         add             \tmp2\().2d, \in2\().2d, \in4\().2d
224         sub             \tmp3\().2d, \in1\().2d, \in3\().2d
225         sub             \tmp4\().2d, \in2\().2d, \in4\().2d
226         rshrn           \out1\().2s, \tmp1\().2d,  #14
227         rshrn2          \out1\().4s, \tmp2\().2d,  #14
228         rshrn           \out2\().2s, \tmp3\().2d,  #14
229         rshrn2          \out2\().4s, \tmp4\().2d,  #14
230 .endm
231
232 .macro iwht4_10 c0, c1, c2, c3
233         add             \c0\().4s, \c0\().4s, \c1\().4s
234         sub             v17.4s,    \c2\().4s, \c3\().4s
235         sub             v16.4s,    \c0\().4s, v17.4s
236         sshr            v16.4s,    v16.4s,    #1
237         sub             \c2\().4s, v16.4s,    \c1\().4s
238         sub             \c1\().4s, v16.4s,    \c3\().4s
239         add             \c3\().4s, v17.4s,    \c2\().4s
240         sub             \c0\().4s, \c0\().4s, \c1\().4s
241 .endm
242
243 .macro iwht4_12 c0, c1, c2, c3
244         iwht4_10        \c0, \c1, \c2, \c3
245 .endm
246
247 .macro idct4_10 c0, c1, c2, c3
248         mul             v22.4s,    \c1\().4s, v0.s[3]
249         mul             v20.4s,    \c1\().4s, v0.s[2]
250         add             v16.4s,    \c0\().4s, \c2\().4s
251         sub             v17.4s,    \c0\().4s, \c2\().4s
252         mla             v22.4s,    \c3\().4s, v0.s[2]
253         mul             v18.4s,    v16.4s,    v0.s[0]
254         mul             v24.4s,    v17.4s,    v0.s[0]
255         mls             v20.4s,    \c3\().4s, v0.s[3]
256         srshr           v22.4s,    v22.4s,    #14
257         srshr           v18.4s,    v18.4s,    #14
258         srshr           v24.4s,    v24.4s,    #14
259         srshr           v20.4s,    v20.4s,    #14
260         add             \c0\().4s, v18.4s,    v22.4s
261         sub             \c3\().4s, v18.4s,    v22.4s
262         add             \c1\().4s, v24.4s,    v20.4s
263         sub             \c2\().4s, v24.4s,    v20.4s
264 .endm
265
266 .macro idct4_12 c0, c1, c2, c3
267         smull           v22.2d,    \c1\().2s, v0.s[3]
268         smull2          v23.2d,    \c1\().4s, v0.s[3]
269         smull           v20.2d,    \c1\().2s, v0.s[2]
270         smull2          v21.2d,    \c1\().4s, v0.s[2]
271         add             v16.4s,    \c0\().4s, \c2\().4s
272         sub             v17.4s,    \c0\().4s, \c2\().4s
273         smlal           v22.2d,    \c3\().2s, v0.s[2]
274         smlal2          v23.2d,    \c3\().4s, v0.s[2]
275         smull           v18.2d,    v16.2s,    v0.s[0]
276         smull2          v19.2d,    v16.4s,    v0.s[0]
277         smull           v24.2d,    v17.2s,    v0.s[0]
278         smull2          v25.2d,    v17.4s,    v0.s[0]
279         smlsl           v20.2d,    \c3\().2s, v0.s[3]
280         smlsl2          v21.2d,    \c3\().4s, v0.s[3]
281         rshrn           v22.2s,    v22.2d,    #14
282         rshrn2          v22.4s,    v23.2d,    #14
283         rshrn           v18.2s,    v18.2d,    #14
284         rshrn2          v18.4s,    v19.2d,    #14
285         rshrn           v24.2s,    v24.2d,    #14
286         rshrn2          v24.4s,    v25.2d,    #14
287         rshrn           v20.2s,    v20.2d,    #14
288         rshrn2          v20.4s,    v21.2d,    #14
289         add             \c0\().4s, v18.4s,    v22.4s
290         sub             \c3\().4s, v18.4s,    v22.4s
291         add             \c1\().4s, v24.4s,    v20.4s
292         sub             \c2\().4s, v24.4s,    v20.4s
293 .endm
294
295 .macro iadst4_10 c0, c1, c2, c3
296         mul             v16.4s,    \c0\().4s, v1.s[0]
297         mla             v16.4s,    \c2\().4s, v1.s[1]
298         mla             v16.4s,    \c3\().4s, v1.s[2]
299         mul             v18.4s,    \c0\().4s, v1.s[2]
300         mls             v18.4s,    \c2\().4s, v1.s[0]
301         sub             \c0\().4s, \c0\().4s, \c2\().4s
302         mls             v18.4s,    \c3\().4s, v1.s[1]
303         add             \c0\().4s, \c0\().4s, \c3\().4s
304         mul             v22.4s,    \c1\().4s, v1.s[3]
305         mul             v20.4s,    \c0\().4s, v1.s[3]
306         add             v24.4s,    v16.4s,    v22.4s
307         add             v26.4s,    v18.4s,    v22.4s
308         srshr           \c0\().4s, v24.4s,    #14
309         add             v16.4s,    v16.4s,    v18.4s
310         srshr           \c1\().4s, v26.4s,    #14
311         sub             v16.4s,    v16.4s,    v22.4s
312         srshr           \c2\().4s, v20.4s,    #14
313         srshr           \c3\().4s, v16.4s,    #14
314 .endm
315
316 .macro iadst4_12 c0, c1, c2, c3
317         smull           v16.2d,    \c0\().2s, v1.s[0]
318         smull2          v17.2d,    \c0\().4s, v1.s[0]
319         smlal           v16.2d,    \c2\().2s, v1.s[1]
320         smlal2          v17.2d,    \c2\().4s, v1.s[1]
321         smlal           v16.2d,    \c3\().2s, v1.s[2]
322         smlal2          v17.2d,    \c3\().4s, v1.s[2]
323         smull           v18.2d,    \c0\().2s, v1.s[2]
324         smull2          v19.2d,    \c0\().4s, v1.s[2]
325         smlsl           v18.2d,    \c2\().2s, v1.s[0]
326         smlsl2          v19.2d,    \c2\().4s, v1.s[0]
327         sub             \c0\().4s, \c0\().4s, \c2\().4s
328         smlsl           v18.2d,    \c3\().2s, v1.s[1]
329         smlsl2          v19.2d,    \c3\().4s, v1.s[1]
330         add             \c0\().4s, \c0\().4s, \c3\().4s
331         smull           v22.2d,    \c1\().2s, v1.s[3]
332         smull2          v23.2d,    \c1\().4s, v1.s[3]
333         smull           v20.2d,    \c0\().2s, v1.s[3]
334         smull2          v21.2d,    \c0\().4s, v1.s[3]
335         add             v24.2d,    v16.2d,    v22.2d
336         add             v25.2d,    v17.2d,    v23.2d
337         add             v26.2d,    v18.2d,    v22.2d
338         add             v27.2d,    v19.2d,    v23.2d
339         rshrn           \c0\().2s, v24.2d,    #14
340         rshrn2          \c0\().4s, v25.2d,    #14
341         add             v16.2d,    v16.2d,    v18.2d
342         add             v17.2d,    v17.2d,    v19.2d
343         rshrn           \c1\().2s, v26.2d,    #14
344         rshrn2          \c1\().4s, v27.2d,    #14
345         sub             v16.2d,    v16.2d,    v22.2d
346         sub             v17.2d,    v17.2d,    v23.2d
347         rshrn           \c2\().2s, v20.2d,    #14
348         rshrn2          \c2\().4s, v21.2d,    #14
349         rshrn           \c3\().2s, v16.2d,    #14
350         rshrn2          \c3\().4s, v17.2d,    #14
351 .endm
352
353 // The public functions in this file have got the following signature:
354 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
355
356 .macro itxfm_func4x4 txfm1, txfm2, bpp
357 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
358 .ifc \txfm1,\txfm2
359 .ifc \txfm1,idct
360         movrel          x4,  itxfm4_coeffs
361         ld1             {v0.4h}, [x4]
362         sxtl            v0.4s,  v0.4h
363 .endif
364 .ifc \txfm1,iadst
365         movrel          x4,  iadst4_coeffs
366         ld1             {v0.d}[1], [x4]
367         sxtl2           v1.4s,  v0.8h
368 .endif
369 .else
370         movrel          x4,  itxfm4_coeffs
371         ld1             {v0.8h}, [x4]
372         sxtl2           v1.4s,  v0.8h
373         sxtl            v0.4s,  v0.4h
374 .endif
375
376         movi            v30.4s, #0
377         movi            v31.4s, #0
378 .ifc \txfm1\()_\txfm2,idct_idct
379         cmp             w3,  #1
380         b.ne            1f
381         // DC-only for idct/idct
382         ld1             {v2.s}[0],  [x2]
383         smull           v2.2d,  v2.2s, v0.s[0]
384         rshrn           v2.2s,  v2.2d, #14
385         smull           v2.2d,  v2.2s, v0.s[0]
386         rshrn           v2.2s,  v2.2d, #14
387         st1             {v31.s}[0], [x2]
388         dup             v4.4s,  v2.s[0]
389         mov             v5.16b, v4.16b
390         mov             v6.16b, v4.16b
391         mov             v7.16b, v4.16b
392         b               2f
393 .endif
394
395 1:
396         ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
397         st1             {v30.4s,v31.4s}, [x2], #32
398
399 .ifc \txfm1,iwht
400         sshr            v4.4s,  v4.4s,  #2
401         sshr            v5.4s,  v5.4s,  #2
402         sshr            v6.4s,  v6.4s,  #2
403         sshr            v7.4s,  v7.4s,  #2
404 .endif
405
406         \txfm1\()4_\bpp v4,  v5,  v6,  v7
407
408         st1             {v30.4s,v31.4s}, [x2], #32
409         // Transpose 4x4 with 32 bit elements
410         transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
411
412         \txfm2\()4_\bpp v4,  v5,  v6,  v7
413 2:
414         mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
415         ld1             {v0.4h},   [x0], x1
416         ld1             {v1.4h},   [x0], x1
417 .ifnc \txfm1,iwht
418         srshr           v4.4s,  v4.4s,  #4
419         srshr           v5.4s,  v5.4s,  #4
420         srshr           v6.4s,  v6.4s,  #4
421         srshr           v7.4s,  v7.4s,  #4
422 .endif
423         uaddw           v4.4s,  v4.4s,  v0.4h
424         uaddw           v5.4s,  v5.4s,  v1.4h
425         ld1             {v2.4h},   [x0], x1
426         ld1             {v3.4h},   [x0], x1
427         sqxtun          v0.4h,  v4.4s
428         sqxtun2         v0.8h,  v5.4s
429         sub             x0,  x0,  x1, lsl #2
430
431         uaddw           v6.4s,  v6.4s,  v2.4h
432         umin            v0.8h,  v0.8h,  v31.8h
433         uaddw           v7.4s,  v7.4s,  v3.4h
434         st1             {v0.4h},   [x0], x1
435         sqxtun          v2.4h,  v6.4s
436         sqxtun2         v2.8h,  v7.4s
437         umin            v2.8h,  v2.8h,  v31.8h
438
439         st1             {v0.d}[1], [x0], x1
440         st1             {v2.4h},   [x0], x1
441         st1             {v2.d}[1], [x0], x1
442
443         ret
444 endfunc
445 .endm
446
447 .macro itxfm_funcs4x4 bpp
448 itxfm_func4x4 idct,  idct,  \bpp
449 itxfm_func4x4 iadst, idct,  \bpp
450 itxfm_func4x4 idct,  iadst, \bpp
451 itxfm_func4x4 iadst, iadst, \bpp
452 itxfm_func4x4 iwht,  iwht,  \bpp
453 .endm
454
455 itxfm_funcs4x4 10
456 itxfm_funcs4x4 12
457
458 function idct8x8_dc_add_neon
459         movrel          x4,  idct_coeffs
460         ld1             {v0.4h}, [x4]
461
462         movi            v1.4h,  #0
463         sxtl            v0.4s,  v0.4h
464
465         ld1             {v2.s}[0],  [x2]
466         smull           v2.2d,  v2.2s,  v0.s[0]
467         rshrn           v2.2s,  v2.2d,  #14
468         smull           v2.2d,  v2.2s,  v0.s[0]
469         rshrn           v2.2s,  v2.2d,  #14
470         st1             {v1.s}[0],  [x2]
471         dup             v2.4s,  v2.s[0]
472
473         srshr           v2.4s,  v2.4s,  #5
474
475         mov             x4,  #8
476         mov             x3,  x0
477         dup             v31.8h, w5
478 1:
479         // Loop to add the constant from v2 into all 8x8 outputs
480         subs            x4,  x4,  #2
481         ld1             {v3.8h},  [x0], x1
482         ld1             {v4.8h},  [x0], x1
483         uaddw           v16.4s, v2.4s,  v3.4h
484         uaddw2          v17.4s, v2.4s,  v3.8h
485         uaddw           v18.4s, v2.4s,  v4.4h
486         uaddw2          v19.4s, v2.4s,  v4.8h
487         sqxtun          v3.4h,  v16.4s
488         sqxtun2         v3.8h,  v17.4s
489         sqxtun          v4.4h,  v18.4s
490         sqxtun2         v4.8h,  v19.4s
491         umin            v3.8h,  v3.8h,  v31.8h
492         umin            v4.8h,  v4.8h,  v31.8h
493         st1             {v3.8h},  [x3], x1
494         st1             {v4.8h},  [x3], x1
495         b.ne            1b
496
497         ret
498 endfunc
499
500 .macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
501         dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
502         dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
503         dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
504         dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
505
506         butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
507         butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
508         butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
509         butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
510
511         dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
512
513         butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
514         butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
515         butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
516         butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
517 .endm
518
519 .macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
520         dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
521         dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
522
523         dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
524         dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
525
526         dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
527         dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
528
529         dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
530         dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
531
532         butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
533         neg             \r7\().4s, \r7\().4s // r7 = out[7]
534         butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
535
536         dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
537         dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
538
539         dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
540
541         dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
542         neg             \r3\().4s, \r3\().4s  // r3 = out[3]
543
544         dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
545         neg             \r1\().4s, \r1\().4s  // r1 = out[1]
546
547         dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
548         neg             \r5\().4s, \r5\().4s  // r5 = out[5]
549 .endm
550
551
552 .macro itxfm_func8x8 txfm1, txfm2
553 function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
554 .ifc \txfm1\()_\txfm2,idct_idct
555         cmp             w3,  #1
556         b.eq            idct8x8_dc_add_neon
557 .endif
558         // The iadst also uses a few coefficients from
559         // idct, so those always need to be loaded.
560 .ifc \txfm1\()_\txfm2,idct_idct
561         movrel          x4,  idct_coeffs
562 .else
563         movrel          x4,  iadst8_coeffs
564         ld1             {v1.8h}, [x4], #16
565         stp             d8,  d9,  [sp, #-0x10]!
566         sxtl2           v3.4s,  v1.8h
567         sxtl            v2.4s,  v1.4h
568 .endif
569         ld1             {v0.8h}, [x4]
570         sxtl2           v1.4s,  v0.8h
571         sxtl            v0.4s,  v0.4h
572
573         movi            v4.4s, #0
574         movi            v5.4s, #0
575         movi            v6.4s, #0
576         movi            v7.4s, #0
577
578 1:
579         ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
580         ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
581         ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
582         ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
583         sub             x2,  x2,  #256
584         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
585         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
586         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
587         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
588
589 .ifc \txfm1\()_\txfm2,idct_idct
590         idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
591         idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
592 .else
593         \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
594         \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
595 .endif
596
597         // Transpose 8x8 with 16 bit elements
598         transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
599
600 .ifc \txfm1\()_\txfm2,idct_idct
601         idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
602         idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
603 .else
604         \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
605         \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
606 .endif
607 2:
608         mov             x3,  x0
609         // Add into the destination
610         ld1             {v0.8h},  [x0], x1
611         srshr           v16.4s, v16.4s, #5
612         srshr           v17.4s, v17.4s, #5
613         ld1             {v1.8h},  [x0], x1
614         srshr           v18.4s, v18.4s, #5
615         srshr           v19.4s, v19.4s, #5
616         ld1             {v2.8h},  [x0], x1
617         srshr           v20.4s, v20.4s, #5
618         srshr           v21.4s, v21.4s, #5
619         uaddw           v16.4s, v16.4s, v0.4h
620         uaddw2          v17.4s, v17.4s, v0.8h
621         ld1             {v3.8h},  [x0], x1
622         srshr           v22.4s, v22.4s, #5
623         srshr           v23.4s, v23.4s, #5
624         uaddw           v18.4s, v18.4s, v1.4h
625         uaddw2          v19.4s, v19.4s, v1.8h
626         ld1             {v4.8h},  [x0], x1
627         srshr           v24.4s, v24.4s, #5
628         srshr           v25.4s, v25.4s, #5
629         uaddw           v20.4s, v20.4s, v2.4h
630         uaddw2          v21.4s, v21.4s, v2.8h
631         sqxtun          v0.4h,  v16.4s
632         sqxtun2         v0.8h,  v17.4s
633         dup             v16.8h, w5
634         ld1             {v5.8h},  [x0], x1
635         srshr           v26.4s, v26.4s, #5
636         srshr           v27.4s, v27.4s, #5
637         uaddw           v22.4s, v22.4s, v3.4h
638         uaddw2          v23.4s, v23.4s, v3.8h
639         sqxtun          v1.4h,  v18.4s
640         sqxtun2         v1.8h,  v19.4s
641         umin            v0.8h,  v0.8h,  v16.8h
642         ld1             {v6.8h},  [x0], x1
643         srshr           v28.4s, v28.4s, #5
644         srshr           v29.4s, v29.4s, #5
645         uaddw           v24.4s, v24.4s, v4.4h
646         uaddw2          v25.4s, v25.4s, v4.8h
647         sqxtun          v2.4h,  v20.4s
648         sqxtun2         v2.8h,  v21.4s
649         umin            v1.8h,  v1.8h,  v16.8h
650         ld1             {v7.8h},  [x0], x1
651         srshr           v30.4s, v30.4s, #5
652         srshr           v31.4s, v31.4s, #5
653         uaddw           v26.4s, v26.4s, v5.4h
654         uaddw2          v27.4s, v27.4s, v5.8h
655         sqxtun          v3.4h,  v22.4s
656         sqxtun2         v3.8h,  v23.4s
657         umin            v2.8h,  v2.8h,  v16.8h
658
659         st1             {v0.8h},  [x3], x1
660         uaddw           v28.4s, v28.4s, v6.4h
661         uaddw2          v29.4s, v29.4s, v6.8h
662         st1             {v1.8h},  [x3], x1
663         sqxtun          v4.4h,  v24.4s
664         sqxtun2         v4.8h,  v25.4s
665         umin            v3.8h,  v3.8h,  v16.8h
666         st1             {v2.8h},  [x3], x1
667         uaddw           v30.4s, v30.4s, v7.4h
668         uaddw2          v31.4s, v31.4s, v7.8h
669         st1             {v3.8h},  [x3], x1
670         sqxtun          v5.4h,  v26.4s
671         sqxtun2         v5.8h,  v27.4s
672         umin            v4.8h,  v4.8h,  v16.8h
673         st1             {v4.8h},  [x3], x1
674         sqxtun          v6.4h,  v28.4s
675         sqxtun2         v6.8h,  v29.4s
676         umin            v5.8h,  v5.8h,  v16.8h
677         st1             {v5.8h},  [x3], x1
678         sqxtun          v7.4h,  v30.4s
679         sqxtun2         v7.8h,  v31.4s
680         umin            v6.8h,  v6.8h,  v16.8h
681
682         st1             {v6.8h},  [x3], x1
683         umin            v7.8h,  v7.8h,  v16.8h
684         st1             {v7.8h},  [x3], x1
685
686 .ifnc \txfm1\()_\txfm2,idct_idct
687         ldp             d8,  d9,  [sp], 0x10
688 .endif
689         ret
690 endfunc
691
692 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
693         mov             x5,  #0x03ff
694         b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
695 endfunc
696
697 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
698         mov             x5,  #0x0fff
699         b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
700 endfunc
701 .endm
702
703 itxfm_func8x8 idct,  idct
704 itxfm_func8x8 iadst, idct
705 itxfm_func8x8 idct,  iadst
706 itxfm_func8x8 iadst, iadst
707
708
709 function idct16x16_dc_add_neon
710         movrel          x4,  idct_coeffs
711         ld1             {v0.4h}, [x4]
712         sxtl            v0.4s,  v0.4h
713
714         movi            v1.4h,  #0
715
716         ld1             {v2.s}[0],  [x2]
717         smull           v2.2d,  v2.2s,  v0.s[0]
718         rshrn           v2.2s,  v2.2d,  #14
719         smull           v2.2d,  v2.2s,  v0.s[0]
720         rshrn           v2.2s,  v2.2d,  #14
721         st1             {v1.s}[0],  [x2]
722         dup             v2.4s,  v2.s[0]
723
724         srshr           v0.4s,  v2.4s,  #6
725
726         mov             x3, x0
727         mov             x4, #16
728         dup             v31.8h, w13
729 1:
730         // Loop to add the constant from v2 into all 16x16 outputs
731         subs            x4,  x4,  #2
732         ld1             {v1.8h,v2.8h},  [x0], x1
733         uaddw           v16.4s, v0.4s,  v1.4h
734         uaddw2          v17.4s, v0.4s,  v1.8h
735         ld1             {v3.8h,v4.8h},  [x0], x1
736         uaddw           v18.4s, v0.4s,  v2.4h
737         uaddw2          v19.4s, v0.4s,  v2.8h
738         uaddw           v20.4s, v0.4s,  v3.4h
739         uaddw2          v21.4s, v0.4s,  v3.8h
740         uaddw           v22.4s, v0.4s,  v4.4h
741         uaddw2          v23.4s, v0.4s,  v4.8h
742         sqxtun          v1.4h,  v16.4s
743         sqxtun2         v1.8h,  v17.4s
744         sqxtun          v2.4h,  v18.4s
745         sqxtun2         v2.8h,  v19.4s
746         sqxtun          v3.4h,  v20.4s
747         sqxtun2         v3.8h,  v21.4s
748         sqxtun          v4.4h,  v22.4s
749         sqxtun2         v4.8h,  v23.4s
750         umin            v1.8h,  v1.8h,  v31.8h
751         umin            v2.8h,  v2.8h,  v31.8h
752         st1             {v1.8h,v2.8h},  [x3], x1
753         umin            v3.8h,  v3.8h,  v31.8h
754         umin            v4.8h,  v4.8h,  v31.8h
755         st1             {v3.8h,v4.8h},  [x3], x1
756         b.ne            1b
757
758         ret
759 endfunc
760
761 .macro idct16_end
762         butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
763         butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
764         butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
765         butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
766         butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
767         butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
768         butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
769         butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
770
771         dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
772         dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
773
774         butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
775         butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
776         butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
777         butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
778         butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
779         butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
780         butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
781         butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
782         ret
783 .endm
784
785 function idct16
786         dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
787         dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
788         dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
789         dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
790         dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
791         dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
792         dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
793         dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
794
795         butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
796         butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
797         butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
798         butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
799         butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
800         butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
801         butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
802         butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
803
804         dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
805         dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
806         dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
807         idct16_end
808 endfunc
809
810 function idct16_half
811         dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
812         dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
813         dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
814         dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
815         dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
816         dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
817         dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
818         dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
819
820         butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
821         butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
822         butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
823         butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
824         butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
825         butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
826         butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
827         butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
828
829         dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
830         dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
831         dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
832         idct16_end
833 endfunc
834
835 function idct16_quarter
836         dsmull_h        v24, v25, v19, v3.s[3]
837         dsmull_h        v4,  v5,  v17, v2.s[0]
838         dsmull_h        v7,  v6,  v18, v1.s[1]
839         dsmull_h        v30, v31, v18, v1.s[0]
840         neg             v24.2d,  v24.2d
841         neg             v25.2d,  v25.2d
842         dsmull_h        v29, v28, v17, v2.s[1]
843         dsmull_h        v26, v27, v19, v3.s[2]
844         dsmull_h        v22, v23, v16, v0.s[0]
845         drshrn_h        v24, v24, v25, #14
846         drshrn_h        v16, v4,  v5,  #14
847         drshrn_h        v7,  v7,  v6,  #14
848         drshrn_h        v6,  v30, v31, #14
849         drshrn_h        v29, v29, v28, #14
850         drshrn_h        v17, v26, v27, #14
851         drshrn_h        v28, v22, v23, #14
852
853         dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
854         dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
855         neg             v22.2d,  v22.2d
856         neg             v23.2d,  v23.2d
857         drshrn_h        v27, v20, v21, #14
858         drshrn_h        v21, v22, v23, #14
859         drshrn_h        v23, v18, v19, #14
860         drshrn_h        v25, v30, v31, #14
861         mov             v4.16b,  v28.16b
862         mov             v5.16b,  v28.16b
863         dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
864         mov             v20.16b, v28.16b
865         idct16_end
866 endfunc
867
868 function iadst16
869         ld1             {v0.8h,v1.8h}, [x11]
870         sxtl            v2.4s,  v1.4h
871         sxtl2           v3.4s,  v1.8h
872         sxtl2           v1.4s,  v0.8h
873         sxtl            v0.4s,  v0.4h
874
875         dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
876         dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
877         dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
878         dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
879         dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
880
881         dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
882         dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
883         dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
884         dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
885
886         dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
887         dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
888         dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
889         dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
890
891         dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
892         ld1             {v0.8h}, [x10]
893         dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
894         sxtl2           v1.4s,  v0.8h
895         sxtl            v0.4s,  v0.4h
896         dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
897         dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
898
899         dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
900         dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
901         dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
902         butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
903         dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
904
905         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
906         butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
907         dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
908         dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
909
910         butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
911         butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
912
913         dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
914         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
915
916         dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
917         dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
918         neg             v29.4s, v29.4s                   // v29 = out[13]
919
920         dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
921         dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
922
923         butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
924         butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
925
926         dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
927         neg             v19.4s, v19.4s                   // v19 = out[3]
928         dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
929
930         butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
931         butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
932
933         dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
934         dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
935         dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
936         dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
937
938         neg             v31.4s,  v5.4s                    // v31 = out[15]
939         neg             v17.4s,  v3.4s                    // v17 = out[1]
940
941         mov             v16.16b, v2.16b
942         mov             v30.16b, v4.16b
943         ret
944 endfunc
945
946 // Helper macros; we can't use these expressions directly within
947 // e.g. .irp due to the extra concatenation \(). Therefore wrap
948 // them in macros to allow using .irp below.
949 .macro load i, src, inc
950         ld1             {v\i\().4s},  [\src], \inc
951 .endm
952 .macro store i, dst, inc
953         st1             {v\i\().4s},  [\dst], \inc
954 .endm
955 .macro movi_v i, size, imm
956         movi            v\i\()\size,  \imm
957 .endm
958 .macro load_clear i, src, inc
959         ld1             {v\i\().4s}, [\src]
960         st1             {v4.4s},  [\src], \inc
961 .endm
962
963 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
964         srshr           \coef0, \coef0, #6
965         ld1             {v4.4h},   [x0], x1
966         srshr           \coef1, \coef1, #6
967         ld1             {v4.d}[1], [x3], x1
968         srshr           \coef2, \coef2, #6
969         ld1             {v5.4h},   [x0], x1
970         srshr           \coef3, \coef3, #6
971         uaddw           \coef0, \coef0, v4.4h
972         ld1             {v5.d}[1], [x3], x1
973         srshr           \coef4, \coef4, #6
974         uaddw2          \coef1, \coef1, v4.8h
975         ld1             {v6.4h},   [x0], x1
976         srshr           \coef5, \coef5, #6
977         uaddw           \coef2, \coef2, v5.4h
978         ld1             {v6.d}[1], [x3], x1
979         sqxtun          v4.4h,  \coef0
980         srshr           \coef6, \coef6, #6
981         uaddw2          \coef3, \coef3, v5.8h
982         ld1             {v7.4h},   [x0], x1
983         sqxtun2         v4.8h,  \coef1
984         srshr           \coef7, \coef7, #6
985         uaddw           \coef4, \coef4, v6.4h
986         ld1             {v7.d}[1], [x3], x1
987         umin            v4.8h,  v4.8h,  v8.8h
988         sub             x0,  x0,  x1, lsl #2
989         sub             x3,  x3,  x1, lsl #2
990         sqxtun          v5.4h,  \coef2
991         uaddw2          \coef5, \coef5, v6.8h
992         st1             {v4.4h},   [x0], x1
993         sqxtun2         v5.8h,  \coef3
994         uaddw           \coef6, \coef6, v7.4h
995         st1             {v4.d}[1], [x3], x1
996         umin            v5.8h,  v5.8h,  v8.8h
997         sqxtun          v6.4h,  \coef4
998         uaddw2          \coef7, \coef7, v7.8h
999         st1             {v5.4h},   [x0], x1
1000         sqxtun2         v6.8h,  \coef5
1001         st1             {v5.d}[1], [x3], x1
1002         umin            v6.8h,  v6.8h,  v8.8h
1003         sqxtun          v7.4h,  \coef6
1004         st1             {v6.4h},   [x0], x1
1005         sqxtun2         v7.8h,  \coef7
1006         st1             {v6.d}[1], [x3], x1
1007         umin            v7.8h,  v7.8h,  v8.8h
1008         st1             {v7.4h},   [x0], x1
1009         st1             {v7.d}[1], [x3], x1
1010 .endm
1011
1012 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
1013 // transpose into a horizontal 16x4 slice and store.
1014 // x0 = dst (temp buffer)
1015 // x1 = slice offset
1016 // x2 = src
1017 // x9 = input stride
1018 .macro itxfm16_1d_funcs txfm
1019 function \txfm\()16_1d_4x16_pass1_neon
1020         mov             x14, x30
1021
1022         movi            v4.4s, #0
1023 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1024         load_clear      \i,  x2,  x9
1025 .endr
1026
1027         bl              \txfm\()16
1028
1029         // Do four 4x4 transposes. Originally, v16-v31 contain the
1030         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1031         // contain the four transposed 4x4 blocks.
1032         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1033         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1034         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1035         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1036
1037         // Store the transposed 4x4 blocks horizontally.
1038         cmp             x1,  #12
1039         b.eq            1f
1040 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1041         store           \i,  x0,  #16
1042 .endr
1043         br              x14
1044 1:
1045         // Special case: For the last input column (x1 == 12),
1046         // which would be stored as the last row in the temp buffer,
1047         // don't store the first 4x4 block, but keep it in registers
1048         // for the first slice of the second pass (where it is the
1049         // last 4x4 block).
1050         add             x0,  x0,  #16
1051         st1             {v20.4s},  [x0], #16
1052         st1             {v24.4s},  [x0], #16
1053         st1             {v28.4s},  [x0], #16
1054         add             x0,  x0,  #16
1055         st1             {v21.4s},  [x0], #16
1056         st1             {v25.4s},  [x0], #16
1057         st1             {v29.4s},  [x0], #16
1058         add             x0,  x0,  #16
1059         st1             {v22.4s},  [x0], #16
1060         st1             {v26.4s},  [x0], #16
1061         st1             {v30.4s},  [x0], #16
1062         add             x0,  x0,  #16
1063         st1             {v23.4s},  [x0], #16
1064         st1             {v27.4s},  [x0], #16
1065         st1             {v31.4s},  [x0], #16
1066
1067         mov             v28.16b, v16.16b
1068         mov             v29.16b, v17.16b
1069         mov             v30.16b, v18.16b
1070         mov             v31.16b, v19.16b
1071         br              x14
1072 endfunc
1073
1074 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
1075 // load the destination pixels (from a similar 4x16 slice), add and store back.
1076 // x0 = dst
1077 // x1 = dst stride
1078 // x2 = src (temp buffer)
1079 // x3 = slice offset
1080 // x9 = temp buffer stride
1081 function \txfm\()16_1d_4x16_pass2_neon
1082         mov             x14, x30
1083
1084 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
1085         load            \i,  x2,  x9
1086 .endr
1087         cbz             x3,  1f
1088 .irp i, 28, 29, 30, 31
1089         load            \i,  x2,  x9
1090 .endr
1091 1:
1092
1093         add             x3,  x0,  x1
1094         lsl             x1,  x1,  #1
1095         bl              \txfm\()16
1096
1097         dup             v8.8h, w13
1098         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1099         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1100
1101         br              x14
1102 endfunc
1103 .endm
1104
1105 itxfm16_1d_funcs idct
1106 itxfm16_1d_funcs iadst
1107
1108 // This is the minimum eob value for each subpartition, in increments of 4
1109 const min_eob_idct_idct_16, align=4
1110         .short  0, 10, 38, 89
1111 endconst
1112
1113 .macro itxfm_func16x16 txfm1, txfm2
1114 function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1115 .ifc \txfm1\()_\txfm2,idct_idct
1116         cmp             w3,  #1
1117         b.eq            idct16x16_dc_add_neon
1118 .endif
1119         mov             x15, x30
1120         // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
1121 .ifnc \txfm1\()_\txfm2,idct_idct
1122         stp             d14, d15, [sp, #-0x10]!
1123         stp             d12, d13, [sp, #-0x10]!
1124         stp             d10, d11, [sp, #-0x10]!
1125 .endif
1126         stp             d8,  d9,  [sp, #-0x10]!
1127
1128         sub             sp,  sp,  #1024
1129
1130         mov             x4,  x0
1131         mov             x5,  x1
1132         mov             x6,  x2
1133
1134         movrel          x10, idct_coeffs
1135 .ifnc \txfm1\()_\txfm2,idct_idct
1136         movrel          x11, iadst16_coeffs
1137 .endif
1138 .ifc \txfm1,idct
1139         ld1             {v0.8h,v1.8h}, [x10]
1140         sxtl            v2.4s,  v1.4h
1141         sxtl2           v3.4s,  v1.8h
1142         sxtl2           v1.4s,  v0.8h
1143         sxtl            v0.4s,  v0.4h
1144 .endif
1145         mov             x9,  #64
1146
1147 .ifc \txfm1\()_\txfm2,idct_idct
1148         cmp             w3,  #10
1149         b.le            idct16x16_quarter_add_16_neon
1150         cmp             w3,  #38
1151         b.le            idct16x16_half_add_16_neon
1152
1153         movrel          x12, min_eob_idct_idct_16, 2
1154 .endif
1155
1156 .irp i, 0, 4, 8, 12
1157         add             x0,  sp,  #(\i*64)
1158 .ifc \txfm1\()_\txfm2,idct_idct
1159 .if \i > 0
1160         ldrh            w1,  [x12], #2
1161         cmp             w3,  w1
1162         mov             x1,  #(16 - \i)/4
1163         b.le            1f
1164 .endif
1165 .endif
1166         mov             x1,  #\i
1167         add             x2,  x6,  #(\i*4)
1168         bl              \txfm1\()16_1d_4x16_pass1_neon
1169 .endr
1170 .ifc \txfm1\()_\txfm2,iadst_idct
1171         ld1             {v0.8h,v1.8h}, [x10]
1172         sxtl            v2.4s,  v1.4h
1173         sxtl2           v3.4s,  v1.8h
1174         sxtl2           v1.4s,  v0.8h
1175         sxtl            v0.4s,  v0.4h
1176 .endif
1177
1178 .ifc \txfm1\()_\txfm2,idct_idct
1179         b               3f
1180 1:
1181         // Set v28-v31 to zero, for the in-register passthrough of
1182         // coefficients to pass 2.
1183         movi            v28.4s,  #0
1184         movi            v29.4s,  #0
1185         movi            v30.4s,  #0
1186         movi            v31.4s,  #0
1187 2:
1188         subs            x1,  x1,  #1
1189 .rept 4
1190         st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
1191 .endr
1192         b.ne            2b
1193 3:
1194 .endif
1195
1196 .irp i, 0, 4, 8, 12
1197         add             x0,  x4,  #(\i*2)
1198         mov             x1,  x5
1199         add             x2,  sp,  #(\i*4)
1200         mov             x3,  #\i
1201         bl              \txfm2\()16_1d_4x16_pass2_neon
1202 .endr
1203
1204         add             sp,  sp,  #1024
1205         ldp             d8,  d9,  [sp], 0x10
1206 .ifnc \txfm1\()_\txfm2,idct_idct
1207         ldp             d10, d11, [sp], 0x10
1208         ldp             d12, d13, [sp], 0x10
1209         ldp             d14, d15, [sp], 0x10
1210 .endif
1211         br              x15
1212 endfunc
1213
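// The bitdepth-specific entry points only differ in the maximum pixel value
// passed in w13, which the shared 16 bpp code uses for clipping the output.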
1214 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
1215         mov             x13, #0x03ff
1216         b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1217 endfunc
1218
1219 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
1220         mov             x13, #0x0fff
1221         b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1222 endfunc
1223 .endm
1224
1225 itxfm_func16x16 idct,  idct
1226 itxfm_func16x16 iadst, idct
1227 itxfm_func16x16 idct,  iadst
1228 itxfm_func16x16 iadst, iadst
1229
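// Quarter/half variants of the 16x16 pass functions: with a small eob
// (checked by the caller), only the top-left 4x4 (quarter) or 8x8 (half)
// coefficients can be nonzero, so only those rows and columns are loaded
// and transformed in pass 1.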
1230 function idct16_1d_4x16_pass1_quarter_neon
1231         mov             x14, x30
1232
1233         movi            v4.4s, #0
1234 .irp i, 16, 17, 18, 19
1235         load_clear      \i,  x2,  x9
1236 .endr
1237
1238         bl              idct16_quarter
1239
1240         // Do four 4x4 transposes. Originally, v16-v31 contain the
1241         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1242         // contain the four transposed 4x4 blocks.
1243         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1244         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1245         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1246         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1247
1248         // Store the transposed 4x4 blocks horizontally.
1249         // The first 4x4 block is kept in registers for the second pass;
1250         // the rest is stored in the temp buffer.
1251         add             x0,  x0,  #16
1252         st1             {v20.4s},  [x0], #16
1253         st1             {v24.4s},  [x0], #16
1254         st1             {v28.4s},  [x0], #16
1255         add             x0,  x0,  #16
1256         st1             {v21.4s},  [x0], #16
1257         st1             {v25.4s},  [x0], #16
1258         st1             {v29.4s},  [x0], #16
1259         add             x0,  x0,  #16
1260         st1             {v22.4s},  [x0], #16
1261         st1             {v26.4s},  [x0], #16
1262         st1             {v30.4s},  [x0], #16
1263         add             x0,  x0,  #16
1264         st1             {v23.4s},  [x0], #16
1265         st1             {v27.4s},  [x0], #16
1266         st1             {v31.4s},  [x0], #16
1267         br              x14
1268 endfunc
1269
1270 function idct16_1d_4x16_pass2_quarter_neon
1271         mov             x14, x30
1272
1273         // Only load the top 4 lines, and only do it for the later slices.
1274         // For the first slice, v16-v19 are kept in registers from the first pass.
1275         cbz             x3,  1f
1276 .irp i, 16, 17, 18, 19
1277         load            \i,  x2,  x9
1278 .endr
1279 1:
1280
1281         add             x3,  x0,  x1
1282         lsl             x1,  x1,  #1
1283         bl              idct16_quarter
1284
1285         dup             v8.8h, w13
1286         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1287         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1288
1289         br              x14
1290 endfunc
1291
1292 function idct16_1d_4x16_pass1_half_neon
1293         mov             x14, x30
1294
1295         movi            v4.4s, #0
1296 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1297         load_clear      \i,  x2,  x9
1298 .endr
1299
1300         bl              idct16_half
1301
1302         // Do four 4x4 transposes. Originally, v16-v31 contain the
1303         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1304         // contain the four transposed 4x4 blocks.
1305         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1306         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1307         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1308         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1309
1310         // Store the transposed 4x4 blocks horizontally.
1311         cmp             x1,  #4
1312         b.eq            1f
1313 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1314         store           \i,  x0,  #16
1315 .endr
1316         br              x14
1317 1:
1318         // Special case: For the second input column (x1 == 4),
1319         // which would be stored as the second row in the temp buffer,
1320         // don't store the first 4x4 block, but keep it in registers
1321         // for the first slice of the second pass (where it is the
1322         // second 4x4 block).
1323         add             x0,  x0,  #16
1324         st1             {v20.4s},  [x0], #16
1325         st1             {v24.4s},  [x0], #16
1326         st1             {v28.4s},  [x0], #16
1327         add             x0,  x0,  #16
1328         st1             {v21.4s},  [x0], #16
1329         st1             {v25.4s},  [x0], #16
1330         st1             {v29.4s},  [x0], #16
1331         add             x0,  x0,  #16
1332         st1             {v22.4s},  [x0], #16
1333         st1             {v26.4s},  [x0], #16
1334         st1             {v30.4s},  [x0], #16
1335         add             x0,  x0,  #16
1336         st1             {v23.4s},  [x0], #16
1337         st1             {v27.4s},  [x0], #16
1338         st1             {v31.4s},  [x0], #16
1339
1340         mov             v20.16b, v16.16b
1341         mov             v21.16b, v17.16b
1342         mov             v22.16b, v18.16b
1343         mov             v23.16b, v19.16b
1344         br              x14
1345 endfunc
1346
1347 function idct16_1d_4x16_pass2_half_neon
1348         mov             x14, x30
1349
1350 .irp i, 16, 17, 18, 19
1351         load            \i,  x2,  x9
1352 .endr
1353         cbz             x3,  1f
1354 .irp i, 20, 21, 22, 23
1355         load            \i,  x2,  x9
1356 .endr
1357 1:
1358
1359         add             x3,  x0,  x1
1360         lsl             x1,  x1,  #1
1361         bl              idct16_half
1362
1363         dup             v8.8h, w13
1364         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1365         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1366
1367         br              x14
1368 endfunc
1369
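// Drivers for the partial 16x16 IDCTs. The eob thresholds (10 and 38) are
// checked by the caller before branching here.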
1370 .macro idct16_partial size
1371 function idct16x16_\size\()_add_16_neon
1372         add             x0,  sp,  #(0*64)
1373         mov             x1,  #0
1374         add             x2,  x6,  #(0*4)
1375         bl              idct16_1d_4x16_pass1_\size\()_neon
1376 .ifc \size,half
1377         add             x0,  sp,  #(4*64)
1378         mov             x1,  #4
1379         add             x2,  x6,  #(4*4)
1380         bl              idct16_1d_4x16_pass1_\size\()_neon
1381 .endif
1382
1383 .irp i, 0, 4, 8, 12
1384         add             x0,  x4,  #(\i*2)
1385         mov             x1,  x5
1386         add             x2,  sp,  #(\i*4)
1387         mov             x3,  #\i
1388         bl              idct16_1d_4x16_pass2_\size\()_neon
1389 .endr
1390
1391         add             sp,  sp,  #1024
1392         ldp             d8,  d9,  [sp], 0x10
1393         br              x15
1394 endfunc
1395 .endm
1396
1397 idct16_partial quarter
1398 idct16_partial half
1399
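// Special case for eob == 1: only the DC coefficient is nonzero.
// Conceptually (illustrative C, rounding matching rshrn/srshr below):
//
//     dc = (in[0] * 11585 + (1 << 13)) >> 14;   // row transform
//     dc = (dc    * 11585 + (1 << 13)) >> 14;   // column transform
//     for each pixel: out = clip(pixel + ((dc + 32) >> 6), 0, max)   // max = w13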
1400 function idct32x32_dc_add_neon
1401         movrel          x4,  idct_coeffs
1402         ld1             {v0.4h}, [x4]
1403         sxtl            v0.4s,  v0.4h
1404
1405         movi            v1.4h,  #0
1406
1407         ld1             {v2.s}[0],  [x2]
1408         smull           v2.2d,  v2.2s,  v0.s[0]
1409         rshrn           v2.2s,  v2.2d,  #14
1410         smull           v2.2d,  v2.2s,  v0.s[0]
1411         rshrn           v2.2s,  v2.2d,  #14
1412         st1             {v1.s}[0],  [x2]
1413         dup             v2.4s,  v2.s[0]
1414
1415         srshr           v0.4s,  v2.4s,  #6
1416
1417         mov             x3,  x0
1418         mov             x4,  #32
1419         sub             x1,  x1,  #32
1420         dup             v31.8h, w13
1421 1:
1422         // Loop to add the constant v0 into all 32x32 outputs
1423         subs            x4,  x4,  #1
1424         ld1             {v1.8h,v2.8h},  [x0], #32
1425         uaddw           v16.4s, v0.4s,  v1.4h
1426         uaddw2          v17.4s, v0.4s,  v1.8h
1427         ld1             {v3.8h,v4.8h},  [x0], x1
1428         uaddw           v18.4s, v0.4s,  v2.4h
1429         uaddw2          v19.4s, v0.4s,  v2.8h
1430         uaddw           v20.4s, v0.4s,  v3.4h
1431         uaddw2          v21.4s, v0.4s,  v3.8h
1432         uaddw           v22.4s, v0.4s,  v4.4h
1433         uaddw2          v23.4s, v0.4s,  v4.8h
1434         sqxtun          v1.4h,  v16.4s
1435         sqxtun2         v1.8h,  v17.4s
1436         sqxtun          v2.4h,  v18.4s
1437         sqxtun2         v2.8h,  v19.4s
1438         sqxtun          v3.4h,  v20.4s
1439         sqxtun2         v3.8h,  v21.4s
1440         sqxtun          v4.4h,  v22.4s
1441         sqxtun2         v4.8h,  v23.4s
1442         umin            v1.8h,  v1.8h,  v31.8h
1443         umin            v2.8h,  v2.8h,  v31.8h
1444         st1             {v1.8h,v2.8h},  [x3], #32
1445         umin            v3.8h,  v3.8h,  v31.8h
1446         umin            v4.8h,  v4.8h,  v31.8h
1447         st1             {v3.8h,v4.8h},  [x3], x1
1448         b.ne            1b
1449
1450         ret
1451 endfunc
1452
1453 .macro idct32_end
1454         butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1455         butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1456         butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1457         butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1458         butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1459         butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1460         butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
1461         butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1462
1463         dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1464         dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
1465         dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1466         dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1467
1468         butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1469         butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1470         butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1471         butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1472         butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1473         butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1474         butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1475         butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
1476
1477         dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
1478         dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
1479         dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
1480         dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
1481         ret
1482 .endm
1483
1484 function idct32_odd
1485         dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1486         dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1487         dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1488         dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1489         dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1490         dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1491         dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1492         dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1493
1494         butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1495         butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1496         butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1497         butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1498         butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1499         butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1500         butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1501         butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1502
1503         dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1504         dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1505         dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1506         dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1507         idct32_end
1508 endfunc
1509
1510 function idct32_odd_half
1511         dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1512         dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1513         dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1514         dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1515         dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1516         dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1517         dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1518         dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1519
1520         butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1521         butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1522         butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1523         butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1524         butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1525         butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1526         butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1527         butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1528
1529         dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1530         dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1531         dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1532         dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1533         idct32_end
1534 endfunc
1535
1536 function idct32_odd_quarter
1537         dsmull_h        v4,  v5,  v16, v10.s[0]
1538         dsmull_h        v28, v29, v19, v11.s[3]
1539         dsmull_h        v30, v31, v16, v10.s[1]
1540         dsmull_h        v22, v23, v17, v13.s[2]
1541         dsmull_h        v7,  v6,  v17, v13.s[3]
1542         dsmull_h        v26, v27, v19, v11.s[2]
1543         dsmull_h        v20, v21, v18, v12.s[0]
1544         dsmull_h        v24, v25, v18, v12.s[1]
1545
1546         neg             v28.2d, v28.2d
1547         neg             v29.2d, v29.2d
1548         neg             v7.2d,  v7.2d
1549         neg             v6.2d,  v6.2d
1550
1551         drshrn_h        v4,  v4,  v5,  #14
1552         drshrn_h        v5,  v28, v29, #14
1553         drshrn_h        v29, v30, v31, #14
1554         drshrn_h        v28, v22, v23, #14
1555         drshrn_h        v7,  v7,  v6,  #14
1556         drshrn_h        v31, v26, v27, #14
1557         drshrn_h        v6,  v20, v21, #14
1558         drshrn_h        v30, v24, v25, #14
1559
1560         dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
1561         dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
1562         drshrn_h        v23, v16, v17, #14
1563         drshrn_h        v24, v18, v19, #14
1564         neg             v20.2d, v20.2d
1565         neg             v21.2d, v21.2d
1566         drshrn_h        v27, v27, v26, #14
1567         drshrn_h        v20, v20, v21, #14
1568         dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
1569         drshrn_h        v21, v16, v17, #14
1570         drshrn_h        v26, v18, v19, #14
1571         dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
1572         drshrn_h        v25, v16, v17, #14
1573         neg             v18.2d, v18.2d
1574         neg             v19.2d, v19.2d
1575         drshrn_h        v22, v18, v19, #14
1576
1577         idct32_end
1578 endfunc
1579
1580 .macro idct32_funcs suffix
1581 // Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1582 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1583 // a normal IDCT16 with every other input component (the even ones, with
1584 // each output written twice), followed by a separate 16-point IDCT
1585 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
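// Conceptually (illustrative C-like pseudocode, not the literal code flow):
//
//     // even[j] = in[2*j], odd[j] = in[2*j+1], for j = 0..15
//     idct16(even);                  // produces t0..t15
//     idct16_of_odd_inputs(odd);     // produces t16..t31 (idct32_odd* above)
//     for (j = 0; j < 16; j++) {
//         out[j]      = even[j] + odd[15 - j];
//         out[31 - j] = even[j] - odd[15 - j];
//     }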
1586 // x0 = dst (temp buffer)
1587 // x1 = unused
1588 // x2 = src
1589 // x9 = double input stride
1590 function idct32_1d_4x32_pass1\suffix\()_neon
1591         mov             x14, x30
1592
1593         movi            v4.4s,  #0
1594
1595         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1596 .ifb \suffix
1597 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1598         load_clear      \i, x2, x9
1599 .endr
1600 .endif
1601 .ifc \suffix,_quarter
1602 .irp i, 16, 17, 18, 19
1603         load_clear      \i, x2, x9
1604 .endr
1605 .endif
1606 .ifc \suffix,_half
1607 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1608         load_clear      \i, x2, x9
1609 .endr
1610 .endif
1611
1612         bl              idct16\suffix
1613
1614         // Do four 4x4 transposes. Originally, v16-v31 contain the
1615         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1616         // contain the four transposed 4x4 blocks.
1617         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1618         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1619         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1620         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1621
1622         // Store the registers a, b, c, d horizontally, followed by the
1623         // same registers d, c, b, a mirrored.
1624 .macro store_rev a, b, c, d
1625         // There's no rev128 instruction, but we reverse each 64-bit half,
1626         // and then swap the halves using an ext with an 8 byte offset.
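        // For example, with .4s elements {a, b, c, d} (lowest first):
        // rev64 gives {b, a, d, c}, and ext #8 then gives {d, c, b, a}.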
1627         rev64           v7.4s, \d
1628         st1             {\a},  [x0], #16
1629         ext             v7.16b, v7.16b, v7.16b, #8
1630         st1             {\b},  [x0], #16
1631         rev64           v6.4s, \c
1632         st1             {\c},  [x0], #16
1633         ext             v6.16b, v6.16b, v6.16b, #8
1634         st1             {\d},  [x0], #16
1635         rev64           v5.4s, \b
1636         st1             {v7.4s},  [x0], #16
1637         ext             v5.16b, v5.16b, v5.16b, #8
1638         st1             {v6.4s},  [x0], #16
1639         rev64           v4.4s, \a
1640         st1             {v5.4s},  [x0], #16
1641         ext             v4.16b, v4.16b, v4.16b, #8
1642         st1             {v4.4s},  [x0], #16
1643 .endm
1644         store_rev       v16.4s, v20.4s, v24.4s, v28.4s
1645         store_rev       v17.4s, v21.4s, v25.4s, v29.4s
1646         store_rev       v18.4s, v22.4s, v26.4s, v30.4s
1647         store_rev       v19.4s, v23.4s, v27.4s, v31.4s
1648         sub             x0,  x0,  #512
1649 .purgem store_rev
1650
1651         // Move x2 back to the start of the input, and move
1652         // to the first odd row
1653 .ifb \suffix
1654         sub             x2,  x2,  x9, lsl #4
1655 .endif
1656 .ifc \suffix,_quarter
1657         sub             x2,  x2,  x9, lsl #2
1658 .endif
1659 .ifc \suffix,_half
1660         sub             x2,  x2,  x9, lsl #3
1661 .endif
1662         add             x2,  x2,  #128
1663
1664         movi            v4.4s,  #0
1665         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1666 .ifb \suffix
1667 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1668         load_clear      \i, x2, x9
1669 .endr
1670 .endif
1671 .ifc \suffix,_quarter
1672 .irp i, 16, 17, 18, 19
1673         load_clear      \i, x2, x9
1674 .endr
1675 .endif
1676 .ifc \suffix,_half
1677 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1678         load_clear      \i, x2, x9
1679 .endr
1680 .endif
1681
1682         bl              idct32_odd\suffix
1683
1684         transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
1685         transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
1686         transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
1687         transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
1688
1689         // Store the registers a, b, c, d horizontally,
1690         // adding into the output first, and then the mirrored values,
1691         // subtracted from the output.
1692 .macro store_rev a, b, c, d, a16b, b16b
1693         ld1             {v4.4s},  [x0]
1694         rev64           v9.4s, \d
1695         add             v4.4s, v4.4s, \a
1696         st1             {v4.4s},  [x0], #16
1697         rev64           v8.4s, \c
1698         ld1             {v4.4s},  [x0]
1699         ext             v9.16b, v9.16b, v9.16b, #8
1700         add             v4.4s, v4.4s, \b
1701         st1             {v4.4s},  [x0], #16
1702         ext             v8.16b, v8.16b, v8.16b, #8
1703         ld1             {v4.4s},  [x0]
1704         rev64           \b, \b
1705         add             v4.4s, v4.4s, \c
1706         st1             {v4.4s},  [x0], #16
1707         rev64           \a, \a
1708         ld1             {v4.4s},  [x0]
1709         ext             \b16b, \b16b, \b16b, #8
1710         add             v4.4s, v4.4s, \d
1711         st1             {v4.4s},  [x0], #16
1712         ext             \a16b, \a16b, \a16b, #8
1713         ld1             {v4.4s},  [x0]
1714         sub             v4.4s, v4.4s, v9.4s
1715         st1             {v4.4s},  [x0], #16
1716         ld1             {v4.4s},  [x0]
1717         sub             v4.4s, v4.4s, v8.4s
1718         st1             {v4.4s},  [x0], #16
1719         ld1             {v4.4s},  [x0]
1720         sub             v4.4s, v4.4s, \b
1721         st1             {v4.4s},  [x0], #16
1722         ld1             {v4.4s},  [x0]
1723         sub             v4.4s, v4.4s, \a
1724         st1             {v4.4s},  [x0], #16
1725 .endm
1726
1727         store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
1728         store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
1729         store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
1730         store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
1731 .purgem store_rev
1732         br              x14
1733 endfunc
1734
1735 // This is mostly the same as 4x32_pass1, but without the transpose;
1736 // it uses the source as a temp buffer between the two idct passes,
1737 // and adds into the destination.
1738 // x0 = dst
1739 // x1 = dst stride
1740 // x2 = src (temp buffer)
1741 // x7 = negative double temp buffer stride
1742 // x9 = double temp buffer stride
1743 function idct32_1d_4x32_pass2\suffix\()_neon
1744         mov             x14, x30
1745
1746         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1747 .ifb \suffix
1748 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1749         load            \i, x2, x9
1750 .endr
1751         sub             x2,  x2,  x9, lsl #4
1752 .endif
1753 .ifc \suffix,_quarter
1754 .irp i, 16, 17, 18, 19
1755         load            \i, x2, x9
1756 .endr
1757         sub             x2,  x2,  x9, lsl #2
1758 .endif
1759 .ifc \suffix,_half
1760 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1761         load            \i, x2, x9
1762 .endr
1763         sub             x2,  x2,  x9, lsl #3
1764 .endif
1765
1766         bl              idct16\suffix
1767
1768 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1769         store           \i, x2, x9
1770 .endr
1771
1772         sub             x2,  x2,  x9, lsl #4
1773         add             x2,  x2,  #128
1774
1775         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1776 .ifb \suffix
1777 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1778         load            \i, x2, x9
1779 .endr
1780         sub             x2,  x2,  x9, lsl #4
1781 .endif
1782 .ifc \suffix,_quarter
1783 .irp i, 16, 17, 18, 19
1784         load            \i, x2, x9
1785 .endr
1786         sub             x2,  x2,  x9, lsl #2
1787 .endif
1788 .ifc \suffix,_half
1789 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1790         load            \i, x2, x9
1791 .endr
1792         sub             x2,  x2,  x9, lsl #3
1793 .endif
1794         sub             x2,  x2,  #128
1795
1796         bl              idct32_odd\suffix
1797
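// Load four rows of the even-IDCT output from the temp buffer, add the
// odd-IDCT outputs held in registers (or subtract them, for the mirrored
// bottom half when neg=1), round with >> 6, add the destination pixels,
// and clip the result to [0, max] (max in v15) before storing.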
1798 .macro load_acc_store a, b, c, d, neg=0
1799 .if \neg == 0
1800         ld1             {v4.4s},  [x2], x9
1801         ld1             {v5.4s},  [x2], x9
1802         add             v4.4s, v4.4s, \a
1803         ld1             {v6.4s},  [x2], x9
1804         add             v5.4s, v5.4s, \b
1805         ld1             {v7.4s},  [x2], x9
1806         add             v6.4s, v6.4s, \c
1807         add             v7.4s, v7.4s, \d
1808 .else
1809         ld1             {v4.4s},  [x2], x7
1810         ld1             {v5.4s},  [x2], x7
1811         sub             v4.4s, v4.4s, \a
1812         ld1             {v6.4s},  [x2], x7
1813         sub             v5.4s, v5.4s, \b
1814         ld1             {v7.4s},  [x2], x7
1815         sub             v6.4s, v6.4s, \c
1816         sub             v7.4s, v7.4s, \d
1817 .endif
1818         ld1             {v8.4h},   [x0], x1
1819         ld1             {v8.d}[1], [x0], x1
1820         srshr           v4.4s, v4.4s, #6
1821         ld1             {v9.4h},   [x0], x1
1822         srshr           v5.4s, v5.4s, #6
1823         uaddw           v4.4s, v4.4s, v8.4h
1824         ld1             {v9.d}[1], [x0], x1
1825         srshr           v6.4s, v6.4s, #6
1826         uaddw2          v5.4s, v5.4s, v8.8h
1827         srshr           v7.4s, v7.4s, #6
1828         sub             x0,  x0,  x1, lsl #2
1829         uaddw           v6.4s, v6.4s, v9.4h
1830         sqxtun          v4.4h, v4.4s
1831         uaddw2          v7.4s, v7.4s, v9.8h
1832         sqxtun2         v4.8h, v5.4s
1833         umin            v4.8h, v4.8h, v15.8h
1834         st1             {v4.4h},   [x0], x1
1835         sqxtun          v5.4h, v6.4s
1836         st1             {v4.d}[1], [x0], x1
1837         sqxtun2         v5.8h, v7.4s
1838         umin            v5.8h, v5.8h, v15.8h
1839         st1             {v5.4h},   [x0], x1
1840         st1             {v5.d}[1], [x0], x1
1841 .endm
1842         load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
1843         load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
1844         load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
1845         load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
1846         sub             x2,  x2,  x9
1847         load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
1848         load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
1849         load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
1850         load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
1851 .purgem load_acc_store
1852         br              x14
1853 endfunc
1854 .endm
1855
1856 idct32_funcs
1857 idct32_funcs _quarter
1858 idct32_funcs _half
1859
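// Minimum eob value for each 4-column slice of the 32x32 transform,
// same scheme as min_eob_idct_idct_16 above.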
1860 const min_eob_idct_idct_32, align=4
1861         .short  0, 9, 34, 70, 135, 240, 336, 448
1862 endconst
1863
1864 function vp9_idct_idct_32x32_add_16_neon
1865         cmp             w3,  #1
1866         b.eq            idct32x32_dc_add_neon
1867
1868         movrel          x10, idct_coeffs
1869
1870         mov             x15, x30
1871         stp             d8,  d9,  [sp, #-0x10]!
1872         stp             d10, d11, [sp, #-0x10]!
1873         stp             d12, d13, [sp, #-0x10]!
1874         stp             d14, d15, [sp, #-0x10]!
1875
1876         sub             sp,  sp,  #4096
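        // 32x32 temp buffer of 32-bit coefficients: 32*32*4 = 4096 bytes.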
1877
1878         mov             x4,  x0
1879         mov             x5,  x1
1880         mov             x6,  x2
1881
1882         // Double stride of the input, since we only read every other line
1883         mov             x9,  #256
1884         neg             x7,  x9
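        // x9 = 2 * 32 * 4 bytes; x7 is the corresponding negative stride,
        // used when walking the temp buffer backwards for the mirrored
        // bottom half in pass 2.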
1885
1886         ld1             {v0.8h,v1.8h},   [x10], #32
1887         sxtl            v2.4s,  v1.4h
1888         sxtl2           v3.4s,  v1.8h
1889         sxtl2           v1.4s,  v0.8h
1890         sxtl            v0.4s,  v0.4h
1891         ld1             {v10.8h,v11.8h}, [x10]
1892         sxtl            v12.4s, v11.4h
1893         sxtl2           v13.4s, v11.8h
1894         sxtl2           v11.4s, v10.8h
1895         sxtl            v10.4s, v10.4h
1896
1897         dup             v15.8h, w13
1898
1899         cmp             w3,  #34
1900         b.le            idct32x32_quarter_add_16_neon
1901         cmp             w3,  #135
1902         b.le            idct32x32_half_add_16_neon
1903
1904         movrel          x12, min_eob_idct_idct_32, 2
1905
1906 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1907         add             x0,  sp,  #(\i*128)
1908 .if \i > 0
1909         ldrh            w1,  [x12], #2
1910         cmp             w3,  w1
1911         mov             x1,  #(32 - \i)/4
1912         b.le            1f
1913 .endif
1914         add             x2,  x6,  #(\i*4)
1915         bl              idct32_1d_4x32_pass1_neon
1916 .endr
1917         b               3f
1918
1919 1:
1920         // Write zeros to the temp buffer for pass 2
1921         movi            v16.4s,  #0
1922         movi            v17.4s,  #0
1923         movi            v18.4s,  #0
1924         movi            v19.4s,  #0
1925 2:
1926         subs            x1,  x1,  #1
1927 .rept 4
1928         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1929         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1930 .endr
1931         b.ne            2b
1932 3:
1933 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1934         add             x0,  x4,  #(\i*2)
1935         mov             x1,  x5
1936         add             x2,  sp,  #(\i*4)
1937         bl              idct32_1d_4x32_pass2_neon
1938 .endr
1939
1940         add             sp,  sp,  #4096
1941         ldp             d14, d15, [sp], 0x10
1942         ldp             d12, d13, [sp], 0x10
1943         ldp             d10, d11, [sp], 0x10
1944         ldp             d8,  d9,  [sp], 0x10
1945
1946         br              x15
1947 endfunc
1948
1949 function ff_vp9_idct_idct_32x32_add_10_neon, export=1
1950         mov             x13, #0x03ff
1951         b               vp9_idct_idct_32x32_add_16_neon
1952 endfunc
1953
1954 function ff_vp9_idct_idct_32x32_add_12_neon, export=1
1955         mov             x13, #0x0fff
1956         b               vp9_idct_idct_32x32_add_16_neon
1957 endfunc
1958
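// Drivers for the partial 32x32 IDCTs: the quarter case (eob <= 34) only
// needs pass 1 on the first 8 columns, the half case (eob <= 135) on the
// first 16; the eob thresholds are checked by the caller before branching here.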
1959 .macro idct32_partial size
1960 function idct32x32_\size\()_add_16_neon
1961 .irp i, 0, 4
1962         add             x0,  sp,  #(\i*128)
1963 .ifc \size,quarter
1964 .if \i == 4
1965         cmp             w3,  #9
1966         b.le            1f
1967 .endif
1968 .endif
1969         add             x2,  x6,  #(\i*4)
1970         bl              idct32_1d_4x32_pass1_\size\()_neon
1971 .endr
1972
1973 .ifc \size,half
1974 .irp i, 8, 12
1975         add             x0,  sp,  #(\i*128)
1976 .if \i == 12
1977         cmp             w3,  #70
1978         b.le            1f
1979 .endif
1980         add             x2,  x6,  #(\i*4)
1981         bl              idct32_1d_4x32_pass1_\size\()_neon
1982 .endr
1983 .endif
1984         b               3f
1985
1986 1:
1987         // Write zeros to the temp buffer for pass 2
1988         movi            v16.4s,  #0
1989         movi            v17.4s,  #0
1990         movi            v18.4s,  #0
1991         movi            v19.4s,  #0
1992
1993 .rept 4
1994         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1995         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1996 .endr
1997
1998 3:
1999 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
2000         add             x0,  x4,  #(\i*2)
2001         mov             x1,  x5
2002         add             x2,  sp,  #(\i*4)
2003         bl              idct32_1d_4x32_pass2_\size\()_neon
2004 .endr
2005
2006         add             sp,  sp,  #4096
2007         ldp             d14, d15, [sp], 0x10
2008         ldp             d12, d13, [sp], 0x10
2009         ldp             d10, d11, [sp], 0x10
2010         ldp             d8,  d9,  [sp], 0x10
2011
2012         br              x15
2013 endfunc
2014 .endm
2015
2016 idct32_partial quarter
2017 idct32_partial half