/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * Ported from arm/hevcdsp_idct_neon.S by
 * Copyright (c) 2020 Reimar Döffinger
 * Copyright (c) 2020 Josh Dekker
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

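// HEVC transform coefficient constants: the first eight values (loaded as v0)
// feed the 4- and 8-point butterflies below, the next eight (loaded as v1)
// are the odd-part factors of the 16-point transform, and the remaining rows
// hold the odd-part factors of the larger transforms.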
const trans, align=4
        .short 64, 83, 64, 36
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
        .short 90, 90, 88, 85
        .short 82, 78, 73, 67
        .short 61, 54, 46, 38
        .short 31, 22, 13, 4
endconst

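// clamp the 16-bit lanes of \in1 and \in2 to the range [\c1, \c2]
// (for 10 bit content: [0, 0x3FF])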
.macro clip10 in1, in2, c1, c2
        smax        \in1, \in1, \c1
        smax        \in2, \in2, \c1
        smin        \in1, \in1, \c2
        smin        \in2, \in2, \c2
.endm

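// ff_hevc_add_residual_NxN_<depth>_neon:
//   x0: destination pixels, x1: int16_t residual coefficients, x2: dst stride
// Each function adds the residual block to the destination pixels, saturating
// to the pixel range of the given bit depth.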
function ff_hevc_add_residual_4x4_8_neon, export=1
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.s}[0], [x0], x2
        ld1             {v2.s}[1], [x0], x2
        ld1             {v2.s}[2], [x0], x2
        ld1             {v2.s}[3], [x0], x2
        sub              x0,  x0,  x2, lsl #2
        uxtl             v6.8h,  v2.8b
        uxtl2            v7.8h,  v2.16b
        sqadd            v0.8h,  v0.8h, v6.8h
        sqadd            v1.8h,  v1.8h, v7.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2          v0.16b, v1.8h
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[2], [x0], x2
        st1             {v0.s}[3], [x0], x2
        ret
endfunc

function ff_hevc_add_residual_4x4_10_neon, export=1
        mov             x12,  x0
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.d}[0], [x12], x2
        ld1             {v2.d}[1], [x12], x2
        ld1             {v3.d}[0], [x12], x2
        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.d}[1], [x12], x2
        movi             v4.8h, #0
        sqadd            v1.8h, v1.8h, v3.8h
        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.d}[0],  [x0], x2
        st1             {v0.d}[1],  [x0], x2
        st1             {v1.d}[0],  [x0], x2
        st1             {v1.d}[1],  [x0], x2
        ret
endfunc

function ff_hevc_add_residual_8x8_8_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #8
1:      subs             x3,  x3, #2
        ld1             {v2.d}[0],     [x0]
        ld1             {v2.d}[1],    [x12]
        uxtl             v3.8h,  v2.8b
        ld1             {v0.8h-v1.8h}, [x1], #32
        uxtl2            v2.8h,  v2.16b
        sqadd            v0.8h,  v0.8h,   v3.8h
        sqadd            v1.8h,  v1.8h,   v2.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2          v0.16b, v1.8h
        st1             {v0.d}[0],     [x0], x2
        st1             {v0.d}[1],    [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_8x8_10_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #8
        movi             v4.8h, #0
        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs             x3,  x3, #2
        ld1             {v0.8h-v1.8h}, [x1], #32
        ld1             {v2.8h},       [x0]
        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.8h},      [x12]
        sqadd            v1.8h, v1.8h, v3.8h
        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.8h},       [x0], x2
        st1             {v1.8h},      [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_16x16_8_neon, export=1
        mov              x3,  #16
        add             x12, x0, x2
        add              x2,  x2, x2
1:      subs             x3,  x3, #2
        ld1             {v16.16b},     [x0]
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v19.16b},    [x12]
        uxtl            v17.8h, v16.8b
        uxtl2           v18.8h, v16.16b
        uxtl            v20.8h, v19.8b
        uxtl2           v21.8h, v19.16b
        sqadd            v0.8h,  v0.8h, v17.8h
        sqadd            v1.8h,  v1.8h, v18.8h
        sqadd            v2.8h,  v2.8h, v20.8h
        sqadd            v3.8h,  v3.8h, v21.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        st1             {v0.16b},     [x0], x2
        st1             {v1.16b},    [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_16x16_10_neon, export=1
        mov              x3,  #16
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
        add             x12,  x0, x2
        add              x2,  x2, x2
1:      subs             x3,  x3, #2
        ld1             {v16.8h-v17.8h}, [x0]
        ld1             {v0.8h-v3.8h},  [x1], #64
        sqadd            v0.8h, v0.8h, v16.8h
        ld1             {v18.8h-v19.8h}, [x12]
        sqadd            v1.8h, v1.8h, v17.8h
        sqadd            v2.8h, v2.8h, v18.8h
        sqadd            v3.8h, v3.8h, v19.8h
        clip10           v0.8h, v1.8h, v20.8h, v21.8h
        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v1.8h},   [x0], x2
        st1             {v2.8h-v3.8h},  [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_32x32_8_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #32
1:      subs             x3,  x3, #2
        ld1             {v20.16b, v21.16b}, [x0]
        uxtl            v16.8h,  v20.8b
        uxtl2           v17.8h,  v20.16b
        ld1             {v22.16b, v23.16b}, [x12]
        uxtl            v18.8h,  v21.8b
        uxtl2           v19.8h,  v21.16b
        uxtl            v20.8h,  v22.8b
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v4.8h-v7.8h}, [x1], #64
        uxtl2           v21.8h,  v22.16b
        uxtl            v22.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        sqadd            v0.8h,  v0.8h,  v16.8h
        sqadd            v1.8h,  v1.8h,  v17.8h
        sqadd            v2.8h,  v2.8h,  v18.8h
        sqadd            v3.8h,  v3.8h,  v19.8h
        sqadd            v4.8h,  v4.8h,  v20.8h
        sqadd            v5.8h,  v5.8h,  v21.8h
        sqadd            v6.8h,  v6.8h,  v22.8h
        sqadd            v7.8h,  v7.8h,  v23.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        sqxtun           v2.8b,  v4.8h
        sqxtun2         v2.16b,  v5.8h
        st1             {v0.16b, v1.16b},  [x0], x2
        sqxtun           v3.8b,  v6.8h
        sqxtun2         v3.16b,  v7.8h
        st1             {v2.16b, v3.16b}, [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_32x32_10_neon, export=1
        mov              x3,  #32
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs             x3,  x3, #1
        ld1             {v0.8h-v3.8h},   [x1], #64
        ld1             {v16.8h-v19.8h}, [x0]
        sqadd            v0.8h, v0.8h, v16.8h
        sqadd            v1.8h, v1.8h, v17.8h
        sqadd            v2.8h, v2.8h, v18.8h
        sqadd            v3.8h, v3.8h, v19.8h
        clip10           v0.8h, v1.8h, v20.8h, v21.8h
        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v3.8h},   [x0], x2
        bne              1b
        ret
endfunc

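// multiply-accumulate helper: \out += \in * \c when \op is "+",
// \out -= \in * \c otherwise; \p selects the smlal2/smlsl2 variants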
.macro sum_sub out, in, c, op, p
  .ifc \op, +
        smlal\p         \out, \in, \c
  .else
        smlsl\p         \out, \in, \c
  .endif
.endm

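// saturating rounding narrow of \n from 32 to 16 bit: the result goes into
// the upper half of \d when \dt is .8h, otherwise into its lower 64 bits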
.macro fixsqrshrn d, dt, n, m
  .ifc \dt, .8h
        sqrshrn2        \d\dt, \n\().4s, \m
  .else
        sqrshrn         \n\().4h, \n\().4s, \m
        mov             \d\().d[0], \n\().d[0]
  .endif
.endm

// uses and clobbers v28-v31 as temp registers
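// partial 4-point transform: even part e0/e1 from \in0/\in2, odd part o0/o1
// from \in1/\in3, followed by the output butterfly (no shift is applied)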
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
         sshll\p1       v28.4s, \in0, #6
         mov            v29.16b, v28.16b
         smull\p1       v30.4s, \in1, v0.h[1]
         smull\p1       v31.4s, \in1, v0.h[3]
         smlal\p2       v28.4s, \in2, v0.h[0] //e0
         smlsl\p2       v29.4s, \in2, v0.h[0] //e1
         smlal\p2       v30.4s, \in3, v0.h[3] //o0
         smlsl\p2       v31.4s, \in3, v0.h[1] //o1

         add            \out0, v28.4s, v30.4s
         add            \out1, v29.4s, v31.4s
         sub            \out2, v29.4s, v31.4s
         sub            \out3, v28.4s, v30.4s
.endm

.macro transpose8_4x4 r0, r1, r2, r3
        trn1            v2.8h, \r0\().8h, \r1\().8h
        trn2            v3.8h, \r0\().8h, \r1\().8h
        trn1            v4.8h, \r2\().8h, \r3\().8h
        trn2            v5.8h, \r2\().8h, \r3\().8h
        trn1            \r0\().4s, v2.4s, v4.4s
        trn2            \r2\().4s, v2.4s, v4.4s
        trn1            \r1\().4s, v3.4s, v5.4s
        trn2            \r3\().4s, v3.4s, v5.4s
.endm

.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7
.endm

.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2
        tr_4x4_8        \in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4s, v25.4s, v26.4s, v27.4s, \p1, \p2

        smull\p1        v30.4s, \in1\in1t, v0.h[6]
        smull\p1        v28.4s, \in1\in1t, v0.h[4]
        smull\p1        v29.4s, \in1\in1t, v0.h[5]
        sum_sub         v30.4s, \in3\in3t, v0.h[4], -, \p1
        sum_sub         v28.4s, \in3\in3t, v0.h[5], +, \p1
        sum_sub         v29.4s, \in3\in3t, v0.h[7], -, \p1

        sum_sub         v30.4s, \in5\in5t, v0.h[7], +, \p2
        sum_sub         v28.4s, \in5\in5t, v0.h[6], +, \p2
        sum_sub         v29.4s, \in5\in5t, v0.h[4], -, \p2

        sum_sub         v30.4s, \in7\in7t, v0.h[5], +, \p2
        sum_sub         v28.4s, \in7\in7t, v0.h[7], +, \p2
        sum_sub         v29.4s, \in7\in7t, v0.h[6], -, \p2

        add             v31.4s, v26.4s, v30.4s
        sub             v26.4s, v26.4s, v30.4s
        fixsqrshrn      \in2,\in2t, v31, \shift


        smull\p1        v31.4s, \in1\in1t, v0.h[7]
        sum_sub         v31.4s, \in3\in3t, v0.h[6], -, \p1
        sum_sub         v31.4s, \in5\in5t, v0.h[5], +, \p2
        sum_sub         v31.4s, \in7\in7t, v0.h[4], -, \p2
        fixsqrshrn      \in5,\in5t, v26, \shift


        add             v26.4s, v24.4s, v28.4s
        sub             v24.4s, v24.4s, v28.4s
        add             v28.4s, v25.4s, v29.4s
        sub             v25.4s, v25.4s, v29.4s
        add             v30.4s, v27.4s, v31.4s
        sub             v27.4s, v27.4s, v31.4s

        fixsqrshrn      \in0,\in0t, v26, \shift
        fixsqrshrn      \in7,\in7t, v24, \shift
        fixsqrshrn      \in1,\in1t, v28, \shift
        fixsqrshrn      \in6,\in6t, v25, \shift
        fixsqrshrn      \in3,\in3t, v30, \shift
        fixsqrshrn      \in4,\in4t, v27, \shift
.endm

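// in-place 8x8 IDCT on the coefficient block in x0: a first pass of tr_8x4
// with shift 7, a transpose, a second pass with shift 20 - \bitdepth, and a
// final transpose back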
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov              x1,  x0
        ld1             {v16.8h-v19.8h}, [x1], #64
        ld1             {v20.8h-v23.8h}, [x1]

        movrel           x1, trans
        ld1             {v0.8h}, [x1]

        tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
        tr_8x4          7, v16,.8h, v17,.8h, v18,.8h, v19,.8h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, 2, 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        tr_8x4          20 - \bitdepth, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v16,.8h, v17,.8h, v18,.8h, v19,.8h, , 2
        tr_8x4          20 - \bitdepth, v20,.4h, v21,.4h, v22,.4h, v23,.4h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, , 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        mov              x1,  x0
        st1             {v16.8h-v19.8h}, [x1], #64
        st1             {v20.8h-v23.8h}, [x1]

        ret
endfunc
.endm

.macro butterfly e, o, tmp_p, tmp_m
        add        \tmp_p, \e, \o
        sub        \tmp_m, \e, \o
.endm

.macro tr16_8x4 in0, in1, in2, in3, offset
        tr_4x4_8        \in0\().4h, \in1\().4h, \in2\().4h, \in3\().4h, v24.4s, v25.4s, v26.4s, v27.4s

        smull2          v28.4s, \in0\().8h, v0.h[4]
        smull2          v29.4s, \in0\().8h, v0.h[5]
        smull2          v30.4s, \in0\().8h, v0.h[6]
        smull2          v31.4s, \in0\().8h, v0.h[7]
        sum_sub         v28.4s, \in1\().8h, v0.h[5], +, 2
        sum_sub         v29.4s, \in1\().8h, v0.h[7], -, 2
        sum_sub         v30.4s, \in1\().8h, v0.h[4], -, 2
        sum_sub         v31.4s, \in1\().8h, v0.h[6], -, 2

        sum_sub         v28.4s, \in2\().8h, v0.h[6], +, 2
        sum_sub         v29.4s, \in2\().8h, v0.h[4], -, 2
        sum_sub         v30.4s, \in2\().8h, v0.h[7], +, 2
        sum_sub         v31.4s, \in2\().8h, v0.h[5], +, 2

        sum_sub         v28.4s, \in3\().8h, v0.h[7], +, 2
        sum_sub         v29.4s, \in3\().8h, v0.h[6], -, 2
        sum_sub         v30.4s, \in3\().8h, v0.h[5], +, 2
        sum_sub         v31.4s, \in3\().8h, v0.h[4], -, 2

        butterfly       v24.4s, v28.4s, v16.4s, v23.4s
        butterfly       v25.4s, v29.4s, v17.4s, v22.4s
        butterfly       v26.4s, v30.4s, v18.4s, v21.4s
        butterfly       v27.4s, v31.4s, v19.4s, v20.4s
        add              x4,  sp,  #\offset
        st1             {v16.4s-v19.4s}, [x4], #64
        st1             {v20.4s-v23.4s}, [x4]
.endm

.macro load16 in0, in1, in2, in3
        ld1             {\in0}[0], [x1], x2
        ld1             {\in0}[1], [x3], x2
        ld1             {\in1}[0], [x1], x2
        ld1             {\in1}[1], [x3], x2
        ld1             {\in2}[0], [x1], x2
        ld1             {\in2}[1], [x3], x2
        ld1             {\in3}[0], [x1], x2
        ld1             {\in3}[1], [x3], x2
.endm

.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
        sum_sub v21.4s, \in, \t0, \op0, \p
        sum_sub v22.4s, \in, \t1, \op1, \p
        sum_sub v23.4s, \in, \t2, \op2, \p
        sum_sub v24.4s, \in, \t3, \op3, \p
        sum_sub v25.4s, \in, \t4, \op4, \p
        sum_sub v26.4s, \in, \t5, \op5, \p
        sum_sub v27.4s, \in, \t6, \op6, \p
        sum_sub v28.4s, \in, \t7, \op7, \p
.endm

.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
        add             v20.4s, \in0, \in1
        sub             \in0, \in0, \in1
        add             \in1, \in2, \in3
        sub             \in2, \in2, \in3
        add             \in3, \in4, \in5
        sub             \in4, \in4, \in5
        add             \in5, \in6, \in7
        sub             \in6, \in6, \in7
.endm

.macro store16 in0, in1, in2, in3, rx
        st1             {\in0}[0], [x1], x2
        st1             {\in0}[1], [x3], \rx
        st1             {\in1}[0], [x1], x2
        st1             {\in1}[1], [x3], \rx
        st1             {\in2}[0], [x1], x2
        st1             {\in2}[1], [x3], \rx
        st1             {\in3}[0], [x1], x2
        st1             {\in3}[1], [x3], \rx
.endm

.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
        sqrshrn         \out0\().4h, \in0, \shift
        sqrshrn2        \out0\().8h, \in1, \shift
        sqrshrn         \out1\().4h, \in2, \shift
        sqrshrn2        \out1\().8h, \in3, \shift
        sqrshrn         \out2\().4h, \in4, \shift
        sqrshrn2        \out2\().8h, \in5, \shift
        sqrshrn         \out3\().4h, \in6, \shift
        sqrshrn2        \out3\().8h, \in7, \shift
.endm

.macro transpose16_4x4_2 r0, r1, r2, r3
        // lower halves
        trn1            v2.4h, \r0\().4h, \r1\().4h
        trn2            v3.4h, \r0\().4h, \r1\().4h
        trn1            v4.4h, \r2\().4h, \r3\().4h
        trn2            v5.4h, \r2\().4h, \r3\().4h
        trn1            v6.2s, v2.2s, v4.2s
        trn2            v7.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v4.2s, v3.2s, v5.2s
        mov             \r0\().d[0], v6.d[0]
        mov             \r2\().d[0], v7.d[0]
        mov             \r1\().d[0], v2.d[0]
        mov             \r3\().d[0], v4.d[0]

        // upper halves in reverse order
        trn1            v2.8h, \r3\().8h, \r2\().8h
        trn2            v3.8h, \r3\().8h, \r2\().8h
        trn1            v4.8h, \r1\().8h, \r0\().8h
        trn2            v5.8h, \r1\().8h, \r0\().8h
        trn1            v6.4s, v2.4s, v4.4s
        trn2            v7.4s, v2.4s, v4.4s
        trn1            v2.4s, v3.4s, v5.4s
        trn2            v4.4s, v3.4s, v5.4s
        mov             \r3\().d[1], v6.d[1]
        mov             \r1\().d[1], v7.d[1]
        mov             \r2\().d[1], v2.d[1]
        mov             \r0\().d[1], v4.d[1]
.endm

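// one pass of the 16-point transform over 4 columns (source in x5,
// destination in x6); instantiated below with shift 7 for the first pass and
// shift 20 - bitdepth for the per-bitdepth second passes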
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov              x1,  x5
        add              x3,  x5, #(\step * 64)
        mov              x2,  #(\step * 128)
        load16          v16.d, v17.d, v18.d, v19.d
        movrel           x1,  trans
        ld1             {v0.8h}, [x1]

        tr16_8x4        v16, v17, v18, v19, \offset

        add              x1,  x5, #(\step * 32)
        add              x3,  x5, #(\step * 3 *32)
        mov              x2,  #(\step * 128)
        load16          v20.d, v17.d, v18.d, v19.d
        movrel           x1, trans, 16
        ld1             {v1.8h}, [x1]
        smull           v21.4s, v20.4h, v1.h[0]
        smull           v22.4s, v20.4h, v1.h[1]
        smull           v23.4s, v20.4h, v1.h[2]
        smull           v24.4s, v20.4h, v1.h[3]
        smull           v25.4s, v20.4h, v1.h[4]
        smull           v26.4s, v20.4h, v1.h[5]
        smull           v27.4s, v20.4h, v1.h[6]
        smull           v28.4s, v20.4h, v1.h[7]

        add_member      v20.8h, v1.h[1], v1.h[4], v1.h[7], v1.h[5], v1.h[2], v1.h[0], v1.h[3], v1.h[6], +, +, +, -, -, -, -, -, 2
        add_member      v17.4h, v1.h[2], v1.h[7], v1.h[3], v1.h[1], v1.h[6], v1.h[4], v1.h[0], v1.h[5], +, +, -, -, -, +, +, +
        add_member      v17.8h, v1.h[3], v1.h[5], v1.h[1], v1.h[7], v1.h[0], v1.h[6], v1.h[2], v1.h[4], +, -, -, +, +, +, -, -, 2
        add_member      v18.4h, v1.h[4], v1.h[2], v1.h[6], v1.h[0], v1.h[7], v1.h[1], v1.h[5], v1.h[3], +, -, -, +, -, -, +, +
        add_member      v18.8h, v1.h[5], v1.h[0], v1.h[4], v1.h[6], v1.h[1], v1.h[3], v1.h[7], v1.h[2], +, -, +, +, -, +, +, -, 2
        add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2

        add              x4, sp, #\offset
        ld1             {v16.4s-v19.4s}, [x4], #64

        butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
        scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24
        mov              x1,  x6
        add              x3,  x6, #(24 +3*32)
        mov              x2, #32
        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v24.d, x4

        add             x4, sp, #(\offset + 64)
        ld1             {v16.4s-v19.4s}, [x4]
        butterfly16     v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, v28.4s
        scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20

        add              x1,  x6, #8
        add              x3,  x6, #(16 + 3 * 32)
        mov              x2, #32
        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v20.d, x4

        ret
endfunc
.endm

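// 16x16 IDCT: the first pass transforms 4-column strips of the coefficient
// block into a 640-byte temporary buffer on the stack, the second pass
// transforms those strips back into the coefficient buffer in x0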
.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov             x15, x30

        // allocate a temp buffer
        sub              sp,  sp,  #640

.irp i, 0, 1, 2, 3
        add              x5,  x0, #(8 * \i)
        add              x6,  sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
.endr

.irp i, 0, 1, 2, 3
        add              x5,  sp, #(8 * \i)
        add              x6,  x0, #(8 * \i * 16)
        bl              func_tr_16x4_secondpass_\bitdepth
.endr

        add              sp,  sp,  #640

        mov             x30, x15
        ret
endfunc
.endm

idct_8x8 8
idct_8x8 10

tr_16x4 firstpass, 7, 512, 1
tr_16x4 secondpass_8, 20 - 8, 512, 1
tr_16x4 secondpass_10, 20 - 10, 512, 1

idct_16x16 8
idct_16x16 10

// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
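// fills the whole NxN coefficient block with the DC value scaled as
// (dc + 1 + (1 << (14 - bitdepth))) >> (15 - bitdepth)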
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
        movi          v1.8h,  #((1 << (14 - \bitdepth))+1)
        ld1r         {v4.8h}, [x0]
        add           v4.8h,  v4.8h,  v1.8h
        sshr          v0.8h,  v4.8h,  #(15 - \bitdepth)
        sshr          v1.8h,  v4.8h,  #(15 - \bitdepth)
.if \size > 4
        sshr          v2.8h,  v4.8h,  #(15 - \bitdepth)
        sshr          v3.8h,  v4.8h,  #(15 - \bitdepth)
.if \size > 16 /* dc 32x32 */
        mov              x2,  #4
1:
        subs             x2,  x2, #1
.endif
        add             x12,  x0, #64
        mov             x13,  #128
.if \size > 8 /* dc 16x16 */
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
        bne             1b
.endif
.else /* dc 4x4 */
        st1            {v0.8h-v1.8h},  [x0]
.endif
        ret
endfunc
.endm

idct_dc 4, 8
idct_dc 4, 10

idct_dc 8, 8
idct_dc 8, 10

idct_dc 16, 8
idct_dc 16, 10

idct_dc 32, 8
idct_dc 32, 10