/* libavcodec/aarch64/h264dsp_neon.S — FFmpeg AArch64 NEON H.264 DSP routines */
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
25 .macro  h264_loop_filter_start
26         cmp             w2,  #0
27         ldr             w6,  [x4]
28         ccmp            w3,  #0, #0, ne
29         mov             v24.S[0], w6
30         and             w6,  w6,  w6,  lsl #16
31         b.eq            1f
32         ands            w6,  w6,  w6,  lsl #8
33         b.ge            2f
34 1:
35         ret
36 2:
37 .endm
38
39 .macro  h264_loop_filter_luma
40         dup             v22.16B, w2                     // alpha
41         uxtl            v24.8H,  v24.8B
42         uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
43         uxtl            v24.4S,  v24.4H
44         uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
45         sli             v24.8H,  v24.8H,  #8
46         uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
47         sli             v24.4S,  v24.4S,  #16
48         cmhi            v21.16B, v22.16B, v21.16B       // < alpha
49         dup             v22.16B, w3                     // beta
50         cmlt            v23.16B, v24.16B, #0
51         cmhi            v28.16B, v22.16B, v28.16B       // < beta
52         cmhi            v30.16B, v22.16B, v30.16B       // < beta
53         bic             v21.16B, v21.16B, v23.16B
54         uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
55         and             v21.16B, v21.16B, v28.16B
56         uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
57         cmhi            v17.16B, v22.16B, v17.16B       // < beta
58         and             v21.16B, v21.16B, v30.16B
59         cmhi            v19.16B, v22.16B, v19.16B       // < beta
60         and             v17.16B, v17.16B, v21.16B
61         and             v19.16B, v19.16B, v21.16B
62         and             v24.16B, v24.16B, v21.16B
63         urhadd          v28.16B, v16.16B,  v0.16B
64         sub             v21.16B, v24.16B, v17.16B
65         uqadd           v23.16B, v18.16B, v24.16B
66         uhadd           v20.16B, v20.16B, v28.16B
67         sub             v21.16B, v21.16B, v19.16B
68         uhadd           v28.16B,  v4.16B, v28.16B
69         umin            v23.16B, v23.16B, v20.16B
70         uqsub           v22.16B, v18.16B, v24.16B
71         uqadd           v4.16B,   v2.16B, v24.16B
72         umax            v23.16B, v23.16B, v22.16B
73         uqsub           v22.16B,  v2.16B, v24.16B
74         umin            v28.16B,  v4.16B, v28.16B
75         uxtl            v4.8H,    v0.8B
76         umax            v28.16B, v28.16B, v22.16B
77         uxtl2           v20.8H,   v0.16B
78         usubw           v4.8H,    v4.8H,  v16.8B
79         usubw2          v20.8H,  v20.8H,  v16.16B
80         shl             v4.8H,    v4.8H,  #2
81         shl             v20.8H,  v20.8H,  #2
82         uaddw           v4.8H,    v4.8H,  v18.8B
83         uaddw2          v20.8H,  v20.8H,  v18.16B
84         usubw           v4.8H,    v4.8H,   v2.8B
85         usubw2          v20.8H,  v20.8H,   v2.16B
86         rshrn           v4.8B,    v4.8H,  #3
87         rshrn2          v4.16B,  v20.8H,  #3
88         bsl             v17.16B, v23.16B, v18.16B
89         bsl             v19.16B, v28.16B,  v2.16B
90         neg             v23.16B, v21.16B
91         uxtl            v28.8H,  v16.8B
92         smin            v4.16B,   v4.16B, v21.16B
93         uxtl2           v21.8H,  v16.16B
94         smax            v4.16B,   v4.16B, v23.16B
95         uxtl            v22.8H,   v0.8B
96         uxtl2           v24.8H,   v0.16B
97         saddw           v28.8H,  v28.8H,  v4.8B
98         saddw2          v21.8H,  v21.8H,  v4.16B
99         ssubw           v22.8H,  v22.8H,  v4.8B
100         ssubw2          v24.8H,  v24.8H,  v4.16B
101         sqxtun          v16.8B,  v28.8H
102         sqxtun2         v16.16B, v21.8H
103         sqxtun          v0.8B,   v22.8H
104         sqxtun2         v0.16B,  v24.8H
105 .endm
106
107 function ff_h264_v_loop_filter_luma_neon, export=1
108         h264_loop_filter_start
109         sxtw            x1,  w1
110
111         ld1             {v0.16B},  [x0], x1
112         ld1             {v2.16B},  [x0], x1
113         ld1             {v4.16B},  [x0], x1
114         sub             x0,  x0,  x1, lsl #2
115         sub             x0,  x0,  x1, lsl #1
116         ld1             {v20.16B},  [x0], x1
117         ld1             {v18.16B},  [x0], x1
118         ld1             {v16.16B},  [x0], x1
119
120         h264_loop_filter_luma
121
122         sub             x0,  x0,  x1, lsl #1
123         st1             {v17.16B},  [x0], x1
124         st1             {v16.16B}, [x0], x1
125         st1             {v0.16B},  [x0], x1
126         st1             {v19.16B}, [x0]
127
128         ret
129 endfunc
130
131 function ff_h264_h_loop_filter_luma_neon, export=1
132         h264_loop_filter_start
133
134         sub             x0,  x0,  #4
135         ld1             {v6.8B},  [x0], x1
136         ld1             {v20.8B}, [x0], x1
137         ld1             {v18.8B}, [x0], x1
138         ld1             {v16.8B}, [x0], x1
139         ld1             {v0.8B},  [x0], x1
140         ld1             {v2.8B},  [x0], x1
141         ld1             {v4.8B},  [x0], x1
142         ld1             {v26.8B}, [x0], x1
143         ld1             {v6.D}[1],  [x0], x1
144         ld1             {v20.D}[1], [x0], x1
145         ld1             {v18.D}[1], [x0], x1
146         ld1             {v16.D}[1], [x0], x1
147         ld1             {v0.D}[1],  [x0], x1
148         ld1             {v2.D}[1],  [x0], x1
149         ld1             {v4.D}[1],  [x0], x1
150         ld1             {v26.D}[1], [x0], x1
151
152         transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
153
154         h264_loop_filter_luma
155
156         transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
157
158         sub             x0,  x0,  x1, lsl #4
159         add             x0,  x0,  #2
160         st1             {v17.S}[0],  [x0], x1
161         st1             {v16.S}[0], [x0], x1
162         st1             {v0.S}[0],  [x0], x1
163         st1             {v19.S}[0], [x0], x1
164         st1             {v17.S}[1],  [x0], x1
165         st1             {v16.S}[1], [x0], x1
166         st1             {v0.S}[1],  [x0], x1
167         st1             {v19.S}[1], [x0], x1
168         st1             {v17.S}[2],  [x0], x1
169         st1             {v16.S}[2], [x0], x1
170         st1             {v0.S}[2],  [x0], x1
171         st1             {v19.S}[2], [x0], x1
172         st1             {v17.S}[3],  [x0], x1
173         st1             {v16.S}[3], [x0], x1
174         st1             {v0.S}[3],  [x0], x1
175         st1             {v19.S}[3], [x0], x1
176
177         ret
178 endfunc
179
180 .macro  h264_loop_filter_chroma
181         dup             v22.8B, w2              // alpha
182         uxtl            v24.8H, v24.8B
183         uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
184         uxtl            v4.8H,  v0.8B
185         uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
186         usubw           v4.8H,  v4.8H,  v16.8B
187         sli             v24.8H, v24.8H, #8
188         shl             v4.8H,  v4.8H,  #2
189         uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
190         uaddw           v4.8H,  v4.8H,  v18.8B
191         cmhi            v26.8B, v22.8B, v26.8B  // < alpha
192         usubw           v4.8H,  v4.8H,  v2.8B
193         dup             v22.8B, w3              // beta
194         rshrn           v4.8B,  v4.8H,  #3
195         cmhi            v28.8B, v22.8B, v28.8B  // < beta
196         cmhi            v30.8B, v22.8B, v30.8B  // < beta
197         smin            v4.8B,  v4.8B,  v24.8B
198         neg             v25.8B, v24.8B
199         and             v26.8B, v26.8B, v28.8B
200         smax            v4.8B,  v4.8B,  v25.8B
201         and             v26.8B, v26.8B, v30.8B
202         uxtl            v22.8H, v0.8B
203         and             v4.8B,  v4.8B,  v26.8B
204         uxtl            v28.8H, v16.8B
205         saddw           v28.8H, v28.8H, v4.8B
206         ssubw           v22.8H, v22.8H, v4.8B
207         sqxtun          v16.8B, v28.8H
208         sqxtun          v0.8B,  v22.8H
209 .endm
210
211 function ff_h264_v_loop_filter_chroma_neon, export=1
212         h264_loop_filter_start
213
214         sub             x0,  x0,  x1, lsl #1
215         ld1             {v18.8B}, [x0], x1
216         ld1             {v16.8B}, [x0], x1
217         ld1             {v0.8B},  [x0], x1
218         ld1             {v2.8B},  [x0]
219
220         h264_loop_filter_chroma
221
222         sub             x0,  x0,  x1, lsl #1
223         st1             {v16.8B}, [x0], x1
224         st1             {v0.8B},  [x0], x1
225
226         ret
227 endfunc
228
229 function ff_h264_h_loop_filter_chroma_neon, export=1
230         h264_loop_filter_start
231
232         sub             x0,  x0,  #2
233         ld1             {v18.S}[0], [x0], x1
234         ld1             {v16.S}[0], [x0], x1
235         ld1             {v0.S}[0],  [x0], x1
236         ld1             {v2.S}[0],  [x0], x1
237         ld1             {v18.S}[1], [x0], x1
238         ld1             {v16.S}[1], [x0], x1
239         ld1             {v0.S}[1],  [x0], x1
240         ld1             {v2.S}[1],  [x0], x1
241
242         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
243
244         h264_loop_filter_chroma
245
246         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
247
248         sub             x0,  x0,  x1, lsl #3
249         st1             {v18.S}[0], [x0], x1
250         st1             {v16.S}[0], [x0], x1
251         st1             {v0.S}[0],  [x0], x1
252         st1             {v2.S}[0],  [x0], x1
253         st1             {v18.S}[1], [x0], x1
254         st1             {v16.S}[1], [x0], x1
255         st1             {v0.S}[1],  [x0], x1
256         st1             {v2.S}[1],  [x0], x1
257
258         ret
259 endfunc
260
261 .macro  biweight_16     macs, macd
262         dup             v0.16B,  w5
263         dup             v1.16B,  w6
264         mov             v4.16B,  v16.16B
265         mov             v6.16B,  v16.16B
266 1:      subs            w3,  w3,  #2
267         ld1             {v20.16B}, [x0], x2
268         \macd           v4.8H,   v0.8B,  v20.8B
269         \macd\()2       v6.8H,   v0.16B, v20.16B
270         ld1             {v22.16B}, [x1], x2
271         \macs           v4.8H,   v1.8B,  v22.8B
272         \macs\()2       v6.8H,   v1.16B, v22.16B
273         mov             v24.16B, v16.16B
274         ld1             {v28.16B}, [x0], x2
275         mov             v26.16B, v16.16B
276         \macd           v24.8H,  v0.8B,  v28.8B
277         \macd\()2       v26.8H,  v0.16B, v28.16B
278         ld1             {v30.16B}, [x1], x2
279         \macs           v24.8H,  v1.8B,  v30.8B
280         \macs\()2       v26.8H,  v1.16B, v30.16B
281         sshl            v4.8H,   v4.8H,  v18.8H
282         sshl            v6.8H,   v6.8H,  v18.8H
283         sqxtun          v4.8B,   v4.8H
284         sqxtun2         v4.16B,  v6.8H
285         sshl            v24.8H,  v24.8H, v18.8H
286         sshl            v26.8H,  v26.8H, v18.8H
287         sqxtun          v24.8B,  v24.8H
288         sqxtun2         v24.16B, v26.8H
289         mov             v6.16B,  v16.16B
290         st1             {v4.16B},  [x7], x2
291         mov             v4.16B,  v16.16B
292         st1             {v24.16B}, [x7], x2
293         b.ne            1b
294         ret
295 .endm
296
297 .macro  biweight_8      macs, macd
298         dup             v0.8B,  w5
299         dup             v1.8B,  w6
300         mov             v2.16B,  v16.16B
301         mov             v20.16B, v16.16B
302 1:      subs            w3,  w3,  #2
303         ld1             {v4.8B}, [x0], x2
304         \macd           v2.8H,  v0.8B,  v4.8B
305         ld1             {v5.8B}, [x1], x2
306         \macs           v2.8H,  v1.8B,  v5.8B
307         ld1             {v6.8B}, [x0], x2
308         \macd           v20.8H, v0.8B,  v6.8B
309         ld1             {v7.8B}, [x1], x2
310         \macs           v20.8H, v1.8B,  v7.8B
311         sshl            v2.8H,  v2.8H,  v18.8H
312         sqxtun          v2.8B,  v2.8H
313         sshl            v20.8H, v20.8H, v18.8H
314         sqxtun          v4.8B,  v20.8H
315         mov             v20.16B, v16.16B
316         st1             {v2.8B}, [x7], x2
317         mov             v2.16B,  v16.16B
318         st1             {v4.8B}, [x7], x2
319         b.ne            1b
320         ret
321 .endm
322
323 .macro  biweight_4      macs, macd
324         dup             v0.8B,  w5
325         dup             v1.8B,  w6
326         mov             v2.16B, v16.16B
327         mov             v20.16B,v16.16B
328 1:      subs            w3,  w3,  #4
329         ld1             {v4.S}[0], [x0], x2
330         ld1             {v4.S}[1], [x0], x2
331         \macd           v2.8H,  v0.8B,  v4.8B
332         ld1             {v5.S}[0], [x1], x2
333         ld1             {v5.S}[1], [x1], x2
334         \macs           v2.8H,  v1.8B,  v5.8B
335         b.lt            2f
336         ld1             {v6.S}[0], [x0], x2
337         ld1             {v6.S}[1], [x0], x2
338         \macd           v20.8H, v0.8B,  v6.8B
339         ld1             {v7.S}[0], [x1], x2
340         ld1             {v7.S}[1], [x1], x2
341         \macs           v20.8H, v1.8B,  v7.8B
342         sshl            v2.8H,  v2.8H,  v18.8H
343         sqxtun          v2.8B,  v2.8H
344         sshl            v20.8H, v20.8H, v18.8H
345         sqxtun          v4.8B,  v20.8H
346         mov             v20.16B, v16.16B
347         st1             {v2.S}[0], [x7], x2
348         st1             {v2.S}[1], [x7], x2
349         mov             v2.16B,  v16.16B
350         st1             {v4.S}[0], [x7], x2
351         st1             {v4.S}[1], [x7], x2
352         b.ne            1b
353         ret
354 2:      sshl            v2.8H,  v2.8H,  v18.8H
355         sqxtun          v2.8B,  v2.8H
356         st1             {v2.S}[0], [x7], x2
357         st1             {v2.S}[1], [x7], x2
358         ret
359 .endm
360
361 .macro  biweight_func   w
362 function ff_biweight_h264_pixels_\w\()_neon, export=1
363         sxtw            x2,  w2
364         lsr             w8,  w5,  #31
365         add             w7,  w7,  #1
366         eor             w8,  w8,  w6,  lsr #30
367         orr             w7,  w7,  #1
368         dup             v18.8H,   w4
369         lsl             w7,  w7,  w4
370         not             v18.16B,  v18.16B
371         dup             v16.8H,   w7
372         mov             x7,  x0
373         cbz             w8,  10f
374         subs            w8,  w8,  #1
375         b.eq            20f
376         subs            w8,  w8,  #1
377         b.eq            30f
378         b               40f
379 10:     biweight_\w     umlal, umlal
380 20:     neg             w5, w5
381         biweight_\w     umlal, umlsl
382 30:     neg             w5, w5
383         neg             w6, w6
384         biweight_\w     umlsl, umlsl
385 40:     neg             w6, w6
386         biweight_\w     umlsl, umlal
387 endfunc
388 .endm
389
390         biweight_func   16
391         biweight_func   8
392         biweight_func   4
393
394 .macro  weight_16       add
395         dup             v0.16B,  w4
396 1:      subs            w2,  w2,  #2
397         ld1             {v20.16B}, [x0], x1
398         umull           v4.8H,   v0.8B,  v20.8B
399         umull2          v6.8H,   v0.16B, v20.16B
400         ld1             {v28.16B}, [x0], x1
401         umull           v24.8H,  v0.8B,  v28.8B
402         umull2          v26.8H,  v0.16B, v28.16B
403         \add            v4.8H,   v16.8H, v4.8H
404         srshl           v4.8H,   v4.8H,  v18.8H
405         \add            v6.8H,   v16.8H, v6.8H
406         srshl           v6.8H,   v6.8H,  v18.8H
407         sqxtun          v4.8B,   v4.8H
408         sqxtun2         v4.16B,  v6.8H
409         \add            v24.8H,  v16.8H, v24.8H
410         srshl           v24.8H,  v24.8H, v18.8H
411         \add            v26.8H,  v16.8H, v26.8H
412         srshl           v26.8H,  v26.8H, v18.8H
413         sqxtun          v24.8B,  v24.8H
414         sqxtun2         v24.16B, v26.8H
415         st1             {v4.16B},  [x5], x1
416         st1             {v24.16B}, [x5], x1
417         b.ne            1b
418         ret
419 .endm
420
421 .macro  weight_8        add
422         dup             v0.8B,  w4
423 1:      subs            w2,  w2,  #2
424         ld1             {v4.8B}, [x0], x1
425         umull           v2.8H,  v0.8B,  v4.8B
426         ld1             {v6.8B}, [x0], x1
427         umull           v20.8H, v0.8B,  v6.8B
428         \add            v2.8H,  v16.8H,  v2.8H
429         srshl           v2.8H,  v2.8H,  v18.8H
430         sqxtun          v2.8B,  v2.8H
431         \add            v20.8H, v16.8H,  v20.8H
432         srshl           v20.8H, v20.8H, v18.8H
433         sqxtun          v4.8B,  v20.8H
434         st1             {v2.8B}, [x5], x1
435         st1             {v4.8B}, [x5], x1
436         b.ne            1b
437         ret
438 .endm
439
440 .macro  weight_4        add
441         dup             v0.8B,  w4
442 1:      subs            w2,  w2,  #4
443         ld1             {v4.S}[0], [x0], x1
444         ld1             {v4.S}[1], [x0], x1
445         umull           v2.8H,  v0.8B,  v4.8B
446         b.lt            2f
447         ld1             {v6.S}[0], [x0], x1
448         ld1             {v6.S}[1], [x0], x1
449         umull           v20.8H, v0.8B,  v6.8B
450         \add            v2.8H,  v16.8H,  v2.8H
451         srshl           v2.8H,  v2.8H,  v18.8H
452         sqxtun          v2.8B,  v2.8H
453         \add            v20.8H, v16.8H,  v20.8H
454         srshl           v20.8H, v20.8h, v18.8H
455         sqxtun          v4.8B,  v20.8H
456         st1             {v2.S}[0], [x5], x1
457         st1             {v2.S}[1], [x5], x1
458         st1             {v4.S}[0], [x5], x1
459         st1             {v4.S}[1], [x5], x1
460         b.ne            1b
461         ret
462 2:      \add            v2.8H,  v16.8H,  v2.8H
463         srshl           v2.8H,  v2.8H,  v18.8H
464         sqxtun          v2.8B,  v2.8H
465         st1             {v2.S}[0], [x5], x1
466         st1             {v2.S}[1], [x5], x1
467         ret
468 .endm
469
470 .macro  weight_func     w
471 function ff_weight_h264_pixels_\w\()_neon, export=1
472         sxtw            x1,  w1
473         cmp             w3,  #1
474         mov             w6,  #1
475         lsl             w5,  w5,  w3
476         dup             v16.8H,  w5
477         mov             x5,  x0
478         b.le            20f
479         sub             w6,  w6,  w3
480         dup             v18.8H,  w6
481         cmp             w4, #0
482         b.lt            10f
483         weight_\w       shadd
484 10:     neg             w4,  w4
485         weight_\w       shsub
486 20:     neg             w6,  w3
487         dup             v18.8H,  w6
488         cmp             w4,  #0
489         b.lt            10f
490         weight_\w       add
491 10:     neg             w4,  w4
492         weight_\w       sub
493 endfunc
494 .endm
495
496         weight_func     16
497         weight_func     8
498         weight_func     4