/* common/aarch64/quant-a.S — x264 AArch64 quantization and level-run */
1 /****************************************************************************
2  * quant.S: arm quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2009-2015 x264 project
5  *
6  * Authors: David Conrad <lessen42@gmail.com>
7  *          Janne Grunau <janne-x264@jannau.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at licensing@x264.com.
25  *****************************************************************************/
26
27 #include "asm.S"
28
29 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
30     add         v18.8h, v18.8h, \bias0
31     add         v19.8h, v19.8h, \bias1
32     umull       v20.4s, v18.4h, \mf0_1\().4h
33     umull2      v21.4s, v18.8h, \mf0_1\().8h
34     umull       v22.4s, v19.4h, \mf2_3\().4h
35     umull2      v23.4s, v19.8h, \mf2_3\().8h
36     sshr        v16.8h, v16.8h, #15
37     sshr        v17.8h, v17.8h, #15
38     shrn        v18.4h, v20.4s, #16
39     shrn2       v18.8h, v21.4s, #16
40     shrn        v19.4h, v22.4s, #16
41     shrn2       v19.8h, v23.4s, #16
42     eor         v18.16b, v18.16b, v16.16b
43     eor         v19.16b, v19.16b, v17.16b
44     sub         v18.8h, v18.8h, v16.8h
45     sub         v19.8h, v19.8h, v17.8h
46     orr         \mask,  v18.16b, v19.16b
47     st1        {v18.8h,v19.8h}, [x0], #32
48 .endm
49
50 .macro QUANT_END d
51     fmov        x2,  \d
52     mov         w0,  #0
53     tst         x2,  x2
54     cinc        w0,  w0,  ne
55     ret
56 .endm
57
58 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
59 function x264_quant_2x2_dc_neon, export=1
60     ld1        {v0.4h}, [x0]
61     dup         v2.4h,  w2
62     dup         v1.4h,  w1
63     abs         v3.4h,  v0.4h
64     add         v3.4h,  v3.4h,  v2.4h
65     umull       v3.4s,  v3.4h,  v1.4h
66     sshr        v0.4h,  v0.4h,  #15
67     shrn        v3.4h,  v3.4s,  #16
68     eor         v3.8b,  v3.8b,  v0.8b
69     sub         v3.4h,  v3.4h,  v0.4h
70     st1        {v3.4h}, [x0]
71     QUANT_END   d3
72 endfunc
73
74 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
75 function x264_quant_4x4_dc_neon, export=1
76     ld1        {v16.8h,v17.8h}, [x0]
77     abs         v18.8h,  v16.8h
78     abs         v19.8h,  v17.8h
79     dup         v0.8h,  w2
80     dup         v2.8h,  w1
81     QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
82     uqxtn       v0.8b,  v0.8h
83     QUANT_END   d0
84 endfunc
85
86 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
87 function x264_quant_4x4_neon, export=1
88     ld1        {v16.8h,v17.8h}, [x0]
89     abs         v18.8h,  v16.8h
90     abs         v19.8h,  v17.8h
91     ld1        {v0.8h,v1.8h}, [x2]
92     ld1        {v2.8h,v3.8h}, [x1]
93     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
94     uqxtn       v0.8b,  v0.8h
95     QUANT_END   d0
96 endfunc
97
98 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
99 function x264_quant_4x4x4_neon, export=1
100     ld1        {v16.8h,v17.8h}, [x0]
101     abs         v18.8h, v16.8h
102     abs         v19.8h, v17.8h
103     ld1        {v0.8h,v1.8h}, [x2]
104     ld1        {v2.8h,v3.8h}, [x1]
105     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
106     ld1        {v16.8h,v17.8h}, [x0]
107     abs         v18.8h, v16.8h
108     abs         v19.8h, v17.8h
109     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
110     ld1        {v16.8h,v17.8h}, [x0]
111     abs         v18.8h, v16.8h
112     abs         v19.8h, v17.8h
113     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b
114     ld1        {v16.8h,v17.8h}, [x0]
115     abs         v18.8h, v16.8h
116     abs         v19.8h, v17.8h
117     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b
118     uqxtn       v4.8b,  v4.8h
119     uqxtn       v7.8b,  v7.8h
120     uqxtn       v6.8b,  v6.8h
121     uqxtn       v5.8b,  v5.8h
122     fmov        x7,  d7
123     fmov        x6,  d6
124     fmov        x5,  d5
125     fmov        x4,  d4
126     mov         w0,  #0
127     tst         x7,  x7
128     cinc        w0,  w0,  ne
129     lsl         w0,  w0,  #1
130     tst         x6,  x6
131     cinc        w0,  w0,  ne
132     lsl         w0,  w0,  #1
133     tst         x5,  x5
134     cinc        w0,  w0,  ne
135     lsl         w0,  w0,  #1
136     tst         x4,  x4
137     cinc        w0,  w0,  ne
138     ret
139 endfunc
140
141 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
142 function x264_quant_8x8_neon, export=1
143     ld1        {v16.8h,v17.8h}, [x0]
144     abs         v18.8h, v16.8h
145     abs         v19.8h, v17.8h
146     ld1        {v0.8h,v1.8h}, [x2], #32
147     ld1        {v2.8h,v3.8h}, [x1], #32
148     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
149 .rept 3
150     ld1        {v16.8h,v17.8h}, [x0]
151     abs         v18.8h, v16.8h
152     abs         v19.8h, v17.8h
153     ld1        {v0.8h,v1.8h}, [x2], #32
154     ld1        {v2.8h,v3.8h}, [x1], #32
155     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
156     orr         v4.16b, v4.16b, v5.16b
157 .endr
158     uqxtn       v0.8b,  v4.8h
159     QUANT_END   d0
160 endfunc
161
162 .macro DEQUANT_START mf_size offset dc=no
163     mov         w3,  #0x2b
164     mul         w3,  w3,  w2
165     lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
166     add         w5,  w3,  w3,  lsl #1
167     sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
168     lsl         w2,  w2,  #\mf_size
169 .ifc \dc,no
170     add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
171 .else
172     ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
173 .endif
174     subs        w3,  w3,  #\offset      // 6 for 8x8
175 .endm
176
177 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
178 .macro DEQUANT size bits
179 function x264_dequant_\size\()_neon, export=1
180     DEQUANT_START \bits+2, \bits
181 .ifc \size, 8x8
182     mov         w2,  #4
183 .endif
184     b.lt        dequant_\size\()_rshift
185
186     dup         v31.8h, w3
187 dequant_\size\()_lshift_loop:
188 .ifc \size, 8x8
189     subs        w2,  w2,  #1
190 .endif
191     ld1        {v16.4s}, [x1], #16
192     ld1        {v17.4s}, [x1], #16
193     sqxtn       v2.4h,  v16.4s
194     ld1        {v18.4s}, [x1], #16
195     sqxtn2      v2.8h,  v17.4s
196     ld1        {v19.4s}, [x1], #16
197     sqxtn       v3.4h,  v18.4s
198     ld1        {v0.8h,v1.8h}, [x0]
199     sqxtn2      v3.8h,  v19.4s
200     mul         v0.8h,  v0.8h,  v2.8h
201     mul         v1.8h,  v1.8h,  v3.8h
202     sshl        v0.8h,  v0.8h,  v31.8h
203     sshl        v1.8h,  v1.8h,  v31.8h
204     st1        {v0.8h,v1.8h}, [x0], #32
205 .ifc \size, 8x8
206     b.gt        dequant_\size\()_lshift_loop
207 .endif
208     ret
209
210 dequant_\size\()_rshift:
211     dup         v31.4s, w3
212     neg         w3,  w3
213     mov         w5,  #1
214     sub         w3,  w3,  #1
215     lsl         w5,  w5,  w3
216
217 .ifc \size, 8x8
218 dequant_\size\()_rshift_loop:
219     subs        w2,  w2,  #1
220 .endif
221     ld1        {v16.4s}, [x1], #16
222     ld1        {v17.4s}, [x1], #16
223     sqxtn       v2.4h,  v16.4s
224     ld1        {v18.4s}, [x1], #16
225     dup         v16.4s, w5
226     sqxtn2      v2.8h,  v17.4s
227     ld1        {v19.4s}, [x1], #16
228     dup         v17.4s, w5
229     sqxtn       v3.4h,  v18.4s
230     ld1        {v0.8h,v1.8h}, [x0]
231     dup         v18.4s, w5
232     sqxtn2      v3.8h,  v19.4s
233     dup         v19.4s, w5
234
235     smlal       v16.4s, v0.4h,  v2.4h
236     smlal2      v17.4s, v0.8h,  v2.8h
237     smlal       v18.4s, v1.4h,  v3.4h
238     smlal2      v19.4s, v1.8h,  v3.8h
239     sshl        v16.4s, v16.4s, v31.4s
240     sshl        v17.4s, v17.4s, v31.4s
241     sshl        v18.4s, v18.4s, v31.4s
242     sshl        v19.4s, v19.4s, v31.4s
243
244     sqxtn       v0.4h,  v16.4s
245     sqxtn2      v0.8h,  v17.4s
246     sqxtn       v1.4h,  v18.4s
247     sqxtn2      v1.8h,  v19.4s
248     st1        {v0.8h,v1.8h}, [x0], #32
249 .ifc \size, 8x8
250     b.gt        dequant_\size\()_rshift_loop
251 .endif
252     ret
253 endfunc
254 .endm
255
256 DEQUANT 4x4, 4
257 DEQUANT 8x8, 6
258
259 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
260 function x264_dequant_4x4_dc_neon, export=1
261     DEQUANT_START 6, 6, yes
262     b.lt        dequant_4x4_dc_rshift
263
264     lsl         w1,  w1,  w3
265     dup         v2.8h,  w1
266     ld1        {v0.8h,v1.8h},   [x0]
267
268     mul         v0.8h,  v0.8h,  v2.8h
269     mul         v1.8h,  v1.8h,  v2.8h
270     st1        {v0.8h,v1.8h},   [x0]
271     ret
272
273 dequant_4x4_dc_rshift:
274     dup         v4.8h,  w1
275     dup         v3.4s, w3
276     neg         w3,  w3
277     mov         w5,  #1
278     sub         w3,  w3,  #1
279     lsl         w5,  w5,  w3
280
281     dup         v16.4s, w5
282     dup         v17.4s, w5
283     ld1        {v0.8h,v1.8h}, [x0]
284     dup         v18.4s, w5
285     dup         v19.4s, w5
286
287     smlal       v16.4s, v0.4h,  v4.4h
288     smlal2      v17.4s, v0.8h,  v4.8h
289     smlal       v18.4s, v1.4h,  v4.4h
290     smlal2      v19.4s, v1.8h,  v4.8h
291     sshl        v16.4s, v16.4s, v3.4s
292     sshl        v17.4s, v17.4s, v3.4s
293     sshl        v18.4s, v18.4s, v3.4s
294     sshl        v19.4s, v19.4s, v3.4s
295
296     sqxtn       v0.4h,  v16.4s
297     sqxtn2      v0.8h,  v17.4s
298     sqxtn       v1.4h,  v18.4s
299     sqxtn2      v1.8h,  v19.4s
300     st1        {v0.8h,v1.8h}, [x0]
301     ret
302 endfunc
303
304 .macro decimate_score_1x size
305 function x264_decimate_score\size\()_neon, export=1
306     ld1        {v0.8h,v1.8h}, [x0]
307     movrel      x5,  X(x264_decimate_table4)
308     movi        v3.16b, #0x01
309     sqxtn       v0.8b,  v0.8h
310     sqxtn2      v0.16b, v1.8h
311     abs         v2.16b, v0.16b
312     cmeq        v1.16b, v0.16b, #0
313     cmhi        v2.16b, v2.16b, v3.16b
314     shrn        v1.8b,  v1.8h,  #4
315     shrn        v2.8b,  v2.8h,  #4
316     fmov        x2,  d2
317     fmov        x1,  d1
318     cbnz        x2,  9f
319     mvn         x1,  x1
320     mov         w0,  #0
321     cbz         x1,  0f
322 .ifc \size, 15
323     lsr         x1,  x1,  #1
324 .endif
325     rbit        x1,  x1
326 1:
327     clz         x3,  x1
328     lsr         x6,  x3,  #2
329     lsl         x1,  x1,  x3
330     ldrb        w7,  [x5, x6]
331     lsl         x1,  x1,  #4
332     add         w0,  w0,  w7
333     cbnz        x1,  1b
334     ret
335 9:
336     mov         w0,  #9
337 0:
338     ret
339 endfunc
340 .endm
341
342 decimate_score_1x 15
343 decimate_score_1x 16
344
345 const mask64, align=6
346     .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
347     .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
348 endconst
349
350 function x264_decimate_score64_neon, export=1
351     ld1        {v0.8h,v1.8h}, [x0], #32
352     ld1        {v2.8h,v3.8h}, [x0], #32
353     ld1        {v4.8h,v5.8h}, [x0], #32
354     ld1        {v6.8h,v7.8h}, [x0]
355     movrel      x6,  mask64
356     movi        v31.16b, #0x01
357     sqxtn       v16.8b,  v1.8h
358     sqxtn2      v16.16b, v0.8h
359     sqxtn       v17.8b,  v3.8h
360     sqxtn2      v17.16b, v2.8h
361     sqxtn       v18.8b,  v5.8h
362     sqxtn2      v18.16b, v4.8h
363     sqxtn       v19.8b,  v7.8h
364     sqxtn2      v19.16b, v6.8h
365     abs         v4.16b, v16.16b
366     abs         v5.16b, v17.16b
367     abs         v6.16b, v18.16b
368     abs         v7.16b, v19.16b
369     ld1        {v30.16b}, [x6]
370     cmeq        v0.16b, v16.16b, #0
371     cmeq        v1.16b, v17.16b, #0
372     cmeq        v2.16b, v18.16b, #0
373     cmeq        v3.16b, v19.16b, #0
374     umax        v4.16b, v4.16b, v5.16b
375     umax        v6.16b, v6.16b, v7.16b
376     and         v0.16b, v0.16b, v30.16b
377     and         v1.16b, v1.16b, v30.16b
378     and         v2.16b, v2.16b, v30.16b
379     and         v3.16b, v3.16b, v30.16b
380     umax        v4.16b, v4.16b, v6.16b
381     addp        v0.16b, v1.16b, v0.16b
382     addp        v2.16b, v3.16b, v2.16b
383     cmhi        v4.16b, v4.16b, v31.16b
384     addp        v0.16b, v2.16b, v0.16b
385     shrn        v4.8b,  v4.8h,  #4
386     addp        v0.16b, v0.16b, v0.16b
387     fmov        x2,  d4
388     fmov        x1,  d0
389     cbnz        x2,  9f
390     mvn         x1,  x1
391     mov         w0,  #0
392     cbz         x1,  0f
393     movrel      x5,  X(x264_decimate_table8)
394 1:
395     clz         x3,  x1
396     lsl         x1,  x1,  x3
397     ldrb        w7,  [x5, x3]
398     lsl         x1,  x1,  #1
399     add         w0,  w0,  w7
400     cbnz        x1,  1b
401     ret
402 9:
403     mov         w0,  #9
404 0:
405     ret
406 endfunc
407
408 // int coeff_last( int16_t *l )
409 function x264_coeff_last4_aarch64, export=1
410     ldr         x2,  [x0]
411     mov         w4,  #3
412     clz         x0,  x2
413     sub         w0,  w4,  w0, lsr #4
414     ret
415 endfunc
416
417 function x264_coeff_last8_aarch64, export=1
418     ldr         x3,  [x0, #8]
419     mov         w4,  #7
420     clz         x2,  x3
421     cmp         w2,  #64
422     b.ne        1f
423     ldr         x3,  [x0]
424     sub         w4,  w4,  #4
425     clz         x2,  x3
426 1:
427     sub         w0,  w4,  w2, lsr #4
428     ret
429 endfunc
430
431 .macro COEFF_LAST_1x size
432 function x264_coeff_last\size\()_neon, export=1
433 .if \size == 15
434     sub         x0,  x0,  #2
435 .endif
436     ld1        {v0.8h,v1.8h}, [x0]
437     uqxtn       v0.8b,  v0.8h
438     uqxtn2      v0.16b, v1.8h
439     cmtst       v0.16b, v0.16b, v0.16b
440     shrn        v0.8b,  v0.8h,  #4
441     fmov        x1,  d0
442     mov         w3,  #\size - 1
443     clz         x2,  x1
444     sub         w0,  w3,  w2, lsr #2
445     ret
446 endfunc
447 .endm
448
449 COEFF_LAST_1x 15
450 COEFF_LAST_1x 16
451
452 function x264_coeff_last64_neon, export=1
453     ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
454     movi        v31.8h,  #8
455     movi        v30.8h,  #1
456     uqxtn       v0.8b,  v0.8h
457     uqxtn2      v0.16b, v1.8h
458     ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
459     uqxtn       v1.8b,  v2.8h
460     uqxtn2      v1.16b, v3.8h
461     uqxtn       v2.8b,  v4.8h
462     uqxtn2      v2.16b, v5.8h
463     uqxtn       v3.8b,  v6.8h
464     uqxtn2      v3.16b, v7.8h
465
466     cmtst       v0.16b, v0.16b, v0.16b
467     cmtst       v1.16b, v1.16b, v1.16b
468     cmtst       v2.16b, v2.16b, v2.16b
469     cmtst       v3.16b, v3.16b, v3.16b
470
471     shrn        v0.8b,  v0.8h,  #4
472     shrn2       v0.16b, v1.8h,  #4
473     shrn        v1.8b,  v2.8h,  #4
474     shrn2       v1.16b, v3.8h,  #4
475
476     clz         v0.4s,  v0.4s
477     clz         v1.4s,  v1.4s
478
479     shrn        v0.4h,  v0.4s,  #2
480     shrn2       v0.8h,  v1.4s,  #2
481
482     sub         v0.8h,  v31.8h,  v0.8h
483     sshl        v0.8h,  v30.8h,  v0.8h
484     shrn        v0.8b,  v0.8h,  #1
485
486     fmov        x2,  d0
487     mov         w3,  #63
488     clz         x2,  x2
489     sub         w0,  w3,  w2
490     ret
491 endfunc
492
493 .macro coeff_level_run_start size
494     add         x6,  x1,  #23            // runlevel->mask
495     mov         w7,  #0
496     mov         w8,  #0
497     mov         w9,  #1
498     and         x6,  x6,  #~15
499     mov         w4,  #\size - 1
500 .endm
501
502 .macro coeff_level_run shift
503     clz         x3,  x2
504     subs        w4,  w4,  w3, lsr #\shift
505     str         w4,  [x1], #4
506 1:
507     ldrh        w5,  [x0, x4, lsl #1]
508     strh        w5,  [x6], #2
509     add         w7,  w7,  #1
510     lsl         w10, w9, w4
511     orr         w8,  w8,  w10
512     b.le        2f
513     add         w3,  w3,  #1 << \shift
514     sub         w4,  w4,  #1
515     and         x3,  x3,  #~((1 << \shift) - 1)
516     lsl         x2,  x2,  x3
517     clz         x3,  x2
518     subs        w4,  w4,  w3, lsr #\shift
519     b.ge        1b
520 2:
521     str         w8,  [x1]
522     mov         w0,  w7
523 .endm
524
525 function x264_coeff_level_run4_aarch64, export=1
526     ldr         x2,  [x0]
527
528     coeff_level_run_start 4
529
530     coeff_level_run 4
531
532     ret
533 endfunc
534
535 .macro X264_COEFF_LEVEL_RUN size
536 function x264_coeff_level_run\size\()_neon, export=1
537 .if \size == 15
538     sub         x0,  x0,  #2
539 .endif
540 .if         \size < 15
541     ld1         {v0.8h}, [x0]
542     uqxtn       v0.8b,  v0.8h
543     cmtst       v0.8b,  v0.8b,  v0.8b
544 .else
545     ld1         {v0.8h,v1.8h}, [x0]
546     uqxtn       v0.8b,  v0.8h
547     uqxtn2      v0.16b, v1.8h
548     cmtst       v0.16b, v0.16b, v0.16b
549     shrn        v0.8b,  v0.8h,  #4
550 .endif
551     fmov        x2,  d0
552 .if \size == 15
553     add         x0,  x0,  #2
554 .endif
555
556     coeff_level_run_start \size
557
558     coeff_level_run (4 - (\size + 1) / 8)
559
560     ret
561 endfunc
562 .endm
563
564 X264_COEFF_LEVEL_RUN 8
565 X264_COEFF_LEVEL_RUN 15
566 X264_COEFF_LEVEL_RUN 16
567
568 function x264_denoise_dct_neon, export=1
569 1:  subs        w3,  w3,  #16
570     ld1         {v0.8h,v1.8h}, [x0]
571     ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
572     abs         v16.8h,  v0.8h
573     abs         v17.8h,  v1.8h
574     ld1         {v2.8h,v3.8h}, [x2], #32
575     cmlt        v18.8h,  v0.8h,   #0
576     cmlt        v19.8h,  v1.8h,   #0
577     uaddw       v4.4s,   v4.4s,   v16.4h
578     uaddw2      v5.4s,   v5.4s,   v16.8h
579     uqsub       v20.8h,  v16.8h,  v2.8h
580     uqsub       v21.8h,  v17.8h,  v3.8h
581     uaddw       v6.4s,   v6.4s,   v17.4h
582     uaddw2      v7.4s,   v7.4s,   v17.8h
583     neg         v22.8h,  v20.8h
584     neg         v23.8h,  v21.8h
585     bsl         v18.16b, v22.16b, v20.16b
586     bsl         v19.16b, v23.16b, v21.16b
587     st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
588     st1         {v18.8h,v19.8h}, [x0], #32
589     b.gt        1b
590     ret
591 endfunc