/****************************************************************************
 * quant.S: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2009-2015 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Martin Storsjo <martin@martin.st>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

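// QUANT_TWO quantizes 16 coefficients at a time; v16/v17 must hold the
// original coefficients and v18/v19 their absolute values. A rough C sketch
// of what each lane computes (a hedged reading of the code below, not a
// definitive spec):
//
//     int q = ((abs(dct[i]) + bias[i]) * mf[i]) >> 16;
//     dct[i] = dct[i] < 0 ? -q : q;
//
// The results are ORed into \mask (used later for the nonzero check) and
// stored back through x0, which is advanced by 32 bytes.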
.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
    add         v18.8h, v18.8h, \bias0
    add         v19.8h, v19.8h, \bias1
    umull       v20.4s, v18.4h, \mf0_1\().4h
    umull2      v21.4s, v18.8h, \mf0_1\().8h
    umull       v22.4s, v19.4h, \mf2_3\().4h
    umull2      v23.4s, v19.8h, \mf2_3\().8h
    sshr        v16.8h, v16.8h, #15
    sshr        v17.8h, v17.8h, #15
    shrn        v18.4h, v20.4s, #16
    shrn2       v18.8h, v21.4s, #16
    shrn        v19.4h, v22.4s, #16
    shrn2       v19.8h, v23.4s, #16
    eor         v18.16b, v18.16b, v16.16b
    eor         v19.16b, v19.16b, v17.16b
    sub         v18.8h, v18.8h, v16.8h
    sub         v19.8h, v19.8h, v17.8h
    orr         \mask,  v18.16b, v19.16b
    st1        {v18.8h,v19.8h}, [x0], #32
.endm

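// QUANT_END turns the accumulated nonzero summary (passed as a D register)
// into the return value: w0 = 1 if any quantized coefficient is nonzero,
// else 0.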
.macro QUANT_END d
    fmov        x2,  \d
    mov         w0,  #0
    tst         x2,  x2
    cinc        w0,  w0,  ne
    ret
.endm

// quant_2x2_dc( int16_t dct[4], int mf, int bias )
function x264_quant_2x2_dc_neon, export=1
    ld1        {v0.4h}, [x0]
    dup         v2.4h,  w2
    dup         v1.4h,  w1
    abs         v3.4h,  v0.4h
    add         v3.4h,  v3.4h,  v2.4h
    umull       v3.4s,  v3.4h,  v1.4h
    sshr        v0.4h,  v0.4h,  #15
    shrn        v3.4h,  v3.4s,  #16
    eor         v3.8b,  v3.8b,  v0.8b
    sub         v3.4h,  v3.4h,  v0.4h
    st1        {v3.4h}, [x0]
    QUANT_END   d3
endfunc

// quant_4x4_dc( int16_t dct[16], int mf, int bias )
function x264_quant_4x4_dc_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    dup         v0.8h,  w2
    dup         v2.8h,  w1
    QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
    uqxtn       v0.8b,  v0.8h
    QUANT_END   d0
endfunc

// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    ld1        {v0.8h,v1.8h}, [x2]
    ld1        {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
    uqxtn       v0.8b,  v0.8h
    QUANT_END   d0
endfunc

// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
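// Unlike the single-block variants, this returns a 4-bit mask in w0: bit i is
// set if sub-block i contains a nonzero quantized coefficient (assembled by
// the cinc/lsl sequence at the end of the function).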
function x264_quant_4x4x4_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2]
    ld1        {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b
    uqxtn       v4.8b,  v4.8h
    uqxtn       v7.8b,  v7.8h
    uqxtn       v6.8b,  v6.8h
    uqxtn       v5.8b,  v5.8h
    fmov        x7,  d7
    fmov        x6,  d6
    fmov        x5,  d5
    fmov        x4,  d4
    mov         w0,  #0
    tst         x7,  x7
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x6,  x6
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x5,  x5
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x4,  x4
    cinc        w0,  w0,  ne
    ret
endfunc

// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
.rept 3
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
    orr         v4.16b, v4.16b, v5.16b
.endr
    uqxtn       v0.8b,  v4.8h
    QUANT_END   d0
endfunc

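// DEQUANT_START splits the QP: i_qbits = i_qp/6 (computed as (i_qp*0x2b)>>8,
// which matches integer division by 6 for the QP range used here) and
// i_mf = i_qp%6. It then points x1 at dequant_mf[i_mf] (or loads the first
// entry directly for the DC case) and sets the flags from i_qbits - offset,
// so callers can branch to the left- or right-shift path.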
.macro DEQUANT_START mf_size offset dc=no
    mov         w3,  #0x2b
    mul         w3,  w3,  w2
    lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
    add         w5,  w3,  w3,  lsl #1
    sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
    lsl         w2,  w2,  #\mf_size
.ifc \dc,no
    add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
.else
    ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
.endif
    subs        w3,  w3,  #\offset      // 6 for 8x8
.endm

// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
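// A rough C sketch of the two paths below (a hedged reading of x264's C
// reference, not a definitive spec), with i_qbits = i_qp/6 minus 4 for 4x4
// or 6 for 8x8:
//
//     if( i_qbits >= 0 )
//         dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits;
//     else
//         dct[i] = (dct[i] * dequant_mf[i_mf][i] + (1 << (-i_qbits-1)))
//                  >> (-i_qbits);
//
// The left-shift path multiplies in 16 bits; the right-shift path widens to
// 32 bits so the rounding add and arithmetic shift are done without overflow.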
.macro DEQUANT size bits
function x264_dequant_\size\()_neon, export=1
    DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
    mov         w2,  #4
.endif
    b.lt        dequant_\size\()_rshift

    dup         v31.8h, w3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s}, [x1], #16
    ld1        {v17.4s}, [x1], #16
    sqxtn       v2.4h,  v16.4s
    ld1        {v18.4s}, [x1], #16
    sqxtn2      v2.8h,  v17.4s
    ld1        {v19.4s}, [x1], #16
    sqxtn       v3.4h,  v18.4s
    ld1        {v0.8h,v1.8h}, [x0]
    sqxtn2      v3.8h,  v19.4s
    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v3.8h
    sshl        v0.8h,  v0.8h,  v31.8h
    sshl        v1.8h,  v1.8h,  v31.8h
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_lshift_loop
.endif
    ret

dequant_\size\()_rshift:
    dup         v31.4s, w3
    neg         w3,  w3
    mov         w5,  #1
    sub         w3,  w3,  #1
    lsl         w5,  w5,  w3

.ifc \size, 8x8
dequant_\size\()_rshift_loop:
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s}, [x1], #16
    ld1        {v17.4s}, [x1], #16
    sqxtn       v2.4h,  v16.4s
    ld1        {v18.4s}, [x1], #16
    dup         v16.4s, w5
    sqxtn2      v2.8h,  v17.4s
    ld1        {v19.4s}, [x1], #16
    dup         v17.4s, w5
    sqxtn       v3.4h,  v18.4s
    ld1        {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    sqxtn2      v3.8h,  v19.4s
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v2.4h
    smlal2      v17.4s, v0.8h,  v2.8h
    smlal       v18.4s, v1.4h,  v3.4h
    smlal2      v19.4s, v1.8h,  v3.8h
    sshl        v16.4s, v16.4s, v31.4s
    sshl        v17.4s, v17.4s, v31.4s
    sshl        v18.4s, v18.4s, v31.4s
    sshl        v19.4s, v19.4s, v31.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_rshift_loop
.endif
    ret
endfunc
.endm

DEQUANT 4x4, 4
DEQUANT 8x8, 6

// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
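// DC variant: every coefficient is scaled by the single DC entry
// dequant_mf[i_mf][0] (loaded by DEQUANT_START with dc=yes) instead of a
// per-position table entry; the shift paths mirror the macro above.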
function x264_dequant_4x4_dc_neon, export=1
    DEQUANT_START 6, 6, yes
    b.lt        dequant_4x4_dc_rshift

    lsl         w1,  w1,  w3
    dup         v2.8h,  w1
    ld1        {v0.8h,v1.8h},   [x0]

    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v2.8h
    st1        {v0.8h,v1.8h},   [x0]
    ret

dequant_4x4_dc_rshift:
    dup         v4.8h,  w1
    dup         v3.4s, w3
    neg         w3,  w3
    mov         w5,  #1
    sub         w3,  w3,  #1
    lsl         w5,  w5,  w3

    dup         v16.4s, w5
    dup         v17.4s, w5
    ld1        {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v4.4h
    smlal2      v17.4s, v0.8h,  v4.8h
    smlal       v18.4s, v1.4h,  v4.4h
    smlal2      v19.4s, v1.8h,  v4.8h
    sshl        v16.4s, v16.4s, v3.4s
    sshl        v17.4s, v17.4s, v3.4s
    sshl        v18.4s, v18.4s, v3.4s
    sshl        v19.4s, v19.4s, v3.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0]
    ret
endfunc

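// decimate_score15/16: scores a block for decimation. A hedged sketch of the
// intended behaviour, matching the branches below:
//
//     if any |dct[i]| > 1, return 9 immediately;
//     otherwise sum x264_decimate_table4[run] over the run of zeros that
//     precedes each nonzero coefficient.
//
// After the fmov, the scalar loop walks those runs using 4 bits per
// coefficient in x1.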
.macro decimate_score_1x size
function x264_decimate_score\size\()_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0]
    movrel      x5,  X(x264_decimate_table4)
    movi        v3.16b, #0x01
    sqxtn       v0.8b,  v0.8h
    sqxtn2      v0.16b, v1.8h
    abs         v2.16b, v0.16b
    cmeq        v1.16b, v0.16b, #0
    cmhi        v2.16b, v2.16b, v3.16b
    shrn        v1.8b,  v1.8h,  #4
    shrn        v2.8b,  v2.8h,  #4
    fmov        x2,  d2
    fmov        x1,  d1
    cbnz        x2,  9f
    mvn         x1,  x1
    mov         w0,  #0
    cbz         x1,  0f
.ifc \size, 15
    lsr         x1,  x1,  #1
.endif
    rbit        x1,  x1
1:
    clz         x3,  x1
    lsr         x6,  x3,  #2
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x6]
    lsl         x1,  x1,  #4
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
.endm

decimate_score_1x 15
decimate_score_1x 16

const mask64, align=6
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
endconst

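// The 64-coefficient variant below uses x264_decimate_table8 and the mask64
// bytes above: ANDing the per-coefficient comparison flags with mask64 gives
// each coefficient a distinct bit, and the addp passes collapse those flags
// into a single 64-bit map with one bit per coefficient.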
function x264_decimate_score64_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0], #32
    ld1        {v2.8h,v3.8h}, [x0], #32
    ld1        {v4.8h,v5.8h}, [x0], #32
    ld1        {v6.8h,v7.8h}, [x0]
    movrel      x6,  mask64
    movi        v31.16b, #0x01
    sqxtn       v16.8b,  v1.8h
    sqxtn2      v16.16b, v0.8h
    sqxtn       v17.8b,  v3.8h
    sqxtn2      v17.16b, v2.8h
    sqxtn       v18.8b,  v5.8h
    sqxtn2      v18.16b, v4.8h
    sqxtn       v19.8b,  v7.8h
    sqxtn2      v19.16b, v6.8h
    abs         v4.16b, v16.16b
    abs         v5.16b, v17.16b
    abs         v6.16b, v18.16b
    abs         v7.16b, v19.16b
    ld1        {v30.16b}, [x6]
    cmeq        v0.16b, v16.16b, #0
    cmeq        v1.16b, v17.16b, #0
    cmeq        v2.16b, v18.16b, #0
    cmeq        v3.16b, v19.16b, #0
    umax        v4.16b, v4.16b, v5.16b
    umax        v6.16b, v6.16b, v7.16b
    and         v0.16b, v0.16b, v30.16b
    and         v1.16b, v1.16b, v30.16b
    and         v2.16b, v2.16b, v30.16b
    and         v3.16b, v3.16b, v30.16b
    umax        v4.16b, v4.16b, v6.16b
    addp        v0.16b, v1.16b, v0.16b
    addp        v2.16b, v3.16b, v2.16b
    cmhi        v4.16b, v4.16b, v31.16b
    addp        v0.16b, v2.16b, v0.16b
    shrn        v4.8b,  v4.8h,  #4
    addp        v0.16b, v0.16b, v0.16b
    fmov        x2,  d4
    fmov        x1,  d0
    cbnz        x2,  9f
    mvn         x1,  x1
    mov         w0,  #0
    cbz         x1,  0f
    movrel      x5,  X(x264_decimate_table8)
1:
    clz         x3,  x1
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x3]
    lsl         x1,  x1,  #1
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc

// int coeff_last( int16_t *l )
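// coeff_last returns the index of the last nonzero coefficient. For the
// 4-coefficient case this is a single 64-bit load: each coefficient occupies
// 16 bits, so the result is 3 - (clz(packed) >> 4).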
function x264_coeff_last4_aarch64, export=1
    ldr         x2,  [x0]
    mov         w4,  #3
    clz         x0,  x2
    sub         w0,  w4,  w0, lsr #4
    ret
endfunc

function x264_coeff_last8_aarch64, export=1
    ldr         x3,  [x0, #8]
    mov         w4,  #7
    clz         x2,  x3
    cmp         w2,  #64
    b.ne        1f
    ldr         x3,  [x0]
    sub         w4,  w4,  #4
    clz         x2,  x3
1:
    sub         w0,  w4,  w2, lsr #4
    ret
endfunc

.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
    ld1        {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b,  v0.8h,  #4
    fmov        x1,  d0
    mov         w3,  #\size - 1
    clz         x2,  x1
    sub         w0,  w3,  w2, lsr #2
    ret
endfunc
.endm

COEFF_LAST_1x 15
COEFF_LAST_1x 16

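// coeff_last64 reduces each group of eight coefficients to one byte whose set
// bit marks the last nonzero coefficient within that group; a scalar clz on
// the packed 64-bit result then yields the overall index as 63 - clz.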
function x264_coeff_last64_neon, export=1
    ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
    movi        v31.8h,  #8
    movi        v30.8h,  #1
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
    uqxtn       v1.8b,  v2.8h
    uqxtn2      v1.16b, v3.8h
    uqxtn       v2.8b,  v4.8h
    uqxtn2      v2.16b, v5.8h
    uqxtn       v3.8b,  v6.8h
    uqxtn2      v3.16b, v7.8h

    cmtst       v0.16b, v0.16b, v0.16b
    cmtst       v1.16b, v1.16b, v1.16b
    cmtst       v2.16b, v2.16b, v2.16b
    cmtst       v3.16b, v3.16b, v3.16b

    shrn        v0.8b,  v0.8h,  #4
    shrn2       v0.16b, v1.8h,  #4
    shrn        v1.8b,  v2.8h,  #4
    shrn2       v1.16b, v3.8h,  #4

    clz         v0.4s,  v0.4s
    clz         v1.4s,  v1.4s

    shrn        v0.4h,  v0.4s,  #2
    shrn2       v0.8h,  v1.4s,  #2

    sub         v0.8h,  v31.8h,  v0.8h
    sshl        v0.8h,  v30.8h,  v0.8h
    shrn        v0.8b,  v0.8h,  #1

    fmov        x2,  d0
    mov         w3,  #63
    clz         x2,  x2
    sub         w0,  w3,  w2
    ret
endfunc

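// coeff_level_run fills an x264_run_level_t: the index of the last nonzero
// coefficient, the nonzero coefficients themselves (written from that index
// downwards), and a bitmask of their positions; the return value in w0 is the
// number of nonzero coefficients. The #23/&~15 arithmetic locates the
// 16-byte-aligned level array inside the struct.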
.macro coeff_level_run_start size
    add         x6,  x1,  #23            // runlevel->level (16-byte aligned)
    mov         w7,  #0
    mov         w8,  #0
    mov         w9,  #1
    and         x6,  x6,  #~15
    mov         w4,  #\size - 1
.endm

.macro coeff_level_run shift
    clz         x3,  x2
    subs        w4,  w4,  w3, lsr #\shift
    str         w4,  [x1], #4
1:
    ldrh        w5,  [x0, x4, lsl #1]
    strh        w5,  [x6], #2
    add         w7,  w7,  #1
    lsl         w10, w9, w4
    orr         w8,  w8,  w10
    b.le        2f
    add         w3,  w3,  #1 << \shift
    sub         w4,  w4,  #1
    and         x3,  x3,  #~((1 << \shift) - 1)
    lsl         x2,  x2,  x3
    clz         x3,  x2
    subs        w4,  w4,  w3, lsr #\shift
    b.ge        1b
2:
    str         w8,  [x1]
    mov         w0,  w7
.endm

function x264_coeff_level_run4_aarch64, export=1
    ldr         x2,  [x0]

    coeff_level_run_start 4

    coeff_level_run 4

    ret
endfunc

.macro X264_COEFF_LEVEL_RUN size
function x264_coeff_level_run\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
.if         \size < 15
    ld1         {v0.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    cmtst       v0.8b,  v0.8b,  v0.8b
.else
    ld1         {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b,  v0.8h,  #4
.endif
    fmov        x2,  d0
.if \size == 15
    add         x0,  x0,  #2
.endif

    coeff_level_run_start \size

    coeff_level_run (4 - (\size + 1) / 8)

    ret
endfunc
.endm

X264_COEFF_LEVEL_RUN 8
X264_COEFF_LEVEL_RUN 15
X264_COEFF_LEVEL_RUN 16

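// denoise_dct( dct, sum, offset, size ): a rough C sketch of each iteration,
// as implemented below (a hedged reading following x264's C reference):
//
//     sum[i] += abs(dct[i]);
//     dct[i]  = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0);
//
// The uqsub provides the saturating max(.,0); cmlt/neg/bsl restore the sign.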
function x264_denoise_dct_neon, export=1
1:  subs        w3,  w3,  #16
    ld1         {v0.8h,v1.8h}, [x0]
    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
    abs         v16.8h,  v0.8h
    abs         v17.8h,  v1.8h
    ld1         {v2.8h,v3.8h}, [x2], #32
    cmlt        v18.8h,  v0.8h,   #0
    cmlt        v19.8h,  v1.8h,   #0
    uaddw       v4.4s,   v4.4s,   v16.4h
    uaddw2      v5.4s,   v5.4s,   v16.8h
    uqsub       v20.8h,  v16.8h,  v2.8h
    uqsub       v21.8h,  v17.8h,  v3.8h
    uaddw       v6.4s,   v6.4s,   v17.4h
    uaddw2      v7.4s,   v7.4s,   v17.8h
    neg         v22.8h,  v20.8h
    neg         v23.8h,  v21.8h
    bsl         v18.16b, v22.16b, v20.16b
    bsl         v19.16b, v23.16b, v21.16b
    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
    st1         {v18.8h,v19.8h}, [x0], #32
    b.gt        1b
    ret
endfunc