git.sesse.net Git - x264/blob - common/aarch64/quant-a.S

   1 /****************************************************************************
   2  * quant-a.S: aarch64 quantization and level-run
   3  *****************************************************************************
   4  * Copyright (C) 2009-2014 x264 project
   5  *
   6  * Authors: David Conrad <lessen42@gmail.com>
   7  *          Janne Grunau <janne-x264@jannau.net>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *
  23  * This program is also available under a commercial proprietary license.
  24  * For more information, contact us at licensing@x264.com.
  25  *****************************************************************************/
  26
  27 #include "asm.S"
  28
  29 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
     // Quantize 16 coefficients and store them at [x0], advancing x0 by 32.
     //   v16, v17      original signed coefficients; only their sign is used
     //   v18, v19      values to quantize (the callers pass abs of v16/v17)
     //   bias0, bias1  complete vector operands (e.g. v0.8h) added to v18/v19
     //   mf0_1, mf2_3  bare register names; the .4h/.8h suffix is appended here
     //   mask          complete .16b operand; receives the OR of both result
     //                 vectors, so it is all zero iff all 16 results are zero
     // Overwrites v16-v23.
  30     add         v18.8h, v18.8h, \bias0
  31     add         v19.8h, v19.8h, \bias1
     // unsigned 16x16 -> 32 bit products of (value + bias) and mf
  32     umull       v20.4s, v18.4h, \mf0_1\().4h
  33     umull2      v21.4s, v18.8h, \mf0_1\().8h
  34     umull       v22.4s, v19.4h, \mf2_3\().4h
  35     umull2      v23.4s, v19.8h, \mf2_3\().8h
     // v16/v17 become 0 for non-negative and -1 for negative coefficients
  36     sshr        v16.8h, v16.8h, #15
  37     sshr        v17.8h, v17.8h, #15
     // keep bits 16..31 of every product
  38     shrn        v18.4h, v20.4s, #16
  39     shrn2       v18.8h, v21.4s, #16
  40     shrn        v19.4h, v22.4s, #16
  41     shrn2       v19.8h, v23.4s, #16
     // (x ^ s) - s negates exactly the lanes whose sign mask s is -1
  42     eor         v18.16b, v18.16b, v16.16b
  43     eor         v19.16b, v19.16b, v17.16b
  44     sub         v18.8h, v18.8h, v16.8h
  45     sub         v19.8h, v19.8h, v17.8h
  46     orr         \mask,  v18.16b, v19.16b
  47     st1        {v18.8h,v19.8h}, [x0], #32
  48 .endm
  49
  50 .macro QUANT_END d
     // Function epilogue: return 1 in w0 when the 64-bit SIMD register given
     // as argument has any bit set, 0 otherwise.  Overwrites x2 and the flags.
  51     fmov        x2,  \d
  52     tst         x2,  x2
  53     cset        w0,  ne
  54     ret
  55 .endm
  57
  58 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
     // Quantizes the four coefficients in place with a single mf/bias pair and
     // returns 1 if any quantized coefficient is nonzero, 0 otherwise.
  59 function x264_quant_2x2_dc_neon, export=1
  60     dup         v5.4h,  w1              // mf in every lane
  61     dup         v6.4h,  w2              // bias in every lane
  62     ld1        {v4.4h}, [x0]
  63     sshr        v7.4h,  v4.4h,  #15     // sign mask: 0 or -1 per lane
  64     abs         v4.4h,  v4.4h
  65     add         v4.4h,  v4.4h,  v6.4h
  66     umull       v4.4s,  v4.4h,  v5.4h
  67     shrn        v4.4h,  v4.4s,  #16     // ((abs + bias) * mf) >> 16
  68     eor         v4.8b,  v4.8b,  v7.8b
  69     sub         v4.4h,  v4.4h,  v7.4h   // put the sign back
  70     st1        {v4.4h}, [x0]
  71     QUANT_END   d4
  72 endfunc
  73
  74 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
     // Quantizes 16 coefficients in place with one scalar mf/bias pair and
     // returns 1 if any quantized coefficient is nonzero, 0 otherwise.
  75 function x264_quant_4x4_dc_neon, export=1
  76     dup         v1.8h,  w2              // bias in every lane
  77     dup         v3.8h,  w1              // mf in every lane
  78     ld1        {v16.8h,v17.8h}, [x0]
  79     abs         v18.8h, v16.8h
  80     abs         v19.8h, v17.8h
  81     QUANT_TWO   v1.8h,  v1.8h,  v3,  v3,  v4.16b
  82     uqxtn       v4.8b,  v4.8h           // fold the 128-bit mask into 64 bits
  83     QUANT_END   d4
  84 endfunc
  85
  86 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
     // Quantizes 16 coefficients in place with per-coefficient mf/bias and
     // returns 1 if any quantized coefficient is nonzero, 0 otherwise.
  87 function x264_quant_4x4_neon, export=1
  88     ld1        {v4.8h,v5.8h}, [x1]      // mf
  89     ld1        {v6.8h,v7.8h}, [x2]      // bias
  90     ld1        {v16.8h,v17.8h}, [x0]
  91     abs         v18.8h, v16.8h
  92     abs         v19.8h, v17.8h
  93     QUANT_TWO   v6.8h,  v7.8h,  v4,  v5,  v0.16b
  94     uqxtn       v0.8b,  v0.8h           // fold the 128-bit mask into 64 bits
  95     QUANT_END   d0
  96 endfunc
  97
  98 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
     // Quantizes four consecutive 16-coefficient blocks in place with the same
     // mf/bias tables.  Bit n of the return value is set when block n has at
     // least one nonzero quantized coefficient.
  99 function x264_quant_4x4x4_neon, export=1
 100     ld1        {v2.8h,v3.8h}, [x1]      // mf
 101     ld1        {v0.8h,v1.8h}, [x2]      // bias
     // block 0, nonzero mask in v4 (QUANT_TWO advances x0 to the next block)
 102     ld1        {v16.8h,v17.8h}, [x0]
 103     abs         v18.8h, v16.8h
 104     abs         v19.8h, v17.8h
 105     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
     // block 1, nonzero mask in v5
 106     ld1        {v16.8h,v17.8h}, [x0]
 107     abs         v18.8h, v16.8h
 108     abs         v19.8h, v17.8h
 109     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
     // block 2, nonzero mask in v6
 110     ld1        {v16.8h,v17.8h}, [x0]
 111     abs         v18.8h, v16.8h
 112     abs         v19.8h, v17.8h
 113     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b
     // block 3, nonzero mask in v7
 114     ld1        {v16.8h,v17.8h}, [x0]
 115     abs         v18.8h, v16.8h
 116     abs         v19.8h, v17.8h
 117     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b
     // fold every 128-bit mask into 64 bits and move it to a general register
 118     uqxtn       v4.8b,  v4.8h
 119     uqxtn       v5.8b,  v5.8h
 120     uqxtn       v6.8b,  v6.8h
 121     uqxtn       v7.8b,  v7.8h
 122     fmov        x4,  d4
 123     fmov        x5,  d5
 124     fmov        x6,  d6
 125     fmov        x7,  d7
     // one flag per block (x1/x2 are free, the tables are already loaded)
 126     tst         x4,  x4
 127     cset        w0,  ne
 128     tst         x5,  x5
 129     cset        w1,  ne
 130     tst         x6,  x6
 131     cset        w2,  ne
 132     tst         x7,  x7
 133     cset        w3,  ne
     // w0 = b0 | b1 << 1 | b2 << 2 | b3 << 3
 134     orr         w0,  w0,  w1,  lsl #1
 135     orr         w2,  w2,  w3,  lsl #1
 136     orr         w0,  w0,  w2,  lsl #2
 137     ret
 138 endfunc
 140
 141 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
     // Quantizes 64 coefficients in place, 16 per step.  x1 (mf) and x2 (bias)
     // are post-incremented by the loads below, x0 (dct) by the store inside
     // QUANT_TWO.  Returns 1 if any quantized coefficient is nonzero, else 0.
 142 function x264_quant_8x8_neon, export=1
     // first 16 coefficients: their nonzero mask goes straight to v4
 143     ld1        {v16.8h,v17.8h}, [x0]
 144     abs         v18.8h, v16.8h
 145     abs         v19.8h, v17.8h
 146     ld1        {v0.8h,v1.8h}, [x2], #32
 147     ld1        {v2.8h,v3.8h}, [x1], #32
 148     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
     // remaining 3 x 16 coefficients: mask in v5, accumulated into v4
 149 .rept 3
 150     ld1        {v16.8h,v17.8h}, [x0]
 151     abs         v18.8h, v16.8h
 152     abs         v19.8h, v17.8h
 153     ld1        {v0.8h,v1.8h}, [x2], #32
 154     ld1        {v2.8h,v3.8h}, [x1], #32
 155     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
 156     orr         v4.16b, v4.16b, v5.16b
 157 .endr
     // unsigned saturating narrow keeps nonzero lanes nonzero, so the 64-bit
     // d0 is zero iff the whole accumulated mask is zero
 158     uqxtn       v0.8b,  v4.8h
 159     QUANT_END   d0
 160 endfunc
 161
 162 .macro DEQUANT_START mf_size offset dc=no
     // Shared prologue of the dequant functions.
     //   in:   x1 = dequant_mf base, w2 = i_qp
     //   out:  w3 = i_qp / 6 - offset, with the flags of that subtraction set
     //              (the users branch with b.lt when it is negative)
     //         dc == no:  x1 = address of the table selected by i_qp % 6
     //         otherwise: x1 = 64-bit value loaded from the start of that table
     //   mf_size is log2 of the byte size of one table.
     //   Overwrites w2 and w5.
 163     mov         w3,  #0x2b
     // multiply by 43 and shift right by 8: 43/256 approximates 1/6
 164     mul         w3,  w3,  w2
 165     lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
 166     add         w5,  w3,  w3,  lsl #1   // w5 = 3 * i_qbits
 167     sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
 168     lsl         w2,  w2,  #\mf_size     // byte offset of table i_mf
 169 .ifc \dc,no
 170     add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
 171 .else
 172     ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
 173 .endif
 174     subs        w3,  w3,  #\offset      // 6 for 8x8
 175 .endm
 176
 177 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
     // DEQUANT size, bits: emits x264_dequant_<size>_neon.
     //   x0 = dct (int16, rewritten in place), x1 = dequant_mf (int32 tables),
     //   w2 = i_qp.
     // DEQUANT_START gets mf_size = bits+2 and offset = bits, leaving
     // w3 = i_qp/6 - bits.  w3 >= 0: dct * mf << w3 in 16-bit arithmetic.
     // w3 < 0: (dct * mf + (1 << (-w3-1))) >> -w3 in 32-bit arithmetic, then a
     // signed saturating narrow.  Each pass handles 16 coefficients; the 8x8
     // variant makes four passes, the 4x4 variant one.
 178 .macro DEQUANT size bits
 179 function x264_dequant_\size\()_neon, export=1
 180     DEQUANT_START \bits+2, \bits
 181 .ifc \size, 8x8
 182     mov         w2,  #4                 // pass counter; mov leaves the flags alone
 183 .endif
     // still testing the flags of the subs in DEQUANT_START
 184     b.lt        dequant_\size\()_rshift
 185
 186     dup         v31.8h, w3              // left shift amount for every lane
 187 dequant_\size\()_lshift_loop:
 188 .ifc \size, 8x8
     // flags consumed by the b.gt at the bottom; nothing in between sets flags
 189     subs        w2,  w2,  #1
 190 .endif
     // 16 int32 multipliers, narrowed to int16 with signed saturation
 191     ld1        {v16.4s}, [x1], #16
 192     ld1        {v17.4s}, [x1], #16
 193     sqxtn       v2.4h,  v16.4s
 194     ld1        {v18.4s}, [x1], #16
 195     sqxtn2      v2.8h,  v17.4s
 196     ld1        {v19.4s}, [x1], #16
 197     sqxtn       v3.4h,  v18.4s
 198     ld1        {v0.8h,v1.8h}, [x0]
 199     sqxtn2      v3.8h,  v19.4s
 200     mul         v0.8h,  v0.8h,  v2.8h
 201     mul         v1.8h,  v1.8h,  v3.8h
 202     sshl        v0.8h,  v0.8h,  v31.8h
 203     sshl        v1.8h,  v1.8h,  v31.8h
 204     st1        {v0.8h,v1.8h}, [x0], #32
 205 .ifc \size, 8x8
 206     b.gt        dequant_\size\()_lshift_loop
 207 .endif
 208     ret
 209
 210 dequant_\size\()_rshift:
     // w3 is negative here: sshl by a negative count shifts right
 211     dup         v31.4s, w3
     // w5 = 1 << (-w3 - 1), the rounding term added before the shift
 212     neg         w3,  w3
 213     mov         w5,  #1
 214     sub         w3,  w3,  #1
 215     lsl         w5,  w5,  w3
 216
 217 .ifc \size, 8x8
 218 dequant_\size\()_rshift_loop:
     // flags consumed by the b.gt at the bottom; nothing in between sets flags
 219     subs        w2,  w2,  #1
 220 .endif
     // v16-v19 first carry the int32 multipliers; once each one is narrowed
     // it is reloaded with the rounding term and used as accumulator
 221     ld1        {v16.4s}, [x1], #16
 222     ld1        {v17.4s}, [x1], #16
 223     sqxtn       v2.4h,  v16.4s
 224     ld1        {v18.4s}, [x1], #16
 225     dup         v16.4s, w5
 226     sqxtn2      v2.8h,  v17.4s
 227     ld1        {v19.4s}, [x1], #16
 228     dup         v17.4s, w5
 229     sqxtn       v3.4h,  v18.4s
 230     ld1        {v0.8h,v1.8h}, [x0]
 231     dup         v18.4s, w5
 232     sqxtn2      v3.8h,  v19.4s
 233     dup         v19.4s, w5
 234
     // 32-bit accumulate: rounding term + dct * mf
 235     smlal       v16.4s, v0.4h,  v2.4h
 236     smlal2      v17.4s, v0.8h,  v2.8h
 237     smlal       v18.4s, v1.4h,  v3.4h
 238     smlal2      v19.4s, v1.8h,  v3.8h
 239     sshl        v16.4s, v16.4s, v31.4s
 240     sshl        v17.4s, v17.4s, v31.4s
 241     sshl        v18.4s, v18.4s, v31.4s
 242     sshl        v19.4s, v19.4s, v31.4s
 243
     // back to int16 with signed saturation
 244     sqxtn       v0.4h,  v16.4s
 245     sqxtn2      v0.8h,  v17.4s
 246     sqxtn       v1.4h,  v18.4s
 247     sqxtn2      v1.8h,  v19.4s
 248     st1        {v0.8h,v1.8h}, [x0], #32
 249 .ifc \size, 8x8
 250     b.gt        dequant_\size\()_rshift_loop
 251 .endif
 252     ret
 253 endfunc
 254 .endm
 255
     // 4x4: 16 int32 per table (64 bytes), i_qbits = i_qp/6 - 4
 256 DEQUANT 4x4, 4
     // 8x8: 64 int32 per table (256 bytes), i_qbits = i_qp/6 - 6
 257 DEQUANT 8x8, 6
 258
 259 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
     // Dequantizes 16 coefficients in place with one multiplier, the first
     // entry of the table selected by i_qp % 6.  With i_qbits = i_qp/6 - 6:
     //   i_qbits >= 0: dct * (mf << i_qbits)                       (16-bit)
     //   i_qbits <  0: (dct * mf + (1 << (-i_qbits-1))) >> -i_qbits (32-bit,
     //                 narrowed with signed saturation)
 260 function x264_dequant_4x4_dc_neon, export=1
     // dc variant: x1 now holds the loaded table entry (an 8-byte load of
     // which only w1 is used below), w3 = i_qbits with flags set
 261     DEQUANT_START 6, 6, yes
 262     b.lt        dequant_4x4_dc_rshift
 263
 264     lsl         w1,  w1,  w3            // fold the left shift into the multiplier
 265     dup         v2.8h,  w1              // its low 16 bits in every lane
 266     ld1        {v0.8h,v1.8h},   [x0]
 267
 268     mul         v0.8h,  v0.8h,  v2.8h
 269     mul         v1.8h,  v1.8h,  v2.8h
 270     st1        {v0.8h,v1.8h},   [x0]
 271     ret
 272
 273 dequant_4x4_dc_rshift:
 274     dup         v4.8h,  w1              // multiplier in every 16-bit lane
 275     dup         v3.4s, w3               // negative count: sshl shifts right
     // w5 = 1 << (-w3 - 1), the rounding term added before the shift
 276     neg         w3,  w3
 277     mov         w5,  #1
 278     sub         w3,  w3,  #1
 279     lsl         w5,  w5,  w3
 280
     // accumulators start out as the rounding term
 281     dup         v16.4s, w5
 282     dup         v17.4s, w5
 283     ld1        {v0.8h,v1.8h}, [x0]
 284     dup         v18.4s, w5
 285     dup         v19.4s, w5
 286
 287     smlal       v16.4s, v0.4h,  v4.4h
 288     smlal2      v17.4s, v0.8h,  v4.8h
 289     smlal       v18.4s, v1.4h,  v4.4h
 290     smlal2      v19.4s, v1.8h,  v4.8h
 291     sshl        v16.4s, v16.4s, v3.4s
 292     sshl        v17.4s, v17.4s, v3.4s
 293     sshl        v18.4s, v18.4s, v3.4s
 294     sshl        v19.4s, v19.4s, v3.4s
 295
     // back to int16 with signed saturation
 296     sqxtn       v0.4h,  v16.4s
 297     sqxtn2      v0.8h,  v17.4s
 298     sqxtn       v1.4h,  v18.4s
 299     sqxtn2      v1.8h,  v19.4s
 300     st1        {v0.8h,v1.8h}, [x0]
 301     ret
 302 endfunc
 303
 304 .macro decimate_score_1x size
     // Emits x264_decimate_score<size>_neon; x0 points at 16 int16 coefficients.
     // Returns 9 in w0 if any coefficient, after signed saturation to 8 bits,
     // has a magnitude above 1.  Otherwise returns the sum of
     // x264_decimate_table4[run] over the nonzero coefficients, where run is
     // the number of zero coefficients directly below each of them (0 if all
     // coefficients are zero).
 305 function x264_decimate_score\size\()_neon, export=1
 306     ld1        {v0.8h,v1.8h}, [x0]
 307     movrel      x5,  X(x264_decimate_table4)
 308     movi        v3.16b, #0x01
 309     sqxtn       v0.8b,  v0.8h
 310     sqxtn2      v0.16b, v1.8h
 311     abs         v2.16b, v0.16b
 312     cmeq        v1.16b, v0.16b, #0      // 0xff where the coefficient is zero
 313     cmhi        v2.16b, v2.16b, v3.16b  // 0xff where the magnitude exceeds 1
     // shrn #4 packs each byte mask into one nibble: 4 bits per coefficient
 314     shrn        v1.8b,  v1.8h,  #4
 315     shrn        v2.8b,  v2.8h,  #4
 316     fmov        x2,  d2
 317     fmov        x1,  d1
 318     cbnz        x2,  9f
 319     mvn         x1,  x1                 // nibble is 0xf where the coefficient is nonzero
 320     mov         w0,  #0
 321     cbz         x1,  0f
 322 .ifc \size, 15
     // Drop one bit so that the nibble of element 0 only contributes 3 bits;
     // when element 0 is zero, clz >> 2 below then counts runs from element 1.
     // NOTE(review): a nonzero element 0 is not simply ignored by this shift;
     // presumably callers always pass a zero there - confirm.
 323     lsr         x1,  x1,  #1
 324 .endif
 325     rbit        x1,  x1                 // element 0 now sits at the top
     // Invariant: x1 != 0 on every entry to the loop, so after the shift by
     // clz the top bit is set.  The former zero test at that point could
     // never succeed and has been removed together with its exit path.
 326 1:
 327     clz         x3,  x1
 328     lsr         x6,  x3,  #2            // run = leading zero nibbles
 329     lsl         x1,  x1,  x3            // skip the run
 330     ldrb        w7,  [x5, x6]
 332     lsl         x1,  x1,  #4            // skip the nonzero coefficient itself
 333     add         w0,  w0,  w7
 334     cbnz        x1,  1b
 338 0:
 339     ret
 340 9:
 341     mov         w0,  #9
 342     ret
 343 endfunc
 344 .endm
 345
 346 decimate_score_1x 15
 347 decimate_score_1x 16
 348
 349 const mask64, align=6
     // one bit per byte lane, most significant bit first, repeated twice
 350     .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
 351     .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
 352 endconst
 353
     // x0 points at 64 int16 coefficients.  Returns 9 in w0 if any coefficient,
     // after signed saturation to 8 bits, has a magnitude above 1.  Otherwise
     // returns the sum of x264_decimate_table8[run] over the nonzero
     // coefficients, where run is the number of zero coefficients directly
     // below each of them (0 if all coefficients are zero).
 354 function x264_decimate_score64_neon, export=1
 355     ld1        {v0.8h,v1.8h}, [x0], #32
 356     ld1        {v2.8h,v3.8h}, [x0], #32
 357     ld1        {v4.8h,v5.8h}, [x0], #32
 358     ld1        {v6.8h,v7.8h}, [x0]
 359     movrel      x6,  mask64
 360     movi        v31.16b, #0x01
     // narrow to bytes; the two halves of every pair are swapped on purpose so
     // that the pairwise adds below leave coefficient 0 in the top bit
 361     sqxtn       v16.8b,  v1.8h
 362     sqxtn2      v16.16b, v0.8h
 363     sqxtn       v17.8b,  v3.8h
 364     sqxtn2      v17.16b, v2.8h
 365     sqxtn       v18.8b,  v5.8h
 366     sqxtn2      v18.16b, v4.8h
 367     sqxtn       v19.8b,  v7.8h
 368     sqxtn2      v19.16b, v6.8h
 369     abs         v4.16b, v16.16b
 370     abs         v5.16b, v17.16b
 371     abs         v6.16b, v18.16b
 372     abs         v7.16b, v19.16b
 373     ld1        {v30.16b}, [x6]
 374     cmeq        v0.16b, v16.16b, #0     // 0xff where the coefficient is zero
 375     cmeq        v1.16b, v17.16b, #0
 376     cmeq        v2.16b, v18.16b, #0
 377     cmeq        v3.16b, v19.16b, #0
 378     umax        v4.16b, v4.16b, v5.16b
 379     umax        v6.16b, v6.16b, v7.16b
     // keep a single, lane-specific bit of every zero flag
 380     and         v0.16b, v0.16b, v30.16b
 381     and         v1.16b, v1.16b, v30.16b
 382     and         v2.16b, v2.16b, v30.16b
 383     and         v3.16b, v3.16b, v30.16b
 384     umax        v4.16b, v4.16b, v6.16b  // largest magnitude per byte position
     // three rounds of pairwise adds merge 8 lane bits into one byte each
 385     addp        v0.16b, v1.16b, v0.16b
 386     addp        v2.16b, v3.16b, v2.16b
 387     cmhi        v4.16b, v4.16b, v31.16b // 0xff where a magnitude exceeds 1
 388     addp        v0.16b, v2.16b, v0.16b
 389     shrn        v4.8b,  v4.8h,  #4
 390     addp        v0.16b, v0.16b, v0.16b
 391     fmov        x2,  d4
 392     fmov        x1,  d0                 // bit 63-i is set where coefficient i is zero
 393     cbnz        x2,  9f
 394     mvn         x1,  x1                 // bit 63-i is set where coefficient i is nonzero
 395     mov         w0,  #0
 396     cbz         x1,  0f
 397     movrel      x5,  X(x264_decimate_table8)
     // Invariant: x1 != 0 on every entry to the loop, so after the shift by
     // clz the top bit is set.  The former zero test at that point could
     // never succeed and has been removed together with its exit path.
 398 1:
 399     clz         x3,  x1                 // run = zero coefficients before the next nonzero one
 400     lsl         x1,  x1,  x3            // skip the run
 401     ldrb        w7,  [x5, x3]
 403     lsl         x1,  x1,  #1            // skip the nonzero coefficient itself
 404     add         w0,  w0,  w7
 405     cbnz        x1,  1b
 409 0:
 410     ret
 411 9:
 412     mov         w0,  #9
 413     ret
 414 endfunc
 415
 416 // int coeff_last( int16_t *l )
     // 4-coefficient variant: returns the index of the last nonzero element of
     // l[0..3], or -1 when all four are zero.
 417 function x264_coeff_last4_aarch64, export=1
 418     ldr         x1,  [x0]               // all four coefficients in one register
 419     mov         w0,  #3
 420     clz         x1,  x1                 // 16 leading zero bits per zero coefficient at the end
 421     sub         w0,  w0,  w1, lsr #4
 422     ret
 423 endfunc
 424
 425 function x264_coeff_last8_aarch64, export=1
     // Returns the index of the last nonzero element of l[0..7] (x0 = l),
     // or -1 when all eight are zero.
 426     ldp         x1,  x2,  [x0]          // x1 = l[0..3], x2 = l[4..7]
 427     mov         w3,  #7                 // top index of the upper half
 428     mov         w4,  #3                 // top index of the lower half
 429     cmp         x2,  #0
 430     csel        x1,  x2,  x1,  ne       // search the upper half unless it is all zero
 431     csel        w3,  w3,  w4,  ne
 432     clz         x1,  x1                 // 16 leading zero bits per zero coefficient at the end
 433     sub         w0,  w3,  w1, lsr #4
 434     ret
 435 endfunc
 438
 439 .macro COEFF_LAST_1x size
     // Emits x264_coeff_last<size>_neon (x0 = l): returns the index of the
     // last nonzero coefficient, or -1 when there is none.
 440 function x264_coeff_last\size\()_neon, export=1
 441 .if \size == 15
     // step back one int16 so that the 16-element load below ends at l[14];
     // the extra leading element can only ever yield -1, the same value as
     // "no nonzero coefficient"
 442     sub         x0,  x0,  #2
 443 .endif
 444     ld1        {v0.8h,v1.8h}, [x0]
     // unsigned saturating narrow keeps nonzero values nonzero
 445     uqxtn       v0.8b,  v0.8h
 446     uqxtn2      v0.16b, v1.8h
 447     cmtst       v0.16b, v0.16b, v0.16b  // 0xff where the element is nonzero
 448     shrn        v0.8b,  v0.8h,  #4      // pack to 4 bits per element
 449     fmov        x1,  d0
 450     mov         w3,  #\size - 1
 451     clz         x2,  x1                 // 4 leading zero bits per zero element at the end
 452     sub         w0,  w3,  w2, lsr #2
 453     ret
 454 endfunc
 455 .endm
 456
 457 COEFF_LAST_1x 15
 458 COEFF_LAST_1x 16
 459
 460 function x264_coeff_last64_neon, export=1
     // Returns the index of the last nonzero element among the 64 int16
     // coefficients at x0, or -1 when all of them are zero.
 461     ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
 462     movi        v31.8h,  #8
 463     movi        v30.8h,  #1
     // unsigned saturating narrow to bytes keeps nonzero values nonzero
 464     uqxtn       v0.8b,  v0.8h
 465     uqxtn2      v0.16b, v1.8h
 466     ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
 467     uqxtn       v1.8b,  v2.8h
 468     uqxtn2      v1.16b, v3.8h
 469     uqxtn       v2.8b,  v4.8h
 470     uqxtn2      v2.16b, v5.8h
 471     uqxtn       v3.8b,  v6.8h
 472     uqxtn2      v3.16b, v7.8h
 473
     // 0xff in every byte whose coefficient is nonzero
 474     cmtst       v0.16b, v0.16b, v0.16b
 475     cmtst       v1.16b, v1.16b, v1.16b
 476     cmtst       v2.16b, v2.16b, v2.16b
 477     cmtst       v3.16b, v3.16b, v3.16b
 478
     // pack to 4 bits per coefficient: v0 = elements 0..31, v1 = 32..63
 479     shrn        v0.8b,  v0.8h,  #4
 480     shrn2       v0.16b, v1.8h,  #4
 481     shrn        v1.8b,  v2.8h,  #4
 482     shrn2       v1.16b, v3.8h,  #4
 483
     // every 32-bit lane covers a group of 8 coefficients; its leading zero
     // count is 4 * (zero coefficients at the end of that group)
 484     clz         v0.4s,  v0.4s
 485     clz         v1.4s,  v1.4s
 486
     // divide by 4 and gather the 8 group counts (0..8) in v0.8h
 487     shrn        v0.4h,  v0.4s,  #2
 488     shrn2       v0.8h,  v1.4s,  #2
 489
     // n = 8 - count is the position after the last nonzero coefficient of
     // the group (0 for an empty group); (1 << n) >> 1 is a byte with only
     // that coefficient's bit set, or 0 for an empty group
 490     sub         v0.8h,  v31.8h,  v0.8h
 491     sshl        v0.8h,  v30.8h,  v0.8h
 492     shrn        v0.8b,  v0.8h,  #1
 493
     // one byte per group: the highest set bit of x2 marks the last nonzero
     // coefficient; x2 == 0 gives clz = 64 and a result of -1
 494     fmov        x2,  d0
 495     mov         w3,  #63
 496     clz         x2,  x2
 497     sub         w0,  w3,  w2
 498     ret
 499 endfunc