X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;ds=sidebyside;f=libavcodec%2Faarch64%2Fvp8dsp_neon.S;h=6a727e717c39a5cd5db384eae41b90d1901b8e3f;hb=79025da3f2e7ab047c8f3c0c817952a98480b26b;hp=fbe064e9aaaa70469ccf51ea69c086d304a84892;hpb=c950beb68dee016e0e0a1b729d40abf700d32d1a;p=ffmpeg

# Unified diff against libavcodec/aarch64/vp8dsp_neon.S (AArch64 NEON, GNU as
# syntax, using the function/endfunc/const macros pulled in from asm.S).
#
# What the hunks below do:
#   * add a copyright line to the file header;
#   * add ff_vp8_luma_dc_wht_neon: loads four 4x16-bit rows from [x1], writes
#     32 bytes of zero back to [x1], runs an add/sub butterfly pass, transposes
#     the 4x4 block, adds 3 to the first row, runs the butterfly pass again,
#     arithmetic-shifts every lane right by 3 and stores the 16 halfword
#     results through x0 with a post-increment of 32 bytes per store;
#   * add ff_vp8_idct_dc_add4uv_neon: reads four halfwords from [x1] at a
#     32-byte stride (replicating each across a vector and storing zero back),
#     applies a rounding shift right by 3, widen-adds them to eight 8-byte
#     rows loaded from [x0] with stride x2 (first two values for rows 0-3,
#     last two for rows 4-7, one value per 4-byte half of a row), narrows with
#     unsigned saturation and stores the rows back starting at the original x0;
#   * respell `lsl 16` as `lsl #16`, `bgt` as `b.gt` and `bne` as `b.ne`;
#   * fix the spelling of "arithmetic" in the subpel_filters comment.
#
# NOTE(review): this copy of the patch was recovered from a dump whose line
# breaks and column alignment had been collapsed. Tokens, hunk headers and
# +/- content are preserved, but intra-line spacing is normalised and the
# position of blank context lines (their number is fixed by the @@ counts) is
# a reconstruction -- verify against the target file, and apply with
# whitespace-tolerant options if the context does not match byte-for-byte.

diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index fbe064e9aaa..6a727e717c3 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -4,6 +4,7 @@
  * Copyright (c) 2010 Rob Clark
  * Copyright (c) 2011 Mans Rullgard
  * Copyright (c) 2018 Magnus Röös
+ * Copyright (c) 2019 Martin Storsjo
  *
  * This file is part of FFmpeg.
  *
@@ -25,10 +26,66 @@
 #include "libavutil/aarch64/asm.S"
 #include "neon.S"
 
+function ff_vp8_luma_dc_wht_neon, export=1
+        ld1             {v0.4h - v3.4h}, [x1]
+        movi            v30.8h, #0
+
+        add             v4.4h, v0.4h, v3.4h
+        add             v6.4h, v1.4h, v2.4h
+        st1             {v30.8h}, [x1], #16
+        sub             v7.4h, v1.4h, v2.4h
+        sub             v5.4h, v0.4h, v3.4h
+        st1             {v30.8h}, [x1]
+        add             v0.4h, v4.4h, v6.4h
+        add             v1.4h, v5.4h, v7.4h
+        sub             v2.4h, v4.4h, v6.4h
+        sub             v3.4h, v5.4h, v7.4h
+
+        movi            v16.4h, #3
+
+        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
+
+        add             v0.4h, v0.4h, v16.4h
+
+        add             v4.4h, v0.4h, v3.4h
+        add             v6.4h, v1.4h, v2.4h
+        sub             v7.4h, v1.4h, v2.4h
+        sub             v5.4h, v0.4h, v3.4h
+        add             v0.4h, v4.4h, v6.4h
+        add             v1.4h, v5.4h, v7.4h
+        sub             v2.4h, v4.4h, v6.4h
+        sub             v3.4h, v5.4h, v7.4h
+
+        sshr            v0.4h, v0.4h, #3
+        sshr            v1.4h, v1.4h, #3
+        sshr            v2.4h, v2.4h, #3
+        sshr            v3.4h, v3.4h, #3
+
+        mov             x3, #32
+        st1             {v0.h}[0], [x0], x3
+        st1             {v1.h}[0], [x0], x3
+        st1             {v2.h}[0], [x0], x3
+        st1             {v3.h}[0], [x0], x3
+        st1             {v0.h}[1], [x0], x3
+        st1             {v1.h}[1], [x0], x3
+        st1             {v2.h}[1], [x0], x3
+        st1             {v3.h}[1], [x0], x3
+        st1             {v0.h}[2], [x0], x3
+        st1             {v1.h}[2], [x0], x3
+        st1             {v2.h}[2], [x0], x3
+        st1             {v3.h}[2], [x0], x3
+        st1             {v0.h}[3], [x0], x3
+        st1             {v1.h}[3], [x0], x3
+        st1             {v2.h}[3], [x0], x3
+        st1             {v3.h}[3], [x0], x3
+
+        ret
+endfunc
+
 function ff_vp8_idct_add_neon, export=1
         ld1             {v0.8b - v3.8b}, [x1]
         mov             w4, #20091
-        movk            w4, #35468/2, lsl 16
+        movk            w4, #35468/2, lsl #16
         dup             v4.2s, w4
 
         smull           v26.4s, v1.4h, v4.h[0]
@@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1
         ret
 endfunc
 
+function ff_vp8_idct_dc_add4uv_neon, export=1
+        movi            v0.4h, #0
+        mov             x3, #32
+        ld1r            {v16.4h}, [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v17.4h}, [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v18.4h}, [x1]
+        st1             {v0.h}[0], [x1], x3
+        ld1r            {v19.4h}, [x1]
+        st1             {v0.h}[0], [x1], x3
+        ins             v16.d[1], v17.d[0]
+        ins             v18.d[1], v19.d[0]
+        mov             x3, x0
+        srshr           v16.8h, v16.8h, #3 // dc >>= 3
+        ld1             {v0.8b}, [x0], x2
+        srshr           v18.8h, v18.8h, #3
+        ld1             {v1.8b}, [x0], x2
+        uaddw           v20.8h, v16.8h, v0.8b
+        ld1             {v2.8b}, [x0], x2
+        uaddw           v0.8h, v16.8h, v1.8b
+        ld1             {v3.8b}, [x0], x2
+        uaddw           v22.8h, v16.8h, v2.8b
+        ld1             {v4.8b}, [x0], x2
+        uaddw           v2.8h, v16.8h, v3.8b
+        ld1             {v5.8b}, [x0], x2
+        uaddw           v24.8h, v18.8h, v4.8b
+        ld1             {v6.8b}, [x0], x2
+        uaddw           v4.8h, v18.8h, v5.8b
+        ld1             {v7.8b}, [x0], x2
+        uaddw           v26.8h, v18.8h, v6.8b
+        sqxtun          v20.8b, v20.8h
+        uaddw           v6.8h, v18.8h, v7.8b
+        sqxtun          v21.8b, v0.8h
+        sqxtun          v22.8b, v22.8h
+        st1             {v20.8b}, [x3], x2
+        sqxtun          v23.8b, v2.8h
+        st1             {v21.8b}, [x3], x2
+        sqxtun          v24.8b, v24.8h
+        st1             {v22.8b}, [x3], x2
+        sqxtun          v25.8b, v4.8h
+        st1             {v23.8b}, [x3], x2
+        sqxtun          v26.8b, v26.8h
+        st1             {v24.8b}, [x3], x2
+        sqxtun          v27.8b, v6.8h
+        st1             {v25.8b}, [x3], x2
+        st1             {v26.8b}, [x3], x2
+        st1             {v27.8b}, [x3], x2
+
+        ret
+endfunc
+
 function ff_vp8_idct_dc_add4y_neon, export=1
         movi            v0.16b, #0
         mov             x3, #32
@@ -581,7 +690,7 @@ function ff_put_vp8_pixels16_neon, export=1
         st1             {v1.16b}, [x0], x1
         st1             {v2.16b}, [x0], x1
         st1             {v3.16b}, [x0], x1
-        bgt             1b
+        b.gt            1b
         ret
 endfunc
 
@@ -596,7 +705,7 @@ function ff_put_vp8_pixels8_neon, export=1
         st1             {v0.d}[1], [x0], x1
         st1             {v1.8b}, [x0], x1
         st1             {v1.d}[1], [x0], x1
-        bgt             1b
+        b.gt            1b
         ret
 endfunc
 
@@ -743,7 +852,7 @@ endfunc
 
 
 // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
-// arithmatic can be used to apply filters
+// arithmetic can be used to apply filters
 const subpel_filters, align=4
         .short 0, 6, 123, 12, 1, 0, 0, 0
         .short 2, 11, 108, 36, 8, 1, 0, 0
@@ -778,7 +887,7 @@ function ff_put_vp8_epel16_v6_neon, export=1
         st1             {v1.1d - v2.1d}, [x0], x1
         st1             {v3.1d - v4.1d}, [x0], x1
         subs            x4, x4, #2
-        bne             1b
+        b.ne            1b
 
         ret
 endfunc
@@ -797,7 +906,7 @@ function ff_put_vp8_epel16_h6_neon, export=1
         st1             {v1.16b}, [x0], x1
 
         subs            w4, w4, #1
-        bne             1b
+        b.ne            1b
         ret
 endfunc
 
@@ -821,7 +930,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
         vp8_epel16_h6   v1, v1, v2
         st1             {v1.16b}, [x7], #16
         subs            x16, x16, #1
-        bne             1b
+        b.ne            1b
 
 
         // second pass (vertical):
@@ -842,7 +951,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
         st1             {v2.16b}, [x0], x1
 
         subs            x4, x4, #1
-        bne             2b
+        b.ne            2b
 
         add             sp, sp, #336+16
         ret
@@ -869,7 +978,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1
 
         st1             {v1.8b}, [x7], #8
         subs            x16, x16, #1
-        bne             1b
+        b.ne            1b
 
         // second pass (vertical):
         sxtw            x6, w6
@@ -888,7 +997,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1
         st1             {v1.8b}, [x0], x1
         st1             {v2.8b}, [x0], x1
         subs            x4, x4, #2
-        bne             2b
+        b.ne            2b
 
         add             sp, sp, #168+16
         ret
@@ -915,7 +1024,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1
 
         st1             {v1.8b}, [x7], #8
         subs            x16, x16, #1
-        bne             1b
+        b.ne            1b
 
         // second pass (vertical):
         sxtw            x6, w6
@@ -934,7 +1043,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1
         st1             {v1.8b}, [x0], x1
         st1             {v2.8b}, [x0], x1
         subs            x4, x4, #2
-        bne             2b
+        b.ne            2b
 
         add             sp, sp, #168+16
         ret
@@ -962,7 +1071,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1
 
         st1             {v1.8b}, [x7], #8
         subs            x16, x16, #1
-        bne             1b
+        b.ne            1b
 
         // second pass (vertical):
         sxtw            x6, w6
@@ -979,7 +1088,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1
         st1             {v1.d}[0], [x0], x1
         st1             {v1.d}[1], [x0], x1
         subs            x4, x4, #2
-        bne             2b
+        b.ne            2b
 
         add             sp, sp, #168+16
         ret
@@ -1007,7 +1116,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1
 
         st1             {v1.8b}, [x7], #8
         subs            x16, x16, #1
-        bne             1b
+        b.ne            1b
 
         // second pass (vertical):
         sxtw            x6, w6
@@ -1024,7 +1133,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1
         st1             {v1.d}[0], [x0], x1
         st1             {v1.d}[1], [x0], x1
         subs            x4, x4, #2
-        bne             2b
+        b.ne            2b
 
         add             sp, sp, #168+16
         ret