diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 7ce3116a14d..2c3c002d54c 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,7 +22,7 @@
 #include "neon.S"
 
 const itxfm4_coeffs, align=4
- .short 11585, 6270, 15137, 0
+ .short 11585, 0, 6270, 15137
 iadst4_coeffs:
 .short 5283, 15212, 9929, 13377
 endconst
@@ -30,19 +30,19 @@ endconst
 const iadst8_coeffs, align=4
 .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
 idct_coeffs:
- .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
- .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
 .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
 .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
 endconst
 
 const iadst16_coeffs, align=4
- .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
- .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
-// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
-// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
 // in/out are .8h registers; this can do with 4 temp registers, but is
 // more efficient if 6 temp registers are available.
 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
@@ -75,6 +75,17 @@ endconst
 .endif
 .endm
 
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2. 
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 + smull \tmp1\().4s, \in1\().4h, v0.h[0] + smull2 \tmp2\().4s, \in1\().8h, v0.h[0] + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp1\().4s, #14 + rshrn2 \out2\().8h, \tmp2\().4s, #14 +.endm + // out1,out2 = in1 * coef1 - in2 * coef2 // out3,out4 = in1 * coef2 + in2 * coef1 // out are 4 x .4s registers, in are 2 x .8h registers @@ -104,6 +115,43 @@ endconst rshrn2 \inout2\().8h, \tmp4\().4s, #14 .endm +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout1\().4h, \coef1 + smull2 \tmp2\().4s, \inout1\().8h, \coef1 + smull \tmp3\().4s, \inout1\().4h, \coef2 + smull2 \tmp4\().4s, \inout1\().8h, \coef2 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout2\().4h, \coef2 + smull2 \tmp2\().4s, \inout2\().8h, \coef2 + smull \tmp3\().4s, \inout2\().4h, \coef1 + smull2 \tmp4\().4s, \inout2\().8h, \coef1 + neg \tmp1\().4s, \tmp1\().4s + neg \tmp2\().4s, \tmp2\().4s + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef + smull \out1\().4s, \in\().4h, \coef + smull2 \out2\().4s, \in\().8h, \coef +.endm + +.macro drshrn_h out, in1, in2, shift + rshrn \out\().4h, \in1\().4s, \shift + rshrn2 \out\().8h, \in2\().4s, \shift +.endm + + // out1 = in1 + in2 // out2 = in1 - in2 .macro butterfly_8h out1, out2, in1, in2 @@ -144,14 +192,14 @@ endconst .endm .macro idct4 c0, c1, c2, c3 - smull v22.4s, \c1\().4h, v0.h[2] - smull v20.4s, \c1\().4h, v0.h[1] + smull v22.4s, \c1\().4h, v0.h[3] + smull v20.4s, \c1\().4h, v0.h[2] add v16.4h, \c0\().4h, \c2\().4h sub v17.4h, \c0\().4h, \c2\().4h - smlal v22.4s, \c3\().4h, v0.h[1] + smlal v22.4s, \c3\().4h, v0.h[2] smull v18.4s, v16.4h, v0.h[0] smull v19.4s, v17.4h, v0.h[0] - smlsl v20.4s, \c3\().4h, v0.h[2] + smlsl v20.4s, \c3\().4h, v0.h[3] rshrn v22.4h, v22.4s, #14 rshrn v18.4h, v18.4s, #14 rshrn v19.4h, v19.4s, #14 @@ -177,7 +225,7 @@ endconst add v21.4s, v17.4s, v19.4s rshrn \c0\().4h, v20.4s, #14 add v16.4s, v16.4s, v17.4s - rshrn \c1\().4h, v21.4s, #14 + rshrn \c1\().4h, v21.4s, #14 sub v16.4s, v16.4s, v19.4s rshrn \c2\().4h, v18.4s, #14 rshrn \c3\().4h, v16.4s, #14 @@ -204,10 +252,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 movi v31.8h, #0 .ifc \txfm1\()_\txfm2,idct_idct - cmp x3, #1 + cmp w3, #1 b.ne 1f // DC-only for idct/idct - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -239,8 +287,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 \txfm2\()4 v4, v5, v6, v7 2: - ld1r {v0.2s}, [x0], x1 - ld1r {v1.2s}, [x0], x1 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x0], x1 .ifnc \txfm1,iwht srshr v4.4h, v4.4h, #4 srshr v5.4h, v5.4h, #4 @@ -249,8 +297,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 .endif uaddw v4.8h, v4.8h, v0.8b uaddw v5.8h, v5.8h, v1.8b - ld1r {v2.2s}, [x0], x1 - ld1r {v3.2s}, [x0], x1 + ld1 {v2.s}[0], [x0], x1 + ld1 {v3.s}[0], [x0], x1 
sqxtun v0.8b, v4.8h sqxtun v1.8b, v5.8h sub x0, x0, x1, lsl #2 @@ -278,9 +326,9 @@ itxfm_func4x4 iwht, iwht .macro idct8 dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a - dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a - dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a - dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a + dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a + dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a + dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3 butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a @@ -313,8 +361,8 @@ itxfm_func4x4 iwht, iwht dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4] neg v19.8h, v19.8h // v19 = out[3] - dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a - dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a + dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a + dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6 dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7 @@ -331,23 +379,22 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 // idct, so those always need to be loaded. .ifc \txfm1\()_\txfm2,idct_idct movrel x4, idct_coeffs - ld1 {v0.8h}, [x4] .else movrel x4, iadst8_coeffs ld1 {v1.8h}, [x4], #16 - ld1 {v0.8h}, [x4] .endif + ld1 {v0.8h}, [x4] - movi v2.16b, #0 - movi v3.16b, #0 - movi v4.16b, #0 - movi v5.16b, #0 + movi v2.8h, #0 + movi v3.8h, #0 + movi v4.8h, #0 + movi v5.8h, #0 .ifc \txfm1\()_\txfm2,idct_idct - cmp x3, #1 + cmp w3, #1 b.ne 1f // DC-only for idct/idct - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -364,11 +411,11 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 b 2f .endif 1: - ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64 - ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64 + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64 + ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64 sub x2, x2, #128 - st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64 - st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 \txfm1\()8 @@ -438,7 +485,7 @@ function idct16x16_dc_add_neon movi v1.4h, #0 - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -448,44 +495,29 @@ function idct16x16_dc_add_neon srshr v2.8h, v2.8h, #6 + mov x3, x0 mov x4, #16 1: // Loop to add the constant from v2 into all 16x16 outputs - ld1 {v3.16b}, [x0] - uaddw v4.8h, v2.8h, v3.8b - uaddw2 v5.8h, v2.8h, v3.16b - sqxtun v4.8b, v4.8h - sqxtun2 v4.16b, v5.8h - st1 {v4.16b}, [x0], x1 - subs x4, x4, #1 + subs x4, x4, #2 + ld1 {v3.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + uaddw v16.8h, v2.8h, v3.8b + uaddw2 v17.8h, v2.8h, v3.16b + uaddw v18.8h, v2.8h, v4.8b + uaddw2 v19.8h, v2.8h, v4.16b + sqxtun v3.8b, v16.8h + sqxtun2 v3.16b, v17.8h + sqxtun v4.8b, v18.8h + sqxtun2 v4.16b, v19.8h + st1 {v3.16b}, [x3], x1 + st1 {v4.16b}, [x3], x1 b.ne 1b ret endfunc -.macro idct16 - dmbutterfly0 
v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a - dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a - dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a - dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a - dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a - dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a - dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a - dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a - - butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 - butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 - butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 - butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 - butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 - butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 - butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 - butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 - - dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a - dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a - dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a - +.macro idct16_end butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6 butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5 @@ -506,40 +538,124 @@ endfunc butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] + ret .endm -.macro iadst16 +function idct16 + dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_half + dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + 
dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_quarter + dsmull_h v24, v25, v19, v1.h[7] + dsmull_h v4, v5, v17, v1.h[0] + dsmull_h v7, v6, v18, v0.h[5] + dsmull_h v30, v31, v18, v0.h[4] + neg v24.4s, v24.4s + neg v25.4s, v25.4s + dsmull_h v29, v28, v17, v1.h[1] + dsmull_h v26, v27, v19, v1.h[6] + dsmull_h v22, v23, v16, v0.h[0] + drshrn_h v24, v24, v25, #14 + drshrn_h v16, v4, v5, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v6, v30, v31, #14 + drshrn_h v29, v29, v28, #14 + drshrn_h v17, v26, v27, #14 + drshrn_h v28, v22, v23, #14 + + dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3] + dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3] + neg v22.4s, v22.4s + neg v23.4s, v23.4s + drshrn_h v27, v20, v21, #14 + drshrn_h v21, v22, v23, #14 + drshrn_h v23, v18, v19, #14 + drshrn_h v25, v30, v31, #14 + mov v4.16b, v28.16b + mov v5.16b, v28.16b + dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31 + mov v20.16b, v28.16b + idct16_end +endfunc + +function iadst16 ld1 {v0.8h,v1.8h}, [x11] dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 - dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8 + dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8 dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2 dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a - dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10 + dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10 dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a - dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4 + dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4 dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12 dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a - dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6 + dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6 dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a dmbutterfly_l v10, v11, v8, v9, 
v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14 ld1 {v0.8h}, [x10] dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a - dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8 + dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8 dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a - dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13 + dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13 dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a - dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10 + dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10 butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0 dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a - dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15 butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1 dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a @@ -547,15 +663,15 @@ endfunc butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2 butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3 - dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12 - dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15 + dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15 dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a neg v29.8h, v29.8h // v29 = out[13] - dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a - dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a + dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a + dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10 @@ -577,7 +693,8 @@ endfunc mov v16.16b, v2.16b mov v30.16b, v4.16b -.endm + ret +endfunc // Helper macros; we can't use these expressions directly within // e.g. .irp due to the extra concatenation \(). 
Therefore wrap @@ -588,26 +705,75 @@ endfunc .macro store i, dst, inc st1 {v\i\().8h}, [\dst], \inc .endm +.macro movi_v i, size, imm + movi v\i\()\size, \imm +.endm .macro load_clear i, src, inc ld1 {v\i\().8h}, [\src] st1 {v2.8h}, [\src], \inc .endm +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 + srshr \coef0, \coef0, #6 + ld1 {v2.8b}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v3.8b}, [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v4.8b}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v2.8b + ld1 {v5.8b}, [x3], x1 + uaddw \coef1, \coef1, v3.8b + srshr \coef4, \coef4, #6 + ld1 {v6.8b}, [x0], x1 + srshr \coef5, \coef5, #6 + ld1 {v7.8b}, [x3], x1 + sqxtun v2.8b, \coef0 + srshr \coef6, \coef6, #6 + sqxtun v3.8b, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef2, \coef2, v4.8b + ld1 {\tmp1}, [x0], x1 + uaddw \coef3, \coef3, v5.8b + ld1 {\tmp2}, [x3], x1 + sqxtun v4.8b, \coef2 + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.8b, \coef3 + uaddw \coef4, \coef4, v6.8b + st1 {v2.8b}, [x0], x1 + uaddw \coef5, \coef5, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v6.8b, \coef4 + st1 {v4.8b}, [x0], x1 + sqxtun v7.8b, \coef5 + st1 {v5.8b}, [x3], x1 + uaddw \coef6, \coef6, \tmp1 + st1 {v6.8b}, [x0], x1 + uaddw \coef7, \coef7, \tmp2 + st1 {v7.8b}, [x3], x1 + sqxtun \tmp1, \coef6 + sqxtun \tmp2, \coef7 + st1 {\tmp1}, [x0], x1 + st1 {\tmp2}, [x3], x1 +.endm + // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, // transpose into a horizontal 16x8 slice and store. // x0 = dst (temp buffer) -// x1 = unused +// x1 = slice offset // x2 = src -// x3 = slice offset +// x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon - mov x9, #32 + mov x14, x30 + movi v2.8h, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 load_clear \i, x2, x9 .endr - \txfm\()16 + bl \txfm\()16 // Do two 8x8 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two @@ -616,14 +782,14 @@ function \txfm\()16_1d_8x16_pass1_neon transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 // Store the transposed 8x8 blocks horizontally. 
- cmp x3, #8 + cmp x1, #8 b.eq 1f .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr - ret + br x14 1: - // Special case: For the last input column (x3 == 8), + // Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, // don't store the first 8x8 block, but keep it in registers // for the first slice of the second pass (where it is the @@ -640,7 +806,7 @@ function \txfm\()16_1d_8x16_pass1_neon mov v29.16b, v21.16b mov v30.16b, v22.16b mov v31.16b, v23.16b - ret + br x14 endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, @@ -649,8 +815,9 @@ endfunc // x1 = dst stride // x2 = src (temp buffer) // x3 = slice offset +// x9 = temp buffer stride function \txfm\()16_1d_8x16_pass2_neon - mov x9, #32 + mov x14, x30 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 load \i, x2, x9 .endr @@ -662,57 +829,12 @@ function \txfm\()16_1d_8x16_pass2_neon add x3, x0, x1 lsl x1, x1, #1 - \txfm\()16 + bl \txfm\()16 -.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 - srshr \coef0, \coef0, #6 - ld1 {v2.8b}, [x0], x1 - srshr \coef1, \coef1, #6 - ld1 {v3.8b}, [x3], x1 - srshr \coef2, \coef2, #6 - ld1 {v4.8b}, [x0], x1 - srshr \coef3, \coef3, #6 - uaddw \coef0, \coef0, v2.8b - ld1 {v5.8b}, [x3], x1 - uaddw \coef1, \coef1, v3.8b - srshr \coef4, \coef4, #6 - ld1 {v6.8b}, [x0], x1 - srshr \coef5, \coef5, #6 - ld1 {v7.8b}, [x3], x1 - sqxtun v2.8b, \coef0 - srshr \coef6, \coef6, #6 - sqxtun v3.8b, \coef1 - srshr \coef7, \coef7, #6 - uaddw \coef2, \coef2, v4.8b - ld1 {\tmp1}, [x0], x1 - uaddw \coef3, \coef3, v5.8b - ld1 {\tmp2}, [x3], x1 - sqxtun v4.8b, \coef2 - sub x0, x0, x1, lsl #2 - sub x3, x3, x1, lsl #2 - sqxtun v5.8b, \coef3 - uaddw \coef4, \coef4, v6.8b - st1 {v2.8b}, [x0], x1 - uaddw \coef5, \coef5, v7.8b - st1 {v3.8b}, [x3], x1 - sqxtun v6.8b, \coef4 - st1 {v4.8b}, [x0], x1 - sqxtun v7.8b, \coef5 - st1 {v5.8b}, [x3], x1 - uaddw \coef6, \coef6, \tmp1 - st1 {v6.8b}, [x0], x1 - uaddw \coef7, \coef7, \tmp2 - st1 {v7.8b}, [x3], x1 - sqxtun \tmp1, \coef6 - sqxtun \tmp2, \coef7 - st1 {\tmp1}, [x0], x1 - st1 {\tmp2}, [x3], x1 -.endm load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b -.purgem load_add_store - ret + br x14 endfunc .endm @@ -722,7 +844,7 @@ itxfm16_1d_funcs iadst .macro itxfm_func16x16 txfm1, txfm2 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct - cmp x3, #1 + cmp w3, #1 b.eq idct16x16_dc_add_neon .endif mov x15, x30 @@ -747,16 +869,47 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1,idct ld1 {v0.8h,v1.8h}, [x10] .endif + mov x9, #32 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #10 + b.le idct16x16_quarter_add_neon + cmp w3, #38 + b.le idct16x16_half_add_neon +.endif .irp i, 0, 8 add x0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i == 8 + cmp w3, #38 + b.le 1f +.endif +.endif + mov x1, #\i add x2, x6, #(\i*2) - mov x3, #\i bl \txfm1\()16_1d_8x16_pass1_neon .endr .ifc \txfm1\()_\txfm2,iadst_idct ld1 {v0.8h,v1.8h}, [x10] .endif + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + // Set v24-v31 to zero, for the in-register passthrough of + // coefficients to pass 2. Since we only do two slices, this can + // only ever happen for the second slice. So we only need to store + // zeros to the temp buffer for the second half of the buffer. 
+ // Move x0 to the second half, and use x9 == 32 as increment. + add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + movi_v \i, .16b, #0 + st1 {v24.8h}, [x0], x9 +.endr +3: +.endif + .irp i, 0, 8 add x0, x4, #(\i) mov x1, x5 @@ -781,6 +934,116 @@ itxfm_func16x16 iadst, idct itxfm_func16x16 idct, iadst itxfm_func16x16 iadst, iadst +function idct16_1d_8x16_pass1_quarter_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr + + bl idct16_quarter + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. + // Since only a 4x4 part of the input was nonzero, this means that + // only 4 rows are nonzero after transposing, and the second pass + // only reads the topmost 4 rows. Therefore only store the topmost + // 4 rows. + add x0, x0, #16 +.irp i, 24, 25, 26, 27 + store \i, x0, x9 +.endr + br x14 +endfunc + +function idct16_1d_8x16_pass2_quarter_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_quarter + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + br x14 +endfunc + +function idct16_1d_8x16_pass1_half_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr + + bl idct16_half + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. 
+ add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x0, x9 +.endr + br x14 +endfunc + +function idct16_1d_8x16_pass2_half_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_half + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + br x14 +endfunc + +.macro idct16_partial size +function idct16x16_\size\()_add_neon + add x0, sp, #(0*32) + add x2, x6, #(0*2) + bl idct16_1d_8x16_pass1_\size\()_neon +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl idct16_1d_8x16_pass2_\size\()_neon +.endr + + add sp, sp, #512 + br x15 +endfunc +.endm + +idct16_partial quarter +idct16_partial half function idct32x32_dc_add_neon movrel x4, idct_coeffs @@ -788,7 +1051,7 @@ function idct32x32_dc_add_neon movi v1.4h, #0 - ld1r {v2.4h}, [x2] + ld1 {v2.h}[0], [x2] smull v2.4s, v2.4h, v0.h[0] rshrn v2.4h, v2.4s, #14 smull v2.4s, v2.4h, v0.h[0] @@ -798,53 +1061,37 @@ function idct32x32_dc_add_neon srshr v0.8h, v2.8h, #6 + mov x3, x0 mov x4, #32 1: // Loop to add the constant v0 into all 32x32 outputs - ld1 {v1.16b,v2.16b}, [x0] - uaddw v3.8h, v0.8h, v1.8b - uaddw2 v4.8h, v0.8h, v1.16b - uaddw v5.8h, v0.8h, v2.8b - uaddw2 v6.8h, v0.8h, v2.16b - sqxtun v3.8b, v3.8h - sqxtun2 v3.16b, v4.8h - sqxtun v4.8b, v5.8h - sqxtun2 v4.16b, v6.8h - st1 {v3.16b,v4.16b}, [x0], x1 - subs x4, x4, #1 + subs x4, x4, #2 + ld1 {v1.16b,v2.16b}, [x0], x1 + uaddw v16.8h, v0.8h, v1.8b + uaddw2 v17.8h, v0.8h, v1.16b + ld1 {v3.16b,v4.16b}, [x0], x1 + uaddw v18.8h, v0.8h, v2.8b + uaddw2 v19.8h, v0.8h, v2.16b + uaddw v20.8h, v0.8h, v3.8b + uaddw2 v21.8h, v0.8h, v3.16b + uaddw v22.8h, v0.8h, v4.8b + uaddw2 v23.8h, v0.8h, v4.16b + sqxtun v1.8b, v16.8h + sqxtun2 v1.16b, v17.8h + sqxtun v2.8b, v18.8h + sqxtun2 v2.16b, v19.8h + sqxtun v3.8b, v20.8h + sqxtun2 v3.16b, v21.8h + st1 {v1.16b,v2.16b}, [x3], x1 + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + st1 {v3.16b,v4.16b}, [x3], x1 b.ne 1b ret endfunc -.macro idct32_odd - ld1 {v0.8h,v1.8h}, [x11] - - dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a - dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a - dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a - dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a - dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a - dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a - dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a - dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a - - ld1 {v0.8h}, [x10] - - butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 - butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 - butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 - butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 - butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 - butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 - butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 - butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 - - dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a - dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a - 
dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a - dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a - +.macro idct32_end butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18 butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a @@ -854,10 +1101,10 @@ endfunc butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29 - dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a - dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28 - dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 - dmbutterfly v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a + dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a + dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28 + dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 + dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24 butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a @@ -872,8 +1119,106 @@ endfunc dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a + ret .endm +function idct32_odd + dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_half + dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], 
v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ dsmull_h v4, v5, v16, v8.h[0]
+ dsmull_h v28, v29, v19, v8.h[7]
+ dsmull_h v30, v31, v16, v8.h[1]
+ dsmull_h v22, v23, v17, v9.h[6]
+ dsmull_h v7, v6, v17, v9.h[7]
+ dsmull_h v26, v27, v19, v8.h[6]
+ dsmull_h v20, v21, v18, v9.h[0]
+ dsmull_h v24, v25, v18, v9.h[1]
+
+ neg v28.4s, v28.4s
+ neg v29.4s, v29.4s
+ neg v7.4s, v7.4s
+ neg v6.4s, v6.4s
+
+ drshrn_h v4, v4, v5, #14
+ drshrn_h v5, v28, v29, #14
+ drshrn_h v29, v30, v31, #14
+ drshrn_h v28, v22, v23, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v31, v26, v27, #14
+ drshrn_h v6, v20, v21, #14
+ drshrn_h v30, v24, v25, #14
+
+ dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
+ dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
+ drshrn_h v23, v16, v17, #14
+ drshrn_h v24, v18, v19, #14
+ neg v20.4s, v20.4s
+ neg v21.4s, v21.4s
+ drshrn_h v27, v27, v26, #14
+ drshrn_h v20, v20, v21, #14
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
+ drshrn_h v21, v16, v17, #14
+ drshrn_h v26, v18, v19, #14
+ dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
+ drshrn_h v25, v16, v17, #14
+ neg v18.4s, v18.4s
+ neg v19.4s, v19.4s
+ drshrn_h v22, v18, v19, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
 // Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
 // a normal IDCT16 with every other input component (the even ones, with
@@ -882,22 +1227,29 @@ endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass1_neon
- ld1 {v0.8h,v1.8h}, [x10]
-
- // Double stride of the input, since we only read every other line
- mov x9, #128
- movi v4.8h, #0
+// x9 = double input stride
+function idct32_1d_8x32_pass1\suffix\()_neon
+ mov x14, x30
+ movi v2.8h, #0
 
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
 .endr
+.endif
 
- idct16
+ bl idct16\suffix
 
 // Do two 8x8 transposes. Originally, v16-v31 contain the
 // 16 rows. 
Afterwards, v16-v23 and v24-v31 contain the @@ -910,75 +1262,94 @@ function idct32_1d_8x32_pass1_neon .macro store_rev a, b // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. - rev64 v1.8h, v\b\().8h - st1 {v\a\().8h}, [x0], #16 - rev64 v0.8h, v\a\().8h - ext v1.16b, v1.16b, v1.16b, #8 - st1 {v\b\().8h}, [x0], #16 - ext v0.16b, v0.16b, v0.16b, #8 - st1 {v1.8h}, [x0], #16 - st1 {v0.8h}, [x0], #16 + rev64 v3.8h, \b + st1 {\a}, [x0], #16 + rev64 v2.8h, \a + ext v3.16b, v3.16b, v3.16b, #8 + st1 {\b}, [x0], #16 + ext v2.16b, v2.16b, v2.16b, #8 + st1 {v3.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 .endm - store_rev 16, 24 - store_rev 17, 25 - store_rev 18, 26 - store_rev 19, 27 - store_rev 20, 28 - store_rev 21, 29 - store_rev 22, 30 - store_rev 23, 31 + store_rev v16.8h, v24.8h + store_rev v17.8h, v25.8h + store_rev v18.8h, v26.8h + store_rev v19.8h, v27.8h + store_rev v20.8h, v28.8h + store_rev v21.8h, v29.8h + store_rev v22.8h, v30.8h + store_rev v23.8h, v31.8h sub x0, x0, #512 .purgem store_rev // Move x2 back to the start of the input, and move // to the first odd row +.ifb \suffix sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half + sub x2, x2, x9, lsl #3 +.endif add x2, x2, #64 - movi v4.8h, #0 + movi v2.8h, #0 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2] - st1 {v4.8h}, [x2], x9 + load_clear \i, x2, x9 .endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif - idct32_odd + bl idct32_odd\suffix - transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 - transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 + transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 + transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 // Store the registers a, b horizontally, // adding into the output first, and the mirrored, // subtracted from the output. 
.macro store_rev a, b ld1 {v4.8h}, [x0] - rev64 v1.8h, v\b\().8h - add v4.8h, v4.8h, v\a\().8h - rev64 v0.8h, v\a\().8h + rev64 v3.8h, \b + add v4.8h, v4.8h, \a + rev64 v2.8h, \a st1 {v4.8h}, [x0], #16 - ext v1.16b, v1.16b, v1.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 ld1 {v5.8h}, [x0] - ext v0.16b, v0.16b, v0.16b, #8 - add v5.8h, v5.8h, v\b\().8h + ext v2.16b, v2.16b, v2.16b, #8 + add v5.8h, v5.8h, \b st1 {v5.8h}, [x0], #16 ld1 {v6.8h}, [x0] - sub v6.8h, v6.8h, v1.8h + sub v6.8h, v6.8h, v3.8h st1 {v6.8h}, [x0], #16 ld1 {v7.8h}, [x0] - sub v7.8h, v7.8h, v0.8h + sub v7.8h, v7.8h, v2.8h st1 {v7.8h}, [x0], #16 .endm - store_rev 31, 23 - store_rev 30, 22 - store_rev 29, 21 - store_rev 28, 20 - store_rev 27, 19 - store_rev 26, 18 - store_rev 25, 17 - store_rev 24, 16 + store_rev v31.8h, v23.8h + store_rev v30.8h, v22.8h + store_rev v29.8h, v21.8h + store_rev v28.8h, v20.8h + store_rev v27.8h, v19.8h + store_rev v26.8h, v18.8h + store_rev v25.8h, v17.8h + store_rev v24.8h, v16.8h .purgem store_rev - ret + br x14 endfunc // This is mostly the same as 8x32_pass1, but without the transpose, @@ -987,65 +1358,91 @@ endfunc // x0 = dst // x1 = dst stride // x2 = src (temp buffer) -// x10 = idct_coeffs -// x11 = idct_coeffs + 32 -function idct32_1d_8x32_pass2_neon - ld1 {v0.8h,v1.8h}, [x10] - - mov x9, #128 +// x7 = negative double temp buffer stride +// x9 = double temp buffer stride +function idct32_1d_8x32_pass2\suffix\()_neon + mov x14, x30 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2], x9 + load \i, x2, x9 .endr sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif - idct16 + bl idct16\suffix - mov x9, #128 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - st1 {v\i\().8h}, [x2], x9 + store \i, x2, x9 .endr sub x2, x2, x9, lsl #4 add x2, x2, #64 // v16 = IN(1), v17 = IN(3) ... 
v31 = IN(31) +.ifb \suffix .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x2], x9 + load \i, x2, x9 .endr sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif sub x2, x2, #64 - idct32_odd + bl idct32_odd\suffix - mov x9, #128 .macro load_acc_store a, b, c, d, neg=0 +.if \neg == 0 ld1 {v4.8h}, [x2], x9 ld1 {v5.8h}, [x2], x9 -.if \neg == 0 - add v4.8h, v4.8h, v\a\().8h + add v4.8h, v4.8h, \a ld1 {v6.8h}, [x2], x9 - add v5.8h, v5.8h, v\b\().8h + add v5.8h, v5.8h, \b ld1 {v7.8h}, [x2], x9 - add v6.8h, v6.8h, v\c\().8h - add v7.8h, v7.8h, v\d\().8h + add v6.8h, v6.8h, \c + add v7.8h, v7.8h, \d .else - sub v4.8h, v4.8h, v\a\().8h - ld1 {v6.8h}, [x2], x9 - sub v5.8h, v5.8h, v\b\().8h - ld1 {v7.8h}, [x2], x9 - sub v6.8h, v6.8h, v\c\().8h - sub v7.8h, v7.8h, v\d\().8h + ld1 {v4.8h}, [x2], x7 + ld1 {v5.8h}, [x2], x7 + sub v4.8h, v4.8h, \a + ld1 {v6.8h}, [x2], x7 + sub v5.8h, v5.8h, \b + ld1 {v7.8h}, [x2], x7 + sub v6.8h, v6.8h, \c + sub v7.8h, v7.8h, \d .endif - ld1 {v0.8b}, [x0], x1 - ld1 {v1.8b}, [x0], x1 + ld1 {v10.8b}, [x0], x1 + ld1 {v11.8b}, [x0], x1 srshr v4.8h, v4.8h, #6 ld1 {v2.8b}, [x0], x1 srshr v5.8h, v5.8h, #6 - uaddw v4.8h, v4.8h, v0.8b + uaddw v4.8h, v4.8h, v10.8b ld1 {v3.8b}, [x0], x1 srshr v6.8h, v6.8h, #6 - uaddw v5.8h, v5.8h, v1.8b + uaddw v5.8h, v5.8h, v11.8b srshr v7.8h, v7.8h, #6 sub x0, x0, x1, lsl #2 uaddw v6.8h, v6.8h, v2.8b @@ -1059,31 +1456,37 @@ function idct32_1d_8x32_pass2_neon st1 {v6.8b}, [x0], x1 st1 {v7.8b}, [x0], x1 .endm - load_acc_store 31, 30, 29, 28 - load_acc_store 27, 26, 25, 24 - load_acc_store 23, 22, 21, 20 - load_acc_store 19, 18, 17, 16 + load_acc_store v31.8h, v30.8h, v29.8h, v28.8h + load_acc_store v27.8h, v26.8h, v25.8h, v24.8h + load_acc_store v23.8h, v22.8h, v21.8h, v20.8h + load_acc_store v19.8h, v18.8h, v17.8h, v16.8h sub x2, x2, x9 - neg x9, x9 - load_acc_store 16, 17, 18, 19, 1 - load_acc_store 20, 21, 22, 23, 1 - load_acc_store 24, 25, 26, 27, 1 - load_acc_store 28, 29, 30, 31, 1 + load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1 + load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1 + load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1 + load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1 .purgem load_acc_store - ret + br x14 endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half + +const min_eob_idct_idct_32, align=4 + .short 0, 34, 135, 336 +endconst function ff_vp9_idct_idct_32x32_add_neon, export=1 - cmp x3, #1 + cmp w3, #1 b.eq idct32x32_dc_add_neon movrel x10, idct_coeffs - add x11, x10, #32 + movrel x12, min_eob_idct_idct_32, 2 mov x15, x30 - stp d14, d15, [sp, #-0x10]! - stp d12, d13, [sp, #-0x10]! stp d10, d11, [sp, #-0x10]! stp d8, d9, [sp, #-0x10]! 
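The next hunk makes the 32x32 entry point dispatch on the eob value in w3:
1 means only the DC coefficient is set, an eob of at most 34 means the
nonzero coefficients fit in the top-left 8x8 quarter, and at most 135 means
they fit in the top-left 16x16 half, matching the min_eob_idct_idct_32
table above. A minimal C sketch of that dispatch, with hypothetical
function names standing in for the assembly entry points:

#include <stddef.h>
#include <stdint.h>

void idct32_dc_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);
void idct32_quarter_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);
void idct32_half_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);
void idct32_full_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);

/* eob counts the nonzero coefficients in scan order; the thresholds
 * below are the ones the assembly compares w3 against. */
void idct32_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs, int eob)
{
    if (eob == 1)            /* DC only */
        idct32_dc_add(dst, stride, coeffs);
    else if (eob <= 34)      /* nonzero coeffs within the top-left 8x8 */
        idct32_quarter_add(dst, stride, coeffs);
    else if (eob <= 135)     /* nonzero coeffs within the top-left 16x16 */
        idct32_half_add(dst, stride, coeffs);
    else
        idct32_full_add(dst, stride, coeffs);
}

The 16x16 function added earlier in the patch uses the same pattern with
thresholds 10 (quarter) and 38 (half).
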
@@ -1093,11 +1496,44 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x5, x1 mov x6, x2 + // Double stride of the input, since we only read every other line + mov x9, #128 + neg x7, x9 + + ld1 {v0.8h,v1.8h}, [x10], #32 + ld1 {v8.8h,v9.8h}, [x10] + + cmp w3, #34 + b.le idct32x32_quarter_add_neon + cmp w3, #135 + b.le idct32x32_half_add_neon + .irp i, 0, 8, 16, 24 add x0, sp, #(\i*64) +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(32 - \i)/4 + b.le 1f +.endif add x2, x6, #(\i*2) bl idct32_1d_8x32_pass1_neon .endr + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v16.8h-v19.8h}, [x0], #64 +.endr + b.ne 2b +3: .irp i, 0, 8, 16, 24 add x0, x4, #(\i) mov x1, x5 @@ -1109,8 +1545,35 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 ldp d8, d9, [sp], 0x10 ldp d10, d11, [sp], 0x10 - ldp d12, d13, [sp], 0x10 - ldp d14, d15, [sp], 0x10 br x15 endfunc + +.macro idct32_partial size +function idct32x32_\size\()_add_neon + add x0, sp, #(0*64) + add x2, x6, #(0*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.ifc \size,half + add x0, sp, #(8*64) + add x2, x6, #(8*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.endif +.irp i, 0, 8, 16, 24 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + bl idct32_1d_8x32_pass2_\size\()_neon +.endr + + add sp, sp, #2048 + + ldp d8, d9, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + + br x15 +endfunc +.endm + +idct32_partial quarter +idct32_partial half
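
For reference, a scalar C model of the Q14 butterfly arithmetic that the
dmbutterfly macros in this file vectorize, following the formulas stated
in the comments near the top of the patch; the C function names are
illustrative only, not part of FFmpeg:

#include <stdint.h>

/* Scalar equivalent of the smull + rshrn #14 pairs used throughout. */
static int16_t round_shift14(int32_t v)
{
    return (int16_t)((v + (1 << 13)) >> 14);
}

/* dmbutterfly0: both outputs scaled by 11585 (cos(pi/4) in Q14), per the
 * comment in the file:
 *   out1 = ((in1 + in2) * 11585 + (1 << 13)) >> 14
 *   out2 = ((in1 - in2) * 11585 + (1 << 13)) >> 14
 * (The NEON code does the add/sub at 16 bits, relying on the coefficient
 * ranges the codec guarantees.) */
static void butterfly0(int16_t *out1, int16_t *out2, int16_t in1, int16_t in2)
{
    *out1 = round_shift14((in1 + in2) * 11585);
    *out2 = round_shift14((in1 - in2) * 11585);
}

/* dmbutterfly: rotation by a pair of Q14 coefficients, per the comment
 * above dmbutterfly_l:
 *   out1 = in1 * coef1 - in2 * coef2
 *   out2 = in1 * coef2 + in2 * coef1
 * The dmbutterfly_h1/_h2 variants added by this patch are this operation
 * with in2 == 0 (or in1 == 0) folded in, which is what makes the
 * half/quarter eob paths cheaper. */
static void butterfly(int16_t *out1, int16_t *out2,
                      int16_t in1, int16_t in2,
                      int16_t coef1, int16_t coef2)
{
    *out1 = round_shift14(in1 * coef1 - in2 * coef2);
    *out2 = round_shift14(in1 * coef2 + in2 * coef1);
}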