git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/dcadsp_neon.S
Merge commit '9d74012761bc3ee676fe43321d5699e4877fde5b'
[ffmpeg] / libavcodec / aarch64 / dcadsp_neon.S
1 /*
2  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23
// ff_dca_lfe_fir0_neon: single-precision FIR with an 8-float input window.
// Produces 32 results stored upwards from x0 and 32 results stored
// downwards from x0 + 60 floats, i.e. 64 floats of output in total.
//
// Registers as used here (x0-x2 are the AAPCS64 argument registers):
//   x0  store pointer, advances 16 bytes per iteration
//   x1  input pointer; the 8 floats ending at the float at [x1] are read once
//   x2  coefficient stream, 8 floats per result, 256 floats consumed
//       (presumably out / in / coefs of the C prototype -- confirm against
//        the code that installs this function pointer)
//   x3  results left per direction (starts at 32, drops by 4 per pass)
//   x4  mirrored store pointer ("out2"), walks downwards
//   x7  -16, the post-index step for x4
//   v0:v1  input window in memory order
//   v2:v3  the same window with its 8 elements reversed
//   v16-v19 accumulators for the x0 stores, v20-v23 for the x4 stores
//
// Every 8-float coefficient row is multiplied against both orderings of
// the window, so one pass over the coefficients feeds both output halves.
function ff_dca_lfe_fir0_neon, export=1
        sub             x1,  x1,  #7*4          // back up 7 floats: window = 8 floats ending at the old x1
        add             x4,  x0,  #2*32*4 - 16  // out2: floats 60..63 relative to x0
        mov             x7,  #-16               // out2 steps down one vector per store
        mov             x3,  #32                // decifactor

        ld1             {v0.4s,v1.4s}, [x1]
        // reverse [-num_coeffs + 1, 0]: swap the 64-bit halves, then swap
        // the two floats inside each half; v2:v3 is v0:v1 back to front
        ext             v2.16b, v1.16b, v1.16b, #8
        ext             v3.16b, v0.16b, v0.16b, #8
        rev64           v2.4s,  v2.4s
        rev64           v3.4s,  v3.4s
1:
        // coefficient rows k (v4:v5) and k+1 (v6:v7)
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        subs            x3,  x3,  #4            // flags stay live until b.gt; nothing below sets them
        fmul            v16.4s, v2.4s,  v4.4s
        fmul            v17.4s, v2.4s,  v6.4s
        fmul            v20.4s, v0.4s,  v4.4s
        fmul            v21.4s, v0.4s,  v6.4s
        fmla            v16.4s, v3.4s,  v5.4s
        fmla            v17.4s, v3.4s,  v7.4s
        fmla            v20.4s, v1.4s,  v5.4s
        fmla            v21.4s, v1.4s,  v7.4s

        // rows k+2 and k+3 reuse v4-v7; every read of the old rows is above
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        fmul            v18.4s, v2.4s,  v4.4s
        fmul            v19.4s, v2.4s,  v6.4s
        fmul            v22.4s, v0.4s,  v4.4s
        fmul            v23.4s, v0.4s,  v6.4s
        fmla            v18.4s, v3.4s,  v5.4s
        fmla            v19.4s, v3.4s,  v7.4s
        fmla            v22.4s, v1.4s,  v5.4s
        fmla            v23.4s, v1.4s,  v7.4s

        // horizontal sums for the ascending half: v16 = { k, k+1, k+2, k+3 }
        faddp           v16.4s, v16.4s, v17.4s
        faddp           v18.4s, v18.4s, v19.4s
        faddp           v16.4s, v16.4s, v18.4s
        // descending half is packed highest index first: v20 = { k+3, k+2, k+1, k }
        faddp           v23.4s, v23.4s, v22.4s
        faddp           v21.4s, v21.4s, v20.4s
        faddp           v20.4s, v23.4s, v21.4s

        st1             {v16.4s}, [x0], #16
        st1             {v20.4s}, [x4], x7      // register post-index: the immediate form cannot step backwards
        b.gt            1b

        ret
endfunc
74
// ff_dca_lfe_fir1_neon: single-precision FIR with a 4-float input window.
// Produces 64 results stored upwards from x0 and 64 results stored
// downwards from x0 + 124 floats, i.e. 128 floats of output in total.
//
// Registers as used here (x0-x2 are the AAPCS64 argument registers):
//   x0  store pointer, advances 16 bytes per iteration
//   x1  input pointer; the 4 floats ending at the float at [x1] are read once
//   x2  coefficient stream, 4 floats per result, 256 floats consumed
//       (presumably out / in / coefs of the C prototype -- confirm against
//        the code that installs this function pointer)
//   x3  results left per direction (starts at 64, drops by 4 per pass)
//   x4  mirrored store pointer ("out2"), walks downwards
//   x7  -16, the post-index step for x4
//   v0  input window in memory order, v1 the same window reversed
//   v16-v19 accumulators for the x0 stores, v20-v23 for the x4 stores
function ff_dca_lfe_fir1_neon, export=1
        sub             x1,  x1,  #3*4          // back up 3 floats: window = 4 floats ending at the old x1
        add             x4,  x0,  #2*64*4 - 16  // out2: floats 124..127 relative to x0
        mov             x7,  #-16               // out2 steps down one vector per store
        mov             x3,  #64                // decifactor

        ld1             {v0.4s}, [x1]
        // reverse [-num_coeffs + 1, 0]: swap the 64-bit halves, then swap
        // the two floats inside each half
        ext             v1.16b, v0.16b, v0.16b, #8
        rev64           v1.4s,  v1.4s

1:
        // four coefficient rows of 4 floats: v4 = row k ... v7 = row k+3
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        subs            x3,  x3,  #4            // flags stay live until b.gt; nothing below sets them

        // reversed window -> ascending half
        fmul            v16.4s, v1.4s,  v4.4s
        fmul            v17.4s, v1.4s,  v5.4s
        fmul            v18.4s, v1.4s,  v6.4s
        fmul            v19.4s, v1.4s,  v7.4s
        // memory-order window -> descending half
        fmul            v20.4s, v0.4s,  v4.4s
        fmul            v21.4s, v0.4s,  v5.4s
        fmul            v22.4s, v0.4s,  v6.4s
        fmul            v23.4s, v0.4s,  v7.4s

        // horizontal sums: v16 = { k, k+1, k+2, k+3 }
        faddp           v16.4s, v16.4s, v17.4s
        faddp           v18.4s, v18.4s, v19.4s
        faddp           v16.4s, v16.4s, v18.4s
        // packed highest index first: v20 = { k+3, k+2, k+1, k }
        faddp           v23.4s, v23.4s, v22.4s
        faddp           v21.4s, v21.4s, v20.4s
        faddp           v20.4s, v23.4s, v21.4s

        st1             {v16.4s}, [x0], #16
        st1             {v20.4s}, [x4], x7      // register post-index: the immediate form cannot step backwards
        b.gt            1b

        ret
endfunc