]> git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/h264idct_neon.S
Merge commit '78149d6657302b58d5e46e8bc0a521ed009f86f7'
[ffmpeg] / libavcodec / aarch64 / h264idct_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
// ff_h264_idct_add_neon
//
// 4x4 inverse transform of one coefficient block; the result is rounded,
// shifted right by 6 and added to a 4x4 block of 8-bit pixels with
// saturation to 0..255.
//
// In:   x0 = destination pixels, 4 rows of 4 bytes, rows x2 bytes apart
//       x1 = 16 coefficients of 16 bits each (32 bytes, row after row)
//       w2 = row stride as a signed 32-bit value (sign-extended into x2)
// Out:  the 32 coefficient bytes are overwritten with zeros
//       x1 holds its entry value again
//       x0 ends 4 rows past its entry value
// Uses: x2, v0-v7, v16-v19, v30, plus whatever transpose_4x4H touches.
25 function ff_h264_idct_add_neon, export=1
26         ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]   // r0..r3 = the four coefficient rows
27         sxtw            x2,     w2                            // stride as a signed 64-bit offset
28         movi            v30.8H, #0                            // zeros used to clear the block
29
// First 1-D pass; the two clearing stores are interleaved with the math.
30         add             v4.4H,  v0.4H,  v2.4H    // z0 = r0 + r2
31         sshr            v16.4H, v1.4H,  #1       // r1 >> 1
32         st1             {v30.8H},    [x1], #16   // clear coefficient bytes 0..15
33         sshr            v17.4H, v3.4H,  #1       // r3 >> 1
34         st1             {v30.8H},    [x1], #16   // clear bytes 16..31; x1 is now block + 32
35         sub             v5.4H,  v0.4H,  v2.4H    // z1 = r0 - r2
36         sub             v6.4H,  v16.4H, v3.4H    // z2 = (r1 >> 1) - r3
37         add             v7.4H,  v1.4H,  v17.4H   // z3 = r1 + (r3 >> 1)
38         add             v0.4H,  v4.4H,  v7.4H    // z0 + z3
39         add             v1.4H,  v5.4H,  v6.4H    // z1 + z2
40         sub             v2.4H,  v5.4H,  v6.4H    // z1 - z2
41         sub             v3.4H,  v4.4H,  v7.4H    // z0 - z3
42
// transpose_4x4H is not defined in this file. By its name it transposes the
// 4x4 halfword matrix held in v0-v3, with v4-v7 presumably scratch
// (NOTE(review): confirm against neon.S).
43         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
44
// Second 1-D pass, interleaved with the loads of the four destination rows.
// Two 4-lane halves are packed into one 8-lane register so that the final
// add/sub each produce two output rows at once.
45         add             v4.4H,  v0.4H,  v2.4H    // z0 = c0 + c2
46         ld1             {v18.S}[0], [x0], x2     // dest row 0 -> low word of v18
47         sshr            v16.4H,  v3.4H,  #1      // c3 >> 1
48         sshr            v17.4H,  v1.4H,  #1      // c1 >> 1
49         ld1             {v18.S}[1], [x0], x2     // dest row 1 -> high word of v18
50         sub             v5.4H,  v0.4H,  v2.4H    // z1 = c0 - c2
51         ld1             {v19.S}[1], [x0], x2     // dest row 2 -> HIGH word of v19 (matches v1 layout below)
52         add             v6.4H,  v16.4H, v1.4H    // z3 = c1 + (c3 >> 1)
53         ins             v4.D[1],  v5.D[0]        // v4 = { z0 | z1 }
54         sub             v7.4H,  v17.4H, v3.4H    // z2 = (c1 >> 1) - c3
55         ld1             {v19.S}[0], [x0], x2     // dest row 3 -> low word of v19
56         ins             v6.D[1],  v7.D[0]        // v6 = { z3 | z2 }
57         sub             x0,  x0,  x2, lsl #2     // rewind dest pointer by 4 rows
58         add             v0.8H,  v4.8H,  v6.8H    // { z0+z3 | z1+z2 } = output rows 0, 1
59         sub             v1.8H,  v4.8H,  v6.8H    // { z0-z3 | z1-z2 } = output rows 3, 2
60
61         srshr           v0.8H,  v0.8H,  #6       // rounding shift: (x + 32) >> 6
62         srshr           v1.8H,  v1.8H,  #6
63
64         uaddw           v0.8H,  v0.8H,  v18.8B   // add zero-extended dest rows 0, 1
65         uaddw           v1.8H,  v1.8H,  v19.8B   // add zero-extended dest rows 3, 2
66
67         sqxtun          v0.8B, v0.8H             // saturate to unsigned 8 bit
68         sqxtun          v1.8B, v1.8H
69
70         st1             {v0.S}[0],  [x0], x2     // row 0
71         st1             {v0.S}[1],  [x0], x2     // row 1
72         st1             {v1.S}[1],  [x0], x2     // row 2 (kept in the high word)
73         st1             {v1.S}[0],  [x0], x2     // row 3 (kept in the low word)
74
75         sub             x1,  x1,  #32            // undo the two post-increments: x1 = block start again
76         ret
77 endfunc
78
// ff_h264_idct_dc_add_neon
//
// Adds one rounded value, (block[0] + 32) >> 6, to every pixel of a 4x4
// block of 8-bit pixels, saturating to 0..255, and clears block[0].
//
// In:   x0 = destination pixels, 4 rows of 4 bytes, rows x2 bytes apart
//       x1 = coefficient block; only the first 16-bit value is read
//       w2 = row stride as a signed 32-bit value (sign-extended into x2)
// Out:  the first 16-bit coefficient is set to zero, x1 is not modified
//       x0 ends 4 rows past its entry value
// Uses: x2, w3, v0-v4.
79 function ff_h264_idct_dc_add_neon, export=1
80         sxtw            x2,  w2                  // stride as a signed 64-bit offset
81         mov             w3,       #0
82         ld1r            {v2.8H},  [x1]           // block[0] replicated into all 8 lanes
83         strh            w3,       [x1]           // block[0] = 0
84         srshr           v2.8H,  v2.8H,  #6       // rounding shift: (dc + 32) >> 6
85         ld1             {v0.S}[0],  [x0], x2     // dest row 0
86         ld1             {v0.S}[1],  [x0], x2     // dest row 1
87         uaddw           v3.8H,  v2.8H,  v0.8B    // dc + rows 0, 1 (pixels zero-extended)
88         ld1             {v1.S}[0],  [x0], x2     // dest row 2
89         ld1             {v1.S}[1],  [x0], x2     // dest row 3
90         uaddw           v4.8H,  v2.8H,  v1.8B    // dc + rows 2, 3
91         sqxtun          v0.8B,  v3.8H            // saturate to unsigned 8 bit
92         sqxtun          v1.8B,  v4.8H
93         sub             x0,  x0,  x2, lsl #2     // rewind dest pointer by 4 rows
94         st1             {v0.S}[0],  [x0], x2     // row 0
95         st1             {v0.S}[1],  [x0], x2     // row 1
96         st1             {v1.S}[0],  [x0], x2     // row 2
97         st1             {v1.S}[1],  [x0], x2     // row 3
98         ret
99 endfunc
100
// ff_h264_idct_add16_neon
//
// Walks 16 consecutive 4x4 coefficient blocks and, per block, calls either
// the DC-only or the full 4x4 transform-and-add routine, or nothing.
//
// In:   x0 = destination base pointer
//       x1 = block_offset: array of signed 32-bit offsets, one per block
//       x2 = coefficient blocks, 32 bytes each, stored back to back
//       w3 = row stride
//       x4 = byte table indexed by scan8[i]
//            (presumably per-block coefficient counts - confirm with caller)
//
// For block i, with n = x4[scan8[i]]:
//       n == 0                    -> no call
//       n == 1 and block[0] != 0  -> ff_h264_idct_dc_add_neon
//       anything else             -> ff_h264_idct_add_neon
//
// The return address is parked in x12 because blr overwrites x30. The loop
// relies on both callees leaving x1, x4-x7, x9, x10 and x12-x14 intact.
101 function ff_h264_idct_add16_neon, export=1
102         mov             x12, x30        // keep the return address across the blr calls
103         mov             x6,  x0         // dest
104         mov             x5,  x1         // block_offset
105         mov             x1,  x2         // block
106         mov             w9,  w3         // stride
107         movrel          x7,  scan8      // x7 walks the scan8 byte table
108         mov             x10, #16        // blocks left to visit
109         movrel          x13, X(ff_h264_idct_dc_add_neon)
110         movrel          x14, X(ff_h264_idct_add_neon)
111 1:      mov             w2,  w9                      // stride argument for the callee
112         ldrb            w3,  [x7], #1                // scan8[i], then advance
113         ldrsw           x0,  [x5], #4                // block_offset[i], then advance
114         ldrb            w3,  [x4,  w3,  uxtw]        // n = x4[scan8[i]]
115         subs            w3,  w3,  #1                 // flags: lt if n == 0, eq if n == 1
116         b.lt            2f                           // n == 0: skip this block
117         ldrsh           w3,  [x1]                    // block[0]; the load leaves the flags alone
118         add             x0,  x0,  x6                 // dest + block_offset[i]
119         ccmp            w3,  #0,  #4,  eq            // n == 1: compare block[0] with 0; else force Z=1
120         csel            x15, x13, x14, ne            // ne only when n == 1 and block[0] != 0 -> dc routine
121         blr             x15
122 2:      subs            x10, x10, #1
123         add             x1,  x1,  #32                // next coefficient block
124         b.ne            1b
125         ret             x12
126 endfunc
127
// ff_h264_idct_add16intra_neon
//
// Same walk over 16 consecutive 4x4 coefficient blocks as
// ff_h264_idct_add16_neon, with a different per-block rule.
//
// In:   x0 = destination base pointer
//       x1 = block_offset: array of signed 32-bit offsets, one per block
//       x2 = coefficient blocks, 32 bytes each, stored back to back
//       w3 = row stride
//       x4 = byte table indexed by scan8[i]
//            (presumably per-block coefficient counts - confirm with caller)
//
// For block i, with n = x4[scan8[i]]:
//       n != 0                    -> ff_h264_idct_add_neon
//       n == 0 and block[0] != 0  -> ff_h264_idct_dc_add_neon
//       n == 0 and block[0] == 0  -> no call
//
// The return address is parked in x12 because blr overwrites x30. The loop
// relies on both callees leaving x1, x4-x7, x9, x10 and x12-x14 intact.
128 function ff_h264_idct_add16intra_neon, export=1
129         mov             x12, x30        // keep the return address across the blr calls
130         mov             x6,  x0         // dest
131         mov             x5,  x1         // block_offset
132         mov             x1,  x2         // block
133         mov             w9,  w3         // stride
134         movrel          x7,  scan8      // x7 walks the scan8 byte table
135         mov             x10, #16        // blocks left to visit
136         movrel          x13, X(ff_h264_idct_dc_add_neon)
137         movrel          x14, X(ff_h264_idct_add_neon)
138 1:      mov             w2,  w9                      // stride argument for the callee
139         ldrb            w3,  [x7], #1                // scan8[i], then advance
140         ldrsw           x0,  [x5], #4                // block_offset[i], then advance
141         ldrb            w3,  [x4,  w3,  uxtw]        // n = x4[scan8[i]]
142         add             x0,  x0,  x6                 // dest + block_offset[i]
143         cmp             w3,  #0                      // eq if n == 0
144         ldrsh           w3,  [x1]                    // block[0]; the load leaves the flags alone
145         csel            x15, x13, x14, eq            // n == 0 -> dc routine, else full transform
146         ccmp            w3,  #0,  #0,  eq            // n == 0: compare block[0] with 0; else force Z=0
147         b.eq            2f                           // n == 0 and block[0] == 0: nothing to add
148         blr             x15
149 2:      subs            x10, x10, #1
150         add             x1,  x1,  #32                // next coefficient block
151         b.ne            1b
152         ret             x12
153 endfunc
154
// ff_h264_idct_add8_neon
//
// Processes two groups of four 4x4 blocks: blocks 16..19 are added at
// offsets from the first destination pointer, blocks 32..35 at offsets from
// the second one.
//
// In:   x0 = pointer to two destination pointers (dest[0], dest[1])
//       x1 = block_offset: array of signed 32-bit offsets
//       x2 = coefficient blocks, 32 bytes each
//       w3 = row stride
//       x4 = byte table indexed by scan8[i] (nnzc in the comments below)
//
// For block i, with n = x4[scan8[i]]:
//       n != 0                    -> ff_h264_idct_add_neon
//       n == 0 and block[0] != 0  -> ff_h264_idct_dc_add_neon
//       n == 0 and block[0] == 0  -> no call
//
// x5, x7 and x9 are pre-biased by 16 entries, so the loop counter x10 runs
// over 0..3 and then 16..19 while addressing blocks 16..19 and 32..35.
// x19/x20 are saved in a 0x40-byte stack area (only the first 16 bytes are
// written) and restored before returning; the return address is kept in x12.
155 function ff_h264_idct_add8_neon, export=1
156         sub             sp,  sp, #0x40
157         stp             x19, x20, [sp]          // preserve x19/x20 for the caller
158         mov             x12, x30                // keep the return address across the blr calls
159         ldp             x6,  x15, [x0]          // dest[0], dest[1]
160         add             x5,  x1,  #16*4         // block_offset
161         add             x9,  x2,  #16*32        // block
162         mov             w19, w3                 // stride
163         movrel          x13, X(ff_h264_idct_dc_add_neon)
164         movrel          x14, X(ff_h264_idct_add_neon)
165         movrel          x7,  scan8, 16          // address of scan8 entry 16
166         mov             x10, #0                 // loop index, relative to block 16
167         mov             x11, #16                // constant used to jump the index from 4 to 16
168 1:      mov             w2,  w19                // stride argument for the callee
169         ldrb            w3,  [x7, x10]          // scan8[i]
170         ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
171         ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
172         add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
173         add             x1,  x9,  x10, lsl #5   // block + i * 16
174         cmp             w3,  #0                 // eq if the table entry is 0
175         ldrsh           w3,  [x1]               // block[i*16]
176         csel            x20, x13, x14, eq       // entry == 0 -> dc routine, else full transform
177         ccmp            w3,  #0,  #0,  eq       // entry == 0: compare block[i*16] with 0; else force Z=0
178         b.eq            2f                      // entry == 0 and block[i*16] == 0: nothing to add
179         blr             x20
180 2:      add             x10, x10, #1
181         cmp             x10, #4                 // finished the first group of four?
182         csel            x10, x11, x10, eq     // mov x10, #16
183         csel            x6,  x15, x6,  eq     // and switch to the second destination pointer
184         cmp             x10, #20
185         b.lt            1b
186         ldp             x19, x20, [sp]
187         add             sp,  sp,  #0x40
188         ret             x12
189 endfunc
190
// idct8x8_cols pass
//
// One 1-D pass of the 8-point inverse transform, applied lane by lane to
// eight vectors of eight 16-bit values (r0..r7 in, o0..o7 out).
//
// pass == 0:
//   in:   r0-r5 in v24-v29; r6/r7 are loaded here from [x1] into v30/v31.
//         Those 32 bytes are then overwritten with the entry value of v19
//         and x1 advances by 32.
//   out:  o0-o5 in v24-v29, o6 in v18, o7 in v31.
// any other pass value:
//   in:   r0-r5 in v24-v29, r6 in v18, r7 in v31.
//   out:  o0-o7 in v24-v31.
//
// Scratch: v16, v17, v19 and the two aliases va/vb (v18/v30 or v30/v18
// depending on the pass). Both aliases are released at the end.
191 .macro  idct8x8_cols    pass
192   .if \pass == 0
193         va      .req    v18
194         vb      .req    v30
195         sshr            v18.8H, v26.8H, #1          // r2 >> 1
196         add             v16.8H, v24.8H, v28.8H      // a0 = r0 + r4
197         ld1             {v30.8H, v31.8H}, [x1]      // r6, r7
198         st1             {v19.8H}, [x1],  #16        // overwrite the r6 row with v19
199         st1             {v19.8H}, [x1],  #16        // overwrite the r7 row with v19
200         sub             v17.8H,  v24.8H, v28.8H     // a2 = r0 - r4
201         sshr            v19.8H,  v30.8H, #1         // r6 >> 1
202         sub             v18.8H,  v18.8H,  v30.8H    // a4 = (r2 >> 1) - r6
203         add             v19.8H,  v19.8H,  v26.8H    // a6 = r2 + (r6 >> 1)
204   .else
205         va      .req    v30
206         vb      .req    v18
207         sshr            v30.8H, v26.8H, #1          // r2 >> 1
208         sshr            v19.8H, v18.8H, #1          // r6 >> 1
209         add             v16.8H, v24.8H, v28.8H      // a0 = r0 + r4
210         sub             v17.8H, v24.8H, v28.8H      // a2 = r0 - r4
211         sub             v30.8H, v30.8H, v18.8H      // a4 = (r2 >> 1) - r6
212         add             v19.8H, v19.8H, v26.8H      // a6 = r2 + (r6 >> 1)
213   .endif
// Even half: combine a0/a2/a4/a6 into b0/b2/b4/b6.
214         add             v26.8H, v17.8H, va.8H       // b2 = a2 + a4
215         sub             v28.8H, v17.8H, va.8H       // b4 = a2 - a4
216         add             v24.8H, v16.8H, v19.8H      // b0 = a0 + a6
217         sub             vb.8H,  v16.8H, v19.8H      // b6 = a0 - a6
// Odd half: a1/a3/a5/a7 are accumulated in v16/v17/va/v19 in three steps.
218         sub             v16.8H, v29.8H, v27.8H      // r5 - r3
219         add             v17.8H, v31.8H, v25.8H      // r7 + r1
220         sub             va.8H,  v31.8H, v25.8H      // r7 - r1
221         add             v19.8H, v29.8H, v27.8H      // r5 + r3
222         sub             v16.8H, v16.8H, v31.8H      // ... - r7
223         sub             v17.8H, v17.8H, v27.8H      // ... - r3
224         add             va.8H,  va.8H,  v29.8H      // ... + r5
225         add             v19.8H, v19.8H, v25.8H      // ... + r1
226         sshr            v25.8H, v25.8H, #1          // r1 >> 1
227         sshr            v27.8H, v27.8H, #1          // r3 >> 1
228         sshr            v29.8H, v29.8H, #1          // r5 >> 1
229         sshr            v31.8H, v31.8H, #1          // r7 >> 1
230         sub             v16.8H, v16.8H, v31.8H      // a1 = r5 - r3 - r7 - (r7 >> 1)
231         sub             v17.8H, v17.8H, v27.8H      // a3 = r7 + r1 - r3 - (r3 >> 1)
232         add             va.8H,  va.8H,  v29.8H      // a5 = r7 - r1 + r5 + (r5 >> 1)
233         add             v19.8H, v19.8H, v25.8H      // a7 = r5 + r3 + r1 + (r1 >> 1)
234         sshr            v25.8H, v16.8H, #2          // a1 >> 2
235         sshr            v27.8H, v17.8H, #2          // a3 >> 2
236         sshr            v29.8H, va.8H,  #2          // a5 >> 2
237         sshr            v31.8H, v19.8H, #2          // a7 >> 2
238         sub             v19.8H, v19.8H, v25.8H      // b7 = a7 - (a1 >> 2)
239         sub             va.8H,  v27.8H, va.8H       // b5 = (a3 >> 2) - a5
240         add             v17.8H, v17.8H, v29.8H      // b3 = a3 + (a5 >> 2)
241         add             v16.8H, v16.8H, v31.8H      // b1 = a1 + (a7 >> 2)
// Final butterflies. The two variants differ only in which physical
// register holds b5 (va) and b6 (vb), and therefore where o6 ends up.
242   .if \pass == 0
243         sub             v31.8H, v24.8H, v19.8H      // o7 = b0 - b7
244         add             v24.8H, v24.8H, v19.8H      // o0 = b0 + b7
245         add             v25.8H, v26.8H, v18.8H      // o1 = b2 + b5
246         sub             v18.8H, v26.8H, v18.8H      // o6 = b2 - b5 (left in v18)
247         add             v26.8H, v28.8H, v17.8H      // o2 = b4 + b3
248         add             v27.8H, v30.8H, v16.8H      // o3 = b6 + b1
249         sub             v29.8H, v28.8H, v17.8H      // o5 = b4 - b3
250         sub             v28.8H, v30.8H, v16.8H      // o4 = b6 - b1
251   .else
252         sub             v31.8H, v24.8H, v19.8H      // o7 = b0 - b7
253         add             v24.8H, v24.8H, v19.8H      // o0 = b0 + b7
254         add             v25.8H, v26.8H, v30.8H      // o1 = b2 + b5
255         sub             v30.8H, v26.8H, v30.8H      // o6 = b2 - b5 (left in v30)
256         add             v26.8H, v28.8H, v17.8H      // o2 = b4 + b3
257         sub             v29.8H, v28.8H, v17.8H      // o5 = b4 - b3
258         add             v27.8H, v18.8H, v16.8H      // o3 = b6 + b1
259         sub             v28.8H, v18.8H, v16.8H      // o4 = b6 - b1
260   .endif
261         .unreq          va
262         .unreq          vb
263 .endm
264
// ff_h264_idct8_add_neon
//
// 8x8 inverse transform of one coefficient block; the result is rounded,
// shifted right by 6 and added to an 8x8 block of 8-bit pixels with
// saturation to 0..255.
//
// In:   x0 = destination pixels, 8 rows of 8 bytes, rows x2 bytes apart
//       x1 = 64 coefficients of 16 bits each (128 bytes, row after row)
//       w2 = row stride as a signed 32-bit value (sign-extended into x2)
// Out:  the 128 coefficient bytes are overwritten with zeros
//       x1 holds its entry value again
//       x0 and x3 end 8 rows past the entry value of x0
// Uses: x2, x3, v0-v7, v16-v19, v24-v31, plus whatever transpose_8x8H touches.
265 function ff_h264_idct8_add_neon, export=1
266         movi            v19.8H,   #0                 // zeros used to clear the block
267         sxtw            x2,       w2                 // stride as a signed 64-bit offset
268         ld1             {v24.8H, v25.8H}, [x1]       // coefficient rows 0, 1
269         st1             {v19.8H},  [x1],   #16       // clear row 0
270         st1             {v19.8H},  [x1],   #16       // clear row 1
271         ld1             {v26.8H, v27.8H}, [x1]       // rows 2, 3
272         st1             {v19.8H},  [x1],   #16
273         st1             {v19.8H},  [x1],   #16
274         ld1             {v28.8H, v29.8H}, [x1]       // rows 4, 5
275         st1             {v19.8H},  [x1],   #16
276         st1             {v19.8H},  [x1],   #16
277
// Pass 0 also loads and clears rows 6 and 7, leaving x1 at block + 128.
// Its results sit in v24-v29, v18 and v31, which is the register order
// handed to the transpose. transpose_8x8H is not defined in this file; by
// its name it transposes the 8x8 halfword matrix, with v6/v7 presumably
// scratch (NOTE(review): confirm against neon.S).
278         idct8x8_cols    0
279         transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
280         idct8x8_cols    1
281
// Output rows are now in v24-v31. Round and shift each one while the eight
// destination rows are loaded; x3 keeps the row 0 address for the stores.
282         mov             x3,  x0
283         srshr           v24.8H, v24.8H, #6           // rounding shift: (x + 32) >> 6
284         ld1             {v0.8B},     [x0], x2        // dest row 0
285         srshr           v25.8H, v25.8H, #6
286         ld1             {v1.8B},     [x0], x2        // dest row 1
287         srshr           v26.8H, v26.8H, #6
288         ld1             {v2.8B},     [x0], x2        // dest row 2
289         srshr           v27.8H, v27.8H, #6
290         ld1             {v3.8B},     [x0], x2        // dest row 3
291         srshr           v28.8H, v28.8H, #6
292         ld1             {v4.8B},     [x0], x2        // dest row 4
293         srshr           v29.8H, v29.8H, #6
294         ld1             {v5.8B},     [x0], x2        // dest row 5
295         srshr           v30.8H, v30.8H, #6
296         ld1             {v6.8B},     [x0], x2        // dest row 6
297         srshr           v31.8H, v31.8H, #6
298         ld1             {v7.8B},     [x0], x2        // dest row 7
// Add the zero-extended pixels, saturate to unsigned 8 bit and store,
// with the three steps interleaved across rows.
299         uaddw           v24.8H, v24.8H, v0.8B
300         uaddw           v25.8H, v25.8H, v1.8B
301         uaddw           v26.8H, v26.8H, v2.8B
302         sqxtun          v0.8B,  v24.8H
303         uaddw           v27.8H, v27.8H, v3.8B
304         sqxtun          v1.8B,  v25.8H
305         uaddw           v28.8H, v28.8H, v4.8B
306         sqxtun          v2.8B,  v26.8H
307         st1             {v0.8B},     [x3], x2        // row 0
308         uaddw           v29.8H, v29.8H, v5.8B
309         sqxtun          v3.8B,  v27.8H
310         st1             {v1.8B},     [x3], x2        // row 1
311         uaddw           v30.8H, v30.8H, v6.8B
312         sqxtun          v4.8B,  v28.8H
313         st1             {v2.8B},     [x3], x2        // row 2
314         uaddw           v31.8H, v31.8H, v7.8B
315         sqxtun          v5.8B,  v29.8H
316         st1             {v3.8B},     [x3], x2        // row 3
317         sqxtun          v6.8B,  v30.8H
318         sqxtun          v7.8B,  v31.8H
319         st1             {v4.8B},     [x3], x2        // row 4
320         st1             {v5.8B},     [x3], x2        // row 5
321         st1             {v6.8B},     [x3], x2        // row 6
322         st1             {v7.8B},     [x3], x2        // row 7
323
324         sub             x1,  x1,  #128               // undo all post-increments: x1 = block start again
325         ret
326 endfunc
327
// ff_h264_idct8_dc_add_neon
//
// Adds one rounded value, (block[0] + 32) >> 6, to every pixel of an 8x8
// block of 8-bit pixels, saturating to 0..255, and clears block[0].
//
// In:   x0 = destination pixels, 8 rows of 8 bytes, rows x2 bytes apart
//       x1 = coefficient block; only the first 16-bit value is read
//       w2 = row stride as a signed 32-bit value (sign-extended into x2)
// Out:  the first 16-bit coefficient is set to zero, x1 is not modified
//       x0 ends 8 rows past its entry value
// Uses: x2, w3, v0-v7, v24-v31.
328 function ff_h264_idct8_dc_add_neon, export=1
329         mov             w3,       #0
330         sxtw            x2,       w2             // stride as a signed 64-bit offset
331         ld1r            {v31.8H}, [x1]           // block[0] replicated into all 8 lanes
332         strh            w3,       [x1]           // block[0] = 0
333         ld1             {v0.8B},  [x0], x2       // dest row 0
334         srshr           v31.8H, v31.8H, #6       // rounding shift: (dc + 32) >> 6
335         ld1             {v1.8B},     [x0], x2    // dest row 1
336         ld1             {v2.8B},     [x0], x2    // dest row 2
337         uaddw           v24.8H, v31.8H, v0.8B    // dc + row 0 (pixels zero-extended)
338         ld1             {v3.8B},     [x0], x2    // dest row 3
339         uaddw           v25.8H, v31.8H, v1.8B
340         ld1             {v4.8B},     [x0], x2    // dest row 4
341         uaddw           v26.8H, v31.8H, v2.8B
342         ld1             {v5.8B},     [x0], x2    // dest row 5
343         uaddw           v27.8H, v31.8H, v3.8B
344         ld1             {v6.8B},     [x0], x2    // dest row 6
345         uaddw           v28.8H, v31.8H, v4.8B
346         ld1             {v7.8B},     [x0], x2    // dest row 7
347         uaddw           v29.8H, v31.8H, v5.8B
348         uaddw           v30.8H, v31.8H, v6.8B
349         uaddw           v31.8H, v31.8H, v7.8B    // last use of the dc vector, so v31 can be overwritten
350         sqxtun          v0.8B,  v24.8H           // saturate to unsigned 8 bit
351         sqxtun          v1.8B,  v25.8H
352         sqxtun          v2.8B,  v26.8H
353         sqxtun          v3.8B,  v27.8H
354         sub             x0,  x0,  x2, lsl #3     // rewind dest pointer by 8 rows
355         st1             {v0.8B},     [x0], x2    // row 0
356         sqxtun          v4.8B,  v28.8H
357         st1             {v1.8B},     [x0], x2    // row 1
358         sqxtun          v5.8B,  v29.8H
359         st1             {v2.8B},     [x0], x2    // row 2
360         sqxtun          v6.8B,  v30.8H
361         st1             {v3.8B},     [x0], x2    // row 3
362         sqxtun          v7.8B,  v31.8H
363         st1             {v4.8B},     [x0], x2    // row 4
364         st1             {v5.8B},     [x0], x2    // row 5
365         st1             {v6.8B},     [x0], x2    // row 6
366         st1             {v7.8B},     [x0], x2    // row 7
367         ret
368 endfunc
369
// ff_h264_idct8_add4_neon
//
// Walks four 8x8 coefficient blocks and, per block, calls either the
// DC-only or the full 8x8 transform-and-add routine, or nothing.
//
// In:   x0 = destination base pointer
//       x1 = block_offset: array of signed 32-bit offsets; entries 0, 4, 8
//            and 12 are read
//       x2 = coefficient blocks, 128 bytes each, stored back to back
//       w3 = row stride
//       x4 = byte table indexed by scan8[i]
//            (presumably per-block coefficient counts - confirm with caller)
//
// For i in 0, 4, 8, 12, with n = x4[scan8[i]]:
//       n == 0                    -> no call
//       n == 1 and block[0] != 0  -> ff_h264_idct8_dc_add_neon
//       anything else             -> ff_h264_idct8_add_neon
//
// The stride is placed in w2 once, before the loop: both callees only
// sign-extend w2 into x2, so the value survives every call. The return
// address is parked in x12 because blr overwrites x30. The loop relies on
// both callees leaving x1, x4-x7, x10 and x12-x14 intact.
370 function ff_h264_idct8_add4_neon, export=1
371         mov             x12, x30                 // keep the return address across the blr calls
372         mov             x6,  x0                  // dest
373         mov             x5,  x1                  // block_offset
374         mov             x1,  x2                  // block
375         mov             w2,  w3                  // stride argument for the callees
376         movrel          x7,  scan8
377         mov             w10, #16                 // counts down in steps of 4: four iterations
378         movrel          x13, X(ff_h264_idct8_dc_add_neon)
379         movrel          x14, X(ff_h264_idct8_add_neon)
380 1:      ldrb            w9,  [x7], #4            // scan8[i], then skip ahead four entries
381         ldrsw           x0,  [x5], #16           // block_offset[i], then skip ahead four entries
382         ldrb            w9,  [x4, w9, UXTW]      // n = x4[scan8[i]]
383         subs            w9,  w9,  #1             // flags: lt if n == 0, eq if n == 1
384         b.lt            2f                       // n == 0: skip this block
385         ldrsh           w11,  [x1]               // block[0]; the load leaves the flags alone
386         add             x0,  x6,  x0             // dest + block_offset[i]
387         ccmp            w11, #0,  #4,  eq        // n == 1: compare block[0] with 0; else force Z=1
388         csel            x15, x13, x14, ne        // ne only when n == 1 and block[0] != 0 -> dc routine
389         blr             x15
390 2:      subs            w10, w10, #4
391         add             x1,  x1,  #128           // next 8x8 coefficient block
392         b.ne            1b
393         ret             x12
394 endfunc
395
// scan8: 48 one-byte entries, each written as column + row*8.
//
// The dispatch loops in this file use an entry as the index into the byte
// table they receive in x4:
//   ff_h264_idct_add16_neon / ff_h264_idct_add16intra_neon read entries 0-15
//   ff_h264_idct8_add4_neon reads entries 0, 4, 8 and 12
//   ff_h264_idct_add8_neon reads entries 16-19 and 32-35
396 const   scan8
397         .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8      // entries  0- 3
398         .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8      // entries  4- 7
399         .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8      // entries  8-11
400         .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8      // entries 12-15
401         .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8      // entries 16-19
402         .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8      // entries 20-23
403         .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8      // entries 24-27
404         .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8      // entries 28-31
405         .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8      // entries 32-35
406         .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8      // entries 36-39
407         .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8      // entries 40-43
408         .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8      // entries 44-47
409 endconst