git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/h264idct_neon.S
avutil: remove deprecated AVClass.child_class_next
[ffmpeg] / libavcodec / aarch64 / h264idct_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
// ff_h264_idct_add_neon: 4x4 inverse transform of the coefficients at x1,
// added to a 4x4 block of 8-bit pixels at x0.
//   x0  pixel pointer; four rows of four bytes are read and then rewritten
//   x1  pointer to 16 signed 16-bit coefficients; the 32 bytes are zeroed
//       here and x1 is restored to its entry value before returning
//   w2  distance between pixel rows in bytes (sign-extended into x2 below)
// On return x0 has been advanced by four rows.
// Writes v0-v7, v16-v19 and v30; the last four arguments handed to
// transpose_4x4H are presumably its scratch registers (macro lives in neon.S).
// The .L_ label gives the dispatchers further down this file a local symbol
// whose address they load with movrel and call through blr.
25 function ff_h264_idct_add_neon, export=1
26 .L_ff_h264_idct_add_neon:
27         ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]  // v0-v3 = 16 coefficients, four per vector
28         sxtw            x2,     w2  // widen the 32-bit stride for address arithmetic
29         movi            v30.8H, #0  // zero vector used to clear the coefficients
30
31         add             v4.4H,  v0.4H,  v2.4H  // first 1-D pass: v4 = v0 + v2
32         sshr            v16.4H, v1.4H,  #1
33         st1             {v30.8H},    [x1], #16  // clear coefficient bytes 0-15
34         sshr            v17.4H, v3.4H,  #1
35         st1             {v30.8H},    [x1], #16  // clear coefficient bytes 16-31; x1 is now entry + 32
36         sub             v5.4H,  v0.4H,  v2.4H  // v5 = v0 - v2
37         sub             v6.4H,  v16.4H, v3.4H  // v6 = (v1 >> 1) - v3
38         add             v7.4H,  v1.4H,  v17.4H  // v7 = v1 + (v3 >> 1)
39         add             v0.4H,  v4.4H,  v7.4H
40         add             v1.4H,  v5.4H,  v6.4H
41         sub             v2.4H,  v5.4H,  v6.4H
42         sub             v3.4H,  v4.4H,  v7.4H
43
44         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
45
46         add             v4.4H,  v0.4H,  v2.4H  // second 1-D pass, same butterfly on the transposed data
47         ld1             {v18.S}[0], [x0], x2  // pixel row 0
48         sshr            v16.4H,  v3.4H,  #1
49         sshr            v17.4H,  v1.4H,  #1
50         ld1             {v18.S}[1], [x0], x2  // pixel row 1
51         sub             v5.4H,  v0.4H,  v2.4H
52         ld1             {v19.S}[1], [x0], x2  // pixel row 2 goes into the upper lane ...
53         add             v6.4H,  v16.4H, v1.4H  // v6 = v1 + (v3 >> 1)
54         ins             v4.D[1],  v5.D[0]  // v4 = { v0 + v2 | v0 - v2 }
55         sub             v7.4H,  v17.4H, v3.4H  // v7 = (v1 >> 1) - v3
56         ld1             {v19.S}[0], [x0], x2  // ... and row 3 into the lower lane, matching v1 below
57         ins             v6.D[1],  v7.D[0]  // v6 = { v1 + (v3 >> 1) | (v1 >> 1) - v3 }
58         sub             x0,  x0,  x2, lsl #2  // rewind x0 by four rows for the stores
59         add             v0.8H,  v4.8H,  v6.8H  // v0 = { result row 0 | result row 1 }
60         sub             v1.8H,  v4.8H,  v6.8H  // v1 = { result row 3 | result row 2 }
61
62         srshr           v0.8H,  v0.8H,  #6  // rounding shift right by 6
63         srshr           v1.8H,  v1.8H,  #6
64
65         uaddw           v0.8H,  v0.8H,  v18.8B  // add the zero-extended pixels
66         uaddw           v1.8H,  v1.8H,  v19.8B
67
68         sqxtun          v0.8B, v0.8H  // saturate to unsigned 8 bit
69         sqxtun          v1.8B, v1.8H
70
71         st1             {v0.S}[0],  [x0], x2  // row 0
72         st1             {v0.S}[1],  [x0], x2  // row 1
73         st1             {v1.S}[1],  [x0], x2  // row 2
74         st1             {v1.S}[0],  [x0], x2  // row 3
75
76         sub             x1,  x1,  #32  // undo the two post-increments so x1 is unchanged for the caller
77         ret
78 endfunc
79
// ff_h264_idct_dc_add_neon: add one rounded value, taken from the first
// coefficient, to every pixel of a 4x4 block.
//   x0  pixel pointer; four rows of four bytes are read and then rewritten
//   x1  pointer to the coefficient; that 16-bit slot is set to zero, x1 itself
//       is not modified
//   w2  distance between pixel rows in bytes (sign-extended into x2 below)
// On return x0 has been advanced by four rows. Writes w3 and v0-v4.
// The .L_ label is the local symbol the dispatchers in this file call via blr.
80 function ff_h264_idct_dc_add_neon, export=1
81 .L_ff_h264_idct_dc_add_neon:
82         sxtw            x2,  w2
83         mov             w3,       #0
84         ld1r            {v2.8H},  [x1]  // replicate the first coefficient into all eight lanes
85         strh            w3,       [x1]  // clear it in memory
86         srshr           v2.8H,  v2.8H,  #6  // rounding shift right by 6
87         ld1             {v0.S}[0],  [x0], x2  // pixel rows 0 and 1
88         ld1             {v0.S}[1],  [x0], x2
89         uaddw           v3.8H,  v2.8H,  v0.8B  // value + zero-extended pixels
90         ld1             {v1.S}[0],  [x0], x2  // pixel rows 2 and 3
91         ld1             {v1.S}[1],  [x0], x2
92         uaddw           v4.8H,  v2.8H,  v1.8B
93         sqxtun          v0.8B,  v3.8H  // saturate to unsigned 8 bit
94         sqxtun          v1.8B,  v4.8H
95         sub             x0,  x0,  x2, lsl #2  // rewind x0 by four rows for the stores
96         st1             {v0.S}[0],  [x0], x2
97         st1             {v0.S}[1],  [x0], x2
98         st1             {v1.S}[0],  [x0], x2
99         st1             {v1.S}[1],  [x0], x2
100         ret
101 endfunc
102
// ff_h264_idct_add16_neon: walk 16 consecutive 4x4 coefficient blocks and, for
// every block whose count byte is non-zero, add its inverse transform to the
// destination.
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets, one per block, added to dest
//   x2  block: coefficient blocks, 32 bytes apart
//   w3  stride, passed on to the callee in w2
//   x4  byte array indexed with scan8[i] (ff_h264_idct_add8_neon in this file
//       labels the same lookup nnzc)
// blr overwrites x30, so the return address is kept in x12. This relies on the
// two callees not touching x4-x15 and returning with x1 unchanged, as their
// code in this file does.
103 function ff_h264_idct_add16_neon, export=1
104         mov             x12, x30
105         mov             x6,  x0         // dest
106         mov             x5,  x1         // block_offset
107         mov             x1,  x2         // block
108         mov             w9,  w3         // stride
109         movrel          x7,  scan8
110         mov             x10, #16        // blocks left to process
111         movrel          x13, .L_ff_h264_idct_dc_add_neon
112         movrel          x14, .L_ff_h264_idct_add_neon
113 1:      mov             w2,  w9         // callee takes the stride in w2
114         ldrb            w3,  [x7], #1   // w3 = scan8[i]
115         ldrsw           x0,  [x5], #4   // x0 = block_offset[i]
116         ldrb            w3,  [x4,  w3,  uxtw]  // w3 = count byte for this block
117         subs            w3,  w3,  #1    // lt if the count was 0, eq if it was 1
118         b.lt            2f              // count 0: skip this block
119         ldrsh           w3,  [x1]       // first coefficient of the block
120         add             x0,  x0,  x6    // destination address for this block
121         ccmp            w3,  #0,  #4,  eq  // count == 1: compare first coefficient with 0; otherwise force Z set
122         csel            x15, x13, x14, ne  // ne only if count == 1 and first coefficient != 0: DC routine, else full transform
123         blr             x15
124 2:      subs            x10, x10, #1
125         add             x1,  x1,  #32   // next coefficient block
126         b.ne            1b
127         ret             x12
128 endfunc
129
// ff_h264_idct_add16intra_neon: same walk over 16 blocks as
// ff_h264_idct_add16_neon, with a different choice per block:
//   count byte != 0                      -> full 4x4 transform
//   count byte == 0, first coeff != 0    -> DC-only routine
//   count byte == 0, first coeff == 0    -> block skipped
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets, one per block, added to dest
//   x2  block: coefficient blocks, 32 bytes apart
//   w3  stride, passed on to the callee in w2
//   x4  byte array indexed with scan8[i] (ff_h264_idct_add8_neon in this file
//       labels the same lookup nnzc)
// The return address is kept in x12 across the blr calls; this relies on the
// callees not touching x4-x15 and returning with x1 unchanged, as their code
// in this file does.
130 function ff_h264_idct_add16intra_neon, export=1
131         mov             x12, x30
132         mov             x6,  x0         // dest
133         mov             x5,  x1         // block_offset
134         mov             x1,  x2         // block
135         mov             w9,  w3         // stride
136         movrel          x7,  scan8
137         mov             x10, #16        // blocks left to process
138         movrel          x13, .L_ff_h264_idct_dc_add_neon
139         movrel          x14, .L_ff_h264_idct_add_neon
140 1:      mov             w2,  w9         // callee takes the stride in w2
141         ldrb            w3,  [x7], #1   // w3 = scan8[i]
142         ldrsw           x0,  [x5], #4   // x0 = block_offset[i]
143         ldrb            w3,  [x4,  w3,  uxtw]  // w3 = count byte for this block
144         add             x0,  x0,  x6    // destination address for this block
145         cmp             w3,  #0         // eq if the count byte is 0
146         ldrsh           w3,  [x1]       // first coefficient (load leaves the flags alone)
147         csel            x15, x13, x14, eq  // count 0: DC routine, otherwise full transform
148         ccmp            w3,  #0,  #0,  eq  // count 0: compare first coefficient with 0; otherwise force Z clear
149         b.eq            2f              // count 0 and first coefficient 0: nothing to add
150         blr             x15
151 2:      subs            x10, x10, #1
152         add             x1,  x1,  #32   // next coefficient block
153         b.ne            1b
154         ret             x12
155 endfunc
156
// ff_h264_idct_add8_neon: process two groups of four 4x4 blocks, one group per
// destination pointer. Relative to the scan8, block_offset and block bases
// passed in, the entries used are 16-19 (written through dest[0]) and 32-35
// (written through dest[1]).
//   x0  pointer to two destination pointers
//   x1  block_offset: signed 32-bit offsets
//   x2  block: coefficient blocks, 32 bytes apart
//   w3  stride, passed on to the callee in w2
//   x4  nnzc byte array, indexed with scan8[i]
// Per block the choice is the same as in ff_h264_idct_add16intra_neon:
// non-zero nnzc -> full transform; zero nnzc but non-zero first coefficient ->
// DC routine; both zero -> skipped.
// x19 and x20 are callee-saved and therefore spilled; 0x40 bytes of stack are
// reserved although only the first 16 are written. The return address is kept
// in x12 across the blr calls, which relies on the callees leaving x4-x15 and
// x19-x20 alone, as their code in this file does.
157 function ff_h264_idct_add8_neon, export=1
158         sub             sp,  sp, #0x40
159         stp             x19, x20, [sp]
160         mov             x12, x30
161         ldp             x6,  x15, [x0]          // dest[0], dest[1]
162         add             x5,  x1,  #16*4         // block_offset + 16 entries
163         add             x9,  x2,  #16*32        // block + 16 blocks of 32 bytes
164         mov             w19, w3                 // stride
165         movrel          x13, .L_ff_h264_idct_dc_add_neon
166         movrel          x14, .L_ff_h264_idct_add_neon
167         movrel          x7,  scan8, 16          // scan8 + 16 entries
168         mov             x10, #0                 // i, relative to the three biased bases above
169         mov             x11, #16                // value i jumps to after the first four blocks
170 1:      mov             w2,  w19                // callee takes the stride in w2
171         ldrb            w3,  [x7, x10]          // scan8[i]
172         ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
173         ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
174         add             x0,  x0,  x6            // block_offset[i] + current destination pointer
175         add             x1,  x9,  x10, lsl #5   // block + i * 16 coefficients (32 bytes each)
176         cmp             w3,  #0                 // eq if nnzc is 0
177         ldrsh           w3,  [x1]               // block[i*16]
178         csel            x20, x13, x14, eq       // nnzc 0: DC routine, otherwise full transform
179         ccmp            w3,  #0,  #0,  eq       // nnzc 0: compare first coefficient with 0; otherwise force Z clear
180         b.eq            2f                      // nnzc 0 and first coefficient 0: nothing to add
181         blr             x20
182 2:      add             x10, x10, #1
183         cmp             x10, #4
184         csel            x10, x11, x10, eq     // after four blocks: i = 16
185         csel            x6,  x15, x6,  eq     // ... and switch to dest[1]
186         cmp             x10, #20
187         b.lt            1b                    // run i = 16..19, then stop
188         ldp             x19, x20, [sp]
189         add             sp,  sp,  #0x40
190         ret             x12
191 endfunc
192
// idct8x8_cols pass: one 1-D 8-point inverse transform pass, applied lane-wise
// to eight vectors of eight signed 16-bit values. Input k is called s<k> in
// the comments below.
//   pass 0: s0-s5 are expected in v24-v29; s6 and s7 are loaded here from [x1]
//           into v30/v31 and those 32 bytes are then overwritten with v19
//           (the caller in this file sets v19 to zero beforehand); x1 advances
//           by 32. Outputs 0-7 end up in v24, v25, v26, v27, v28, v29, v18, v31.
//   pass 1: s0-s5 in v24-v29, s6 in v18, s7 in v31. Outputs 0-7 end up in
//           v24-v31 in order.
// va and vb are temporary register aliases that differ between the passes so
// the shared middle section can be written once. v16, v17 and v19 are scratch.
193 .macro  idct8x8_cols    pass
194   .if \pass == 0
195         va      .req    v18
196         vb      .req    v30
197         sshr            v18.8H, v26.8H, #1
198         add             v16.8H, v24.8H, v28.8H  // v16 = s0 + s4
199         ld1             {v30.8H, v31.8H}, [x1]  // s6, s7
200         st1             {v19.8H}, [x1],  #16  // overwrite the two vectors just loaded
201         st1             {v19.8H}, [x1],  #16
202         sub             v17.8H,  v24.8H, v28.8H  // v17 = s0 - s4
203         sshr            v19.8H,  v30.8H, #1  // v19 is reused as scratch from here on
204         sub             v18.8H,  v18.8H,  v30.8H  // va  = (s2 >> 1) - s6
205         add             v19.8H,  v19.8H,  v26.8H  // v19 = (s6 >> 1) + s2
206   .else
207         va      .req    v30
208         vb      .req    v18
209         sshr            v30.8H, v26.8H, #1
210         sshr            v19.8H, v18.8H, #1
211         add             v16.8H, v24.8H, v28.8H  // v16 = s0 + s4
212         sub             v17.8H, v24.8H, v28.8H  // v17 = s0 - s4
213         sub             v30.8H, v30.8H, v18.8H  // va  = (s2 >> 1) - s6
214         add             v19.8H, v19.8H, v26.8H  // v19 = (s6 >> 1) + s2
215   .endif
216         add             v26.8H, v17.8H, va.8H  // even part: v26 = (s0 - s4) + va
217         sub             v28.8H, v17.8H, va.8H  //            v28 = (s0 - s4) - va
218         add             v24.8H, v16.8H, v19.8H  //            v24 = (s0 + s4) + v19
219         sub             vb.8H,  v16.8H, v19.8H  //            vb  = (s0 + s4) - v19
220         sub             v16.8H, v29.8H, v27.8H  // odd part, built up over the next three groups of four
221         add             v17.8H, v31.8H, v25.8H
222         sub             va.8H,  v31.8H, v25.8H
223         add             v19.8H, v29.8H, v27.8H
224         sub             v16.8H, v16.8H, v31.8H
225         sub             v17.8H, v17.8H, v27.8H
226         add             va.8H,  va.8H,  v29.8H
227         add             v19.8H, v19.8H, v25.8H
228         sshr            v25.8H, v25.8H, #1
229         sshr            v27.8H, v27.8H, #1
230         sshr            v29.8H, v29.8H, #1
231         sshr            v31.8H, v31.8H, #1
232         sub             v16.8H, v16.8H, v31.8H  // v16 = s5 - s3 - s7 - (s7 >> 1)
233         sub             v17.8H, v17.8H, v27.8H  // v17 = s1 + s7 - s3 - (s3 >> 1)
234         add             va.8H,  va.8H,  v29.8H  // va  = s7 - s1 + s5 + (s5 >> 1)
235         add             v19.8H, v19.8H, v25.8H  // v19 = s3 + s5 + s1 + (s1 >> 1)
236         sshr            v25.8H, v16.8H, #2
237         sshr            v27.8H, v17.8H, #2
238         sshr            v29.8H, va.8H,  #2
239         sshr            v31.8H, v19.8H, #2
240         sub             v19.8H, v19.8H, v25.8H  // v19 = v19 - (v16 >> 2)
241         sub             va.8H,  v27.8H, va.8H  // va  = (v17 >> 2) - va
242         add             v17.8H, v17.8H, v29.8H  // v17 = v17 + (old va >> 2)
243         add             v16.8H, v16.8H, v31.8H  // v16 = v16 + (old v19 >> 2)
244   .if \pass == 0
245         sub             v31.8H, v24.8H, v19.8H  // output 7
246         add             v24.8H, v24.8H, v19.8H  // output 0
247         add             v25.8H, v26.8H, v18.8H  // output 1 (v18 is va in this pass)
248         sub             v18.8H, v26.8H, v18.8H  // output 6, left in v18
249         add             v26.8H, v28.8H, v17.8H  // output 2
250         add             v27.8H, v30.8H, v16.8H  // output 3 (v30 is vb in this pass)
251         sub             v29.8H, v28.8H, v17.8H  // output 5
252         sub             v28.8H, v30.8H, v16.8H  // output 4
253   .else
254         sub             v31.8H, v24.8H, v19.8H  // output 7
255         add             v24.8H, v24.8H, v19.8H  // output 0
256         add             v25.8H, v26.8H, v30.8H  // output 1 (v30 is va in this pass)
257         sub             v30.8H, v26.8H, v30.8H  // output 6
258         add             v26.8H, v28.8H, v17.8H  // output 2
259         sub             v29.8H, v28.8H, v17.8H  // output 5
260         add             v27.8H, v18.8H, v16.8H  // output 3 (v18 is vb in this pass)
261         sub             v28.8H, v18.8H, v16.8H  // output 4
262   .endif
263         .unreq          va
264         .unreq          vb
265 .endm
266
// ff_h264_idct8_add_neon: 8x8 inverse transform of the coefficients at x1,
// added to an 8x8 block of 8-bit pixels at x0.
//   x0  pixel pointer; eight rows of eight bytes are read and then rewritten
//   x1  pointer to 64 signed 16-bit coefficients; all 128 bytes are zeroed
//       (96 here, the last 32 inside idct8x8_cols pass 0) and x1 is restored
//       to its entry value before returning
//   w2  distance between pixel rows in bytes (sign-extended into x2 below)
// On return x0 has been advanced by eight rows (by the loads); the stores go
// through x3. Writes x3, v0-v7, v16-v19 and v24-v31.
// The .L_ label is the local symbol ff_h264_idct8_add4_neon calls via blr.
267 function ff_h264_idct8_add_neon, export=1
268 .L_ff_h264_idct8_add_neon:
269         movi            v19.8H,   #0  // zero vector used to clear the coefficients
270         sxtw            x2,       w2
271         ld1             {v24.8H, v25.8H}, [x1]  // coefficient vectors 0, 1
272         st1             {v19.8H},  [x1],   #16  // each load is followed by clearing the same 32 bytes
273         st1             {v19.8H},  [x1],   #16
274         ld1             {v26.8H, v27.8H}, [x1]  // coefficient vectors 2, 3
275         st1             {v19.8H},  [x1],   #16
276         st1             {v19.8H},  [x1],   #16
277         ld1             {v28.8H, v29.8H}, [x1]  // coefficient vectors 4, 5
278         st1             {v19.8H},  [x1],   #16
279         st1             {v19.8H},  [x1],   #16
280
281         idct8x8_cols    0  // also loads and clears vectors 6, 7; x1 is now entry + 128
282         transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7  // pass 0 leaves its seventh output in v18, not v30
283         idct8x8_cols    1
284
285         mov             x3,  x0  // separate store pointer; x0 keeps advancing with the loads
286         srshr           v24.8H, v24.8H, #6  // rounding shift right by 6, interleaved with the pixel loads
287         ld1             {v0.8B},     [x0], x2
288         srshr           v25.8H, v25.8H, #6
289         ld1             {v1.8B},     [x0], x2
290         srshr           v26.8H, v26.8H, #6
291         ld1             {v2.8B},     [x0], x2
292         srshr           v27.8H, v27.8H, #6
293         ld1             {v3.8B},     [x0], x2
294         srshr           v28.8H, v28.8H, #6
295         ld1             {v4.8B},     [x0], x2
296         srshr           v29.8H, v29.8H, #6
297         ld1             {v5.8B},     [x0], x2
298         srshr           v30.8H, v30.8H, #6
299         ld1             {v6.8B},     [x0], x2
300         srshr           v31.8H, v31.8H, #6
301         ld1             {v7.8B},     [x0], x2
302         uaddw           v24.8H, v24.8H, v0.8B  // add the zero-extended pixels ...
303         uaddw           v25.8H, v25.8H, v1.8B
304         uaddw           v26.8H, v26.8H, v2.8B
305         sqxtun          v0.8B,  v24.8H  // ... saturate to unsigned 8 bit ...
306         uaddw           v27.8H, v27.8H, v3.8B
307         sqxtun          v1.8B,  v25.8H
308         uaddw           v28.8H, v28.8H, v4.8B
309         sqxtun          v2.8B,  v26.8H
310         st1             {v0.8B},     [x3], x2  // ... and store row by row through x3
311         uaddw           v29.8H, v29.8H, v5.8B
312         sqxtun          v3.8B,  v27.8H
313         st1             {v1.8B},     [x3], x2
314         uaddw           v30.8H, v30.8H, v6.8B
315         sqxtun          v4.8B,  v28.8H
316         st1             {v2.8B},     [x3], x2
317         uaddw           v31.8H, v31.8H, v7.8B
318         sqxtun          v5.8B,  v29.8H
319         st1             {v3.8B},     [x3], x2
320         sqxtun          v6.8B,  v30.8H
321         sqxtun          v7.8B,  v31.8H
322         st1             {v4.8B},     [x3], x2
323         st1             {v5.8B},     [x3], x2
324         st1             {v6.8B},     [x3], x2
325         st1             {v7.8B},     [x3], x2
326
327         sub             x1,  x1,  #128  // undo the post-increments so x1 is unchanged for the caller
328         ret
329 endfunc
330
// ff_h264_idct8_dc_add_neon: add one rounded value, taken from the first
// coefficient, to every pixel of an 8x8 block.
//   x0  pixel pointer; eight rows of eight bytes are read and then rewritten
//   x1  pointer to the coefficient; that 16-bit slot is set to zero, x1 itself
//       is not modified
//   w2  distance between pixel rows in bytes (sign-extended into x2 below)
// On return x0 has been advanced by eight rows. Writes w3, v0-v7 and v24-v31.
// The .L_ label is the local symbol ff_h264_idct8_add4_neon calls via blr.
331 function ff_h264_idct8_dc_add_neon, export=1
332 .L_ff_h264_idct8_dc_add_neon:
333         mov             w3,       #0
334         sxtw            x2,       w2
335         ld1r            {v31.8H}, [x1]  // replicate the first coefficient into all eight lanes
336         strh            w3,       [x1]  // clear it in memory
337         ld1             {v0.8B},  [x0], x2
338         srshr           v31.8H, v31.8H, #6  // rounding shift right by 6
339         ld1             {v1.8B},     [x0], x2
340         ld1             {v2.8B},     [x0], x2
341         uaddw           v24.8H, v31.8H, v0.8B  // value + zero-extended pixel row
342         ld1             {v3.8B},     [x0], x2
343         uaddw           v25.8H, v31.8H, v1.8B
344         ld1             {v4.8B},     [x0], x2
345         uaddw           v26.8H, v31.8H, v2.8B
346         ld1             {v5.8B},     [x0], x2
347         uaddw           v27.8H, v31.8H, v3.8B
348         ld1             {v6.8B},     [x0], x2
349         uaddw           v28.8H, v31.8H, v4.8B
350         ld1             {v7.8B},     [x0], x2
351         uaddw           v29.8H, v31.8H, v5.8B
352         uaddw           v30.8H, v31.8H, v6.8B
353         uaddw           v31.8H, v31.8H, v7.8B  // last use of the replicated value, so v31 can be overwritten
354         sqxtun          v0.8B,  v24.8H  // saturate to unsigned 8 bit
355         sqxtun          v1.8B,  v25.8H
356         sqxtun          v2.8B,  v26.8H
357         sqxtun          v3.8B,  v27.8H
358         sub             x0,  x0,  x2, lsl #3  // rewind x0 by eight rows for the stores
359         st1             {v0.8B},     [x0], x2
360         sqxtun          v4.8B,  v28.8H
361         st1             {v1.8B},     [x0], x2
362         sqxtun          v5.8B,  v29.8H
363         st1             {v2.8B},     [x0], x2
364         sqxtun          v6.8B,  v30.8H
365         st1             {v3.8B},     [x0], x2
366         sqxtun          v7.8B,  v31.8H
367         st1             {v4.8B},     [x0], x2
368         st1             {v5.8B},     [x0], x2
369         st1             {v6.8B},     [x0], x2
370         st1             {v7.8B},     [x0], x2
371         ret
372 endfunc
373
// ff_h264_idct8_add4_neon: walk four 8x8 coefficient blocks and, for every
// block whose count byte is non-zero, add its inverse transform to the
// destination. Each step advances four entries in scan8 and block_offset and
// 128 bytes in the coefficient array.
//   x0  dest base pointer (kept in x6)
//   x1  array of signed 32-bit offsets added to dest (kept in x5)
//   x2  coefficient blocks (moved to x1 for the callees)
//   w3  stride, moved to w2 once; the callees only sign-extend it into x2,
//       which leaves w2 itself unchanged
//   x4  byte array indexed with the scan8 entry (ff_h264_idct_add8_neon in
//       this file labels the same lookup nnzc)
// The return address is kept in x12 across the blr calls; this relies on the
// callees not touching x4-x15 and returning with x1 unchanged, as their code
// in this file does.
374 function ff_h264_idct8_add4_neon, export=1
375         mov             x12, x30
376         mov             x6,  x0
377         mov             x5,  x1
378         mov             x1,  x2
379         mov             w2,  w3
380         movrel          x7,  scan8
381         mov             w10, #16  // counts down by 4, so four iterations
382         movrel          x13, .L_ff_h264_idct8_dc_add_neon
383         movrel          x14, .L_ff_h264_idct8_add_neon
384 1:      ldrb            w9,  [x7], #4  // scan8 entry for this 8x8 block
385         ldrsw           x0,  [x5], #16  // its destination offset
386         ldrb            w9,  [x4, w9, UXTW]  // count byte for this block
387         subs            w9,  w9,  #1  // lt if the count was 0, eq if it was 1
388         b.lt            2f  // count 0: skip this block
389         ldrsh           w11,  [x1]  // first coefficient of the block
390         add             x0,  x6,  x0  // destination address for this block
391         ccmp            w11, #0,  #4,  eq  // count == 1: compare first coefficient with 0; otherwise force Z set
392         csel            x15, x13, x14, ne  // ne only if count == 1 and first coefficient != 0: DC routine, else full transform
393         blr             x15
394 2:      subs            w10, w10, #4
395         add             x1,  x1,  #128  // next 8x8 coefficient block
396         b.ne            1b
397         ret             x12
398 endfunc
399
// scan8: table of 48 single-byte indices. Every entry is spelled 8*row + col,
// i.e. a position in a grid that is eight entries wide. The dispatch routines
// in this file read one entry per block and use it to index the byte array
// they receive in x4.
400 const   scan8
401         .byte           8*1+4,  8*1+5,  8*2+4,  8*2+5       // entries  0- 3
402         .byte           8*1+6,  8*1+7,  8*2+6,  8*2+7       // entries  4- 7
403         .byte           8*3+4,  8*3+5,  8*4+4,  8*4+5       // entries  8-11
404         .byte           8*3+6,  8*3+7,  8*4+6,  8*4+7       // entries 12-15
405         .byte           8*6+4,  8*6+5,  8*7+4,  8*7+5       // entries 16-19
406         .byte           8*6+6,  8*6+7,  8*7+6,  8*7+7       // entries 20-23
407         .byte           8*8+4,  8*8+5,  8*9+4,  8*9+5       // entries 24-27
408         .byte           8*8+6,  8*8+7,  8*9+6,  8*9+7       // entries 28-31
409         .byte           8*11+4, 8*11+5, 8*12+4, 8*12+5      // entries 32-35
410         .byte           8*11+6, 8*11+7, 8*12+6, 8*12+7      // entries 36-39
411         .byte           8*13+4, 8*13+5, 8*14+4, 8*14+5      // entries 40-43
412         .byte           8*13+6, 8*13+7, 8*14+6, 8*14+7      // entries 44-47
413 endconst