]> git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/h264idct_neon.S
Merge commit '9a3202a98b2e095b54dd784c3e01a09a676fc3fa'
[ffmpeg] / libavcodec / aarch64 / h264idct_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
// ff_h264_idct_add_neon: 4x4 inverse transform of one coefficient block,
// with the result added to a 4x4 area of 8-bit pixels.
//   x0: destination pixels (4 bytes are read and written per row)
//   x1: 16 halfword coefficients (32 bytes); cleared to zero by this function
//   w2: destination stride in bytes (sign-extended to 64 bits below)
// x1 is advanced while the block is cleared and restored before returning;
// x0 is left pointing 4 rows past where it started.
// Vector registers written: v0-v7, v16-v19, v30.
25 function ff_h264_idct_add_neon, export=1
26         ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]   // four vectors of 4 coefficients each
27         sxtw            x2,     w2                            // stride arrives as a 32-bit value
28         movi            v30.8H, #0                            // zero vector used to clear the block
29
        // First pass (butterflies across the four vectors), interleaved
        // with the two 16-byte stores that zero the coefficient block:
        //   v4 = v0 + v2          v5 = v0 - v2
        //   v6 = v1 + (v3 >> 1)   v7 = (v1 >> 1) - v3
30         add             v4.4H,  v0.4H,  v2.4H
31         sshr            v16.4H, v1.4H,  #1
32         st1             {v30.8H},    [x1], #16
33         sshr            v17.4H, v3.4H,  #1
34         st1             {v30.8H},    [x1], #16
35         sub             v5.4H,  v0.4H,  v2.4H
36         add             v6.4H,  v1.4H,  v17.4H
37         sub             v7.4H,  v16.4H, v3.4H
38         add             v0.4H,  v4.4H,  v6.4H
39         add             v1.4H,  v5.4H,  v7.4H
40         sub             v2.4H,  v4.4H,  v6.4H
41         sub             v3.4H,  v5.4H,  v7.4H
42
        // transpose_4x4H comes from neon.S (not visible in this file);
        // v4-v7 are passed as its scratch registers.
        // NOTE(review): the second pass below pairs v0 with v3 and v1 with
        // v2, so it depends on the exact register order that macro produces
        // - confirm against the macro definition before touching either.
43         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
44
        // Second pass, interleaved with loading the destination rows:
        //   v4 = v0 + v3          v5 = v0 - v3
        //   v6 = (v2 >> 1) + v1   v7 = v2 - (v1 >> 1)
        // Destination rows land in v18 = {row 0, row 2} and
        // v19 = {row 3, row 1} (lane 0, lane 1).
45         add             v4.4H,  v0.4H,  v3.4H
46         ld1             {v18.S}[0], [x0], x2
47         sshr            v16.4H,  v2.4H,  #1
48         sshr            v17.4H,  v1.4H,  #1
49         ld1             {v19.S}[1], [x0], x2
50         sub             v5.4H,  v0.4H,  v3.4H
51         ld1             {v18.S}[1], [x0], x2
52         add             v6.4H,  v16.4H, v1.4H
53         ins             v4.D[1],  v5.D[0]                     // pack v4|v5 so one 8-lane op covers two rows
54         sub             v7.4H,  v2.4H,  v17.4H
55         ld1             {v19.S}[0], [x0], x2
56         ins             v6.D[1],  v7.D[0]                     // pack v6|v7 the same way
57         sub             x0,  x0,  x2, lsl #2                  // rewind x0 by 4 rows for the stores
58         add             v0.8H,  v4.8H,  v6.8H
59         sub             v1.8H,  v4.8H,  v6.8H
60
61         srshr           v0.8H,  v0.8H,  #6                    // rounding shift right by 6
62         srshr           v1.8H,  v1.8H,  #6
63
64         uaddw           v0.8H,  v0.8H,  v18.8B                // add the widened destination pixels
65         uaddw           v1.8H,  v1.8H,  v19.8B
66
67         sqxtun          v0.8B, v0.8H                          // saturate back to unsigned 8 bit
68         sqxtun          v1.8B, v1.8H
69
        // Store lanes in the same row order the destination was loaded in.
70         st1             {v0.S}[0],  [x0], x2
71         st1             {v1.S}[1],  [x0], x2
72         st1             {v0.S}[1],  [x0], x2
73         st1             {v1.S}[0],  [x0], x2
74
75         sub             x1,  x1,  #32                         // undo the two post-increments: x1 is preserved
76         ret
77 endfunc
78
// ff_h264_idct_dc_add_neon: add a single (DC) coefficient to a 4x4 area of
// 8-bit pixels.
//   x0: destination pixels (4 bytes are read and written per row)
//   x1: coefficient block; only the first halfword is read, then set to 0
//   w2: destination stride in bytes (sign-extended to 64 bits below)
// x1 itself is not modified. Writes w3, v0-v4; x0 ends 4 rows past its
// starting value.
79 function ff_h264_idct_dc_add_neon, export=1
80         sxtw            x2,  w2
81         mov             w3,       #0
82         ld1r            {v2.8H},  [x1]                // broadcast the first coefficient to all 8 lanes
83         strh            w3,       [x1]                // clear that coefficient in memory
84         srshr           v2.8H,  v2.8H,  #6            // rounding shift right by 6
85         ld1             {v0.S}[0],  [x0], x2          // rows 0 and 1 -> v0
86         ld1             {v0.S}[1],  [x0], x2
87         uaddw           v3.8H,  v2.8H,  v0.8B         // widen the pixels and add the DC value
88         ld1             {v1.S}[0],  [x0], x2          // rows 2 and 3 -> v1
89         ld1             {v1.S}[1],  [x0], x2
90         uaddw           v4.8H,  v2.8H,  v1.8B
91         sqxtun          v0.8B,  v3.8H                 // saturate back to unsigned 8 bit
92         sqxtun          v1.8B,  v4.8H
93         sub             x0,  x0,  x2, lsl #2          // rewind x0 by 4 rows for the stores
94         st1             {v0.S}[0],  [x0], x2
95         st1             {v0.S}[1],  [x0], x2
96         st1             {v1.S}[0],  [x0], x2
97         st1             {v1.S}[1],  [x0], x2
98         ret
99 endfunc
100
// ff_h264_idct_add16_neon: walk 16 consecutive 4x4 coefficient blocks and
// add every non-empty one to the destination.
//   x0: dest base pointer
//   x1: block_offset, array of signed 32-bit offsets that are added to dest
//   x2: coefficient blocks, 32 bytes apart
//   w3: stride
//   x4: byte table indexed by scan8[i] (called nnzc in the comments of
//       ff_h264_idct_add8_neon)
// Per block: table byte == 0 -> skip; table byte == 1 and first coefficient
// non-zero -> ff_h264_idct_dc_add_neon; anything else -> ff_h264_idct_add_neon.
// x1, x4-x7, x9, x10 and x12-x14 stay live across the indirect calls, so the
// two callees must leave them unchanged.
101 function ff_h264_idct_add16_neon, export=1
102         mov             x12, x30        // keep the return address; blr below overwrites x30
103         mov             x6,  x0         // dest
104         mov             x5,  x1         // block_offset
105         mov             x1,  x2         // block
106         mov             w9,  w3         // stride
107         movrel          x7,  scan8
108         mov             x10, #16        // number of blocks left
109         movrel          x13, X(ff_h264_idct_dc_add_neon)
110         movrel          x14, X(ff_h264_idct_add_neon)
111 1:      mov             w2,  w9         // stride argument for the callee
112         ldrb            w3,  [x7], #1   // scan8[i]
113         ldrsw           x0,  [x5], #4   // block_offset[i]
114         ldrb            w3,  [x4,  w3,  uxtw]   // table byte for this block
115         subs            w3,  w3,  #1
116         b.lt            2f              // table byte was 0: nothing to do
117         ldrsh           w3,  [x1]       // first coefficient of the block
118         add             x0,  x0,  x6    // dest + block_offset[i]
        // If the table byte was 1 (eq from the subs) compare the first
        // coefficient with 0, otherwise force Z=1. "ne" afterwards therefore
        // means: table byte == 1 and first coefficient != 0.
119         ccmp            w3,  #0,  #4,  eq
120         csel            x15, x13, x14, ne       // ne: DC-only routine, else full transform
121         blr             x15
122 2:      subs            x10, x10, #1
123         add             x1,  x1,  #32   // next coefficient block
124         b.ne            1b
125         ret             x12
126 endfunc
127
// ff_h264_idct_add16intra_neon: same walk over 16 4x4 blocks as
// ff_h264_idct_add16_neon, but with a different selection rule.
//   x0: dest base pointer
//   x1: block_offset, array of signed 32-bit offsets that are added to dest
//   x2: coefficient blocks, 32 bytes apart
//   w3: stride
//   x4: byte table indexed by scan8[i]
// Per block: table byte != 0 -> ff_h264_idct_add_neon; table byte == 0 and
// first coefficient != 0 -> ff_h264_idct_dc_add_neon; both zero -> skip.
// x1, x4-x7, x9, x10 and x12-x14 stay live across the indirect calls.
128 function ff_h264_idct_add16intra_neon, export=1
129         mov             x12, x30        // keep the return address; blr below overwrites x30
130         mov             x6,  x0         // dest
131         mov             x5,  x1         // block_offset
132         mov             x1,  x2         // block
133         mov             w9,  w3         // stride
134         movrel          x7,  scan8
135         mov             x10, #16        // number of blocks left
136         movrel          x13, X(ff_h264_idct_dc_add_neon)
137         movrel          x14, X(ff_h264_idct_add_neon)
138 1:      mov             w2,  w9         // stride argument for the callee
139         ldrb            w3,  [x7], #1   // scan8[i]
140         ldrsw           x0,  [x5], #4   // block_offset[i]
141         ldrb            w3,  [x4,  w3,  uxtw]   // table byte for this block
142         add             x0,  x0,  x6    // dest + block_offset[i]
143         cmp             w3,  #0
144         ldrsh           w3,  [x1]       // first coefficient (ldrsh leaves the flags alone)
145         csel            x15, x13, x14, eq       // table byte == 0: DC-only routine, else full transform
        // If the table byte was 0 compare the first coefficient with 0,
        // otherwise clear all flags (ne). "eq" afterwards therefore means:
        // table byte == 0 and first coefficient == 0.
146         ccmp            w3,  #0,  #0,  eq
147         b.eq            2f              // nothing to add for this block
148         blr             x15
149 2:      subs            x10, x10, #1
150         add             x1,  x1,  #32   // next coefficient block
151         b.ne            1b
152         ret             x12
153 endfunc
154
// ff_h264_idct_add8_neon: process two groups of four 4x4 blocks, block
// indices 16..19 into dest[0] and 32..35 into dest[1].
//   x0: pointer to two destination pointers (dest[0], dest[1])
//   x1: block_offset, array of signed 32-bit offsets
//   x2: coefficient blocks, 32 bytes apart
//   w3: stride
//   x4: byte table indexed by scan8[i]
// Selection rule per block is the same as in ff_h264_idct_add16intra_neon:
// table byte != 0 -> ff_h264_idct_add_neon; table byte == 0 and first
// coefficient != 0 -> ff_h264_idct_dc_add_neon; both zero -> skip.
// x10 counts relative to index 16: it runs 0..3, jumps to 16 and runs 16..19.
// x19/x20 are callee-saved and are spilled to a 0x40-byte stack frame.
155 function ff_h264_idct_add8_neon, export=1
156         sub             sp,  sp, #0x40
157         stp             x19, x20, [sp]
158         mov             x12, x30                // keep the return address; blr below overwrites x30
159         ldp             x6,  x15, [x0]          // dest[0], dest[1]
160         add             x5,  x1,  #16*4         // block_offset
161         add             x9,  x2,  #16*32        // block
162         mov             w19, w3                 // stride
163         movrel          x13, X(ff_h264_idct_dc_add_neon)
164         movrel          x14, X(ff_h264_idct_add_neon)
165         movrel          x7,  scan8+16
166         mov             x10, #0
167         mov             x11, #16                // value x10 jumps to after the first group
168 1:      mov             w2,  w19
169         ldrb            w3,  [x7, x10]          // scan8[i]
170         ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
171         ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
172         add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
173         add             x1,  x9,  x10, lsl #5   // block + i * 16
174         cmp             w3,  #0
175         ldrsh           w3,  [x1]               // block[i*16]
176         csel            x20, x13, x14, eq       // nnzc == 0: DC-only routine, else full transform
177         ccmp            w3,  #0,  #0,  eq       // eq afterwards: nnzc == 0 and block[i*16] == 0
178         b.eq            2f
179         blr             x20
180 2:      add             x10, x10, #1
181         cmp             x10, #4
182         csel            x10, x11, x10, eq     // mov x10, #16
183         csel            x6,  x15, x6,  eq     // switch to dest[1] for the second group
184         cmp             x10, #20
185         b.lt            1b
186         ldp             x19, x20, [sp]
187         add             sp,  sp,  #0x40
188         ret             x12
189 endfunc
190
// idct8x8_cols pass: one 1-D pass of the 8x8 inverse transform over eight
// vectors of 8 halfwords each.
//   pass 0: inputs 0..5 are in v24..v29; inputs 6 and 7 are loaded here from
//           the 32 bytes at x1 into v30/v31, and those 32 bytes are then
//           overwritten with v19 (x1 advances by 32). v19 must therefore
//           still hold zero on entry.
//           Outputs: v24, v25, v26, v27, v28, v29, v18, v31 (output 6 is in
//           v18, not v30).
//   pass 1: inputs are v24..v29, v18 (input 6) and v31.
//           Outputs: v24..v31 in order.
// v16, v17 and v19 are scratch in both passes; va/vb are temporary aliases
// that select v18/v30 according to the pass and are released at the end.
191 .macro  idct8x8_cols    pass
192   .if \pass == 0
193         va      .req    v18
194         vb      .req    v30
195         sshr            v18.8H, v26.8H, #1
196         add             v16.8H, v24.8H, v28.8H
197         ld1             {v30.8H, v31.8H}, [x1]          // last two input vectors
198         st1             {v19.8H}, [x1],  #16            // clear them in memory (v19 == 0 here)
199         st1             {v19.8H}, [x1],  #16
200         sub             v17.8H,  v24.8H, v28.8H
201         sshr            v19.8H,  v30.8H, #1             // from here on v19 is a temporary
202         sub             v18.8H,  v18.8H,  v30.8H
203         add             v19.8H,  v19.8H,  v26.8H
204   .else
205         va      .req    v30
206         vb      .req    v18
207         sshr            v30.8H, v26.8H, #1
208         sshr            v19.8H, v18.8H, #1
209         add             v16.8H, v24.8H, v28.8H
210         sub             v17.8H, v24.8H, v28.8H
211         sub             v30.8H, v30.8H, v18.8H
212         add             v19.8H, v19.8H, v26.8H
213   .endif
        // Part built from the even-numbered inputs (v24, v26, v28 and input 6):
        // combine the sums/differences prepared above.
214         add             v26.8H, v17.8H, va.8H
215         sub             v28.8H, v17.8H, va.8H
216         add             v24.8H, v16.8H, v19.8H
217         sub             vb.8H,  v16.8H, v19.8H
        // Part built from the odd-numbered inputs (v25, v27, v29, v31).
218         sub             v16.8H, v29.8H, v27.8H
219         add             v17.8H, v31.8H, v25.8H
220         sub             va.8H,  v31.8H, v25.8H
221         add             v19.8H, v29.8H, v27.8H
222         sub             v16.8H, v16.8H, v31.8H
223         sub             v17.8H, v17.8H, v27.8H
224         add             va.8H,  va.8H,  v29.8H
225         add             v19.8H, v19.8H, v25.8H
226         sshr            v25.8H, v25.8H, #1
227         sshr            v27.8H, v27.8H, #1
228         sshr            v29.8H, v29.8H, #1
229         sshr            v31.8H, v31.8H, #1
230         sub             v16.8H, v16.8H, v31.8H
231         sub             v17.8H, v17.8H, v27.8H
232         add             va.8H,  va.8H,  v29.8H
233         add             v19.8H, v19.8H, v25.8H
234         sshr            v25.8H, v16.8H, #2
235         sshr            v27.8H, v17.8H, #2
236         sshr            v29.8H, va.8H,  #2
237         sshr            v31.8H, v19.8H, #2
238         sub             v19.8H, v19.8H, v25.8H
239         sub             va.8H,  v27.8H, va.8H
240         add             v17.8H, v17.8H, v29.8H
241         add             v16.8H, v16.8H, v31.8H
        // Final butterflies between the two parts. The two variants differ
        // only in which of v18/v30 plays the va/vb role.
242   .if \pass == 0
243         sub             v31.8H, v24.8H, v19.8H
244         add             v24.8H, v24.8H, v19.8H
245         add             v25.8H, v26.8H, v18.8H
246         sub             v18.8H, v26.8H, v18.8H          // output 6 of pass 0 ends up in v18
247         add             v26.8H, v28.8H, v17.8H
248         add             v27.8H, v30.8H, v16.8H
249         sub             v29.8H, v28.8H, v17.8H
250         sub             v28.8H, v30.8H, v16.8H
251   .else
252         sub             v31.8H, v24.8H, v19.8H
253         add             v24.8H, v24.8H, v19.8H
254         add             v25.8H, v26.8H, v30.8H
255         sub             v30.8H, v26.8H, v30.8H          // output 6 of pass 1 ends up in v30
256         add             v26.8H, v28.8H, v17.8H
257         sub             v29.8H, v28.8H, v17.8H
258         add             v27.8H, v18.8H, v16.8H
259         sub             v28.8H, v18.8H, v16.8H
260   .endif
261         .unreq          va
262         .unreq          vb
263 .endm
264
// ff_h264_idct8_add_neon: 8x8 inverse transform of one coefficient block,
// with the result added to an 8x8 area of 8-bit pixels.
//   x0: destination pixels (8 bytes are read and written per row)
//   x1: 64 halfword coefficients (128 bytes); cleared to zero here
//   w2: destination stride in bytes
// x1 is advanced by 128 while the block is cleared (96 bytes here, 32 in
// pass 0 of idct8x8_cols) and restored before returning. w2 keeps its
// value. Writes x0, x2 (sign-extended), x3, v0-v7, v16-v19 and v24-v31.
function ff_h264_idct8_add_neon, export=1
        movi            v19.8H,   #0
        // stride is a 32-bit argument; the upper half of x2 is not guaranteed
        // by the caller (ff_h264_idct8_add4_neon passes it zero-extended), so
        // sign-extend before using x2 as a 64-bit post-index register, like
        // the other leaf functions in this file do.
        sxtw            x2,       w2
        // Load the first six vectors while overwriting them with zero; the
        // remaining two are loaded and cleared by pass 0 of idct8x8_cols,
        // which needs v19 to still be zero at that point.
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16

        idct8x8_cols    0
        // Pass 0 leaves its seventh output in v18, hence v18 in this list.
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        // x0 walks the destination for loads, x3 follows for the stores.
        mov             x3,  x0
        srshr           v24.8H, v24.8H, #6      // rounding shift right by 6
        ld1             {v0.8B},     [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},     [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},     [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},     [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},     [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},     [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},     [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},     [x0], x2
        // Widen the destination pixels, add the residual, saturate back to
        // unsigned 8 bit and store, one row per register.
        uaddw           v24.8H, v24.8H, v0.8B
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},     [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},     [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},     [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},     [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x3], x2
        st1             {v5.8B},     [x3], x2
        st1             {v6.8B},     [x3], x2
        st1             {v7.8B},     [x3], x2

        sub             x1,  x1,  #128          // undo all post-increments: x1 is preserved
        ret
endfunc
326
// ff_h264_idct8_dc_add_neon: add a single (DC) coefficient to an 8x8 area of
// 8-bit pixels.
//   x0: destination pixels (8 bytes are read and written per row)
//   x1: coefficient block; only the first halfword is read, then set to 0
//   w2: destination stride in bytes (sign-extended to 64 bits below)
// x1 itself is not modified. Writes w3, v0-v7 and v24-v31; x0 ends 8 rows
// past its starting value.
327 function ff_h264_idct8_dc_add_neon, export=1
328         mov             w3,       #0
329         sxtw            x2,       w2
330         ld1r            {v31.8H}, [x1]                // broadcast the first coefficient to all 8 lanes
331         strh            w3,       [x1]                // clear that coefficient in memory
332         ld1             {v0.8B},  [x0], x2
333         srshr           v31.8H, v31.8H, #6            // rounding shift right by 6
        // Load the remaining rows, widening each one and adding the DC value
        // as soon as it is available.
334         ld1             {v1.8B},     [x0], x2
335         ld1             {v2.8B},     [x0], x2
336         uaddw           v24.8H, v31.8H, v0.8B
337         ld1             {v3.8B},     [x0], x2
338         uaddw           v25.8H, v31.8H, v1.8B
339         ld1             {v4.8B},     [x0], x2
340         uaddw           v26.8H, v31.8H, v2.8B
341         ld1             {v5.8B},     [x0], x2
342         uaddw           v27.8H, v31.8H, v3.8B
343         ld1             {v6.8B},     [x0], x2
344         uaddw           v28.8H, v31.8H, v4.8B
345         ld1             {v7.8B},     [x0], x2
346         uaddw           v29.8H, v31.8H, v5.8B
347         uaddw           v30.8H, v31.8H, v6.8B
348         uaddw           v31.8H, v31.8H, v7.8B         // last use of the DC value; v31 is reused for row 7
349         sqxtun          v0.8B,  v24.8H                // saturate back to unsigned 8 bit
350         sqxtun          v1.8B,  v25.8H
351         sqxtun          v2.8B,  v26.8H
352         sqxtun          v3.8B,  v27.8H
353         sub             x0,  x0,  x2, lsl #3          // rewind x0 by 8 rows for the stores
354         st1             {v0.8B},     [x0], x2
355         sqxtun          v4.8B,  v28.8H
356         st1             {v1.8B},     [x0], x2
357         sqxtun          v5.8B,  v29.8H
358         st1             {v2.8B},     [x0], x2
359         sqxtun          v6.8B,  v30.8H
360         st1             {v3.8B},     [x0], x2
361         sqxtun          v7.8B,  v31.8H
362         st1             {v4.8B},     [x0], x2
363         st1             {v5.8B},     [x0], x2
364         st1             {v6.8B},     [x0], x2
365         st1             {v7.8B},     [x0], x2
366         ret
367 endfunc
368
// ff_h264_idct8_add4_neon: walk four 8x8 coefficient blocks (indices 0, 4, 8
// and 12) and add every non-empty one to the destination.
//   x0: dest base pointer
//   x1: array of signed 32-bit offsets added to dest; every 4th entry is used
//   x2: coefficient blocks, 128 bytes apart
//   w3: stride
//   x4: byte table indexed by scan8[i]
// Per block: table byte == 0 -> skip; table byte == 1 and first coefficient
// non-zero -> ff_h264_idct8_dc_add_neon; anything else ->
// ff_h264_idct8_add_neon.
// w2 is written once before the loop and has to survive the calls. The upper
// half of x2 is zero after that mov, so a callee that needs a 64-bit stride
// has to sign-extend w2 itself.
// x1, x4-x7, w10 and x12-x14 stay live across the indirect calls.
369 function ff_h264_idct8_add4_neon, export=1
370         mov             x12, x30        // keep the return address; blr below overwrites x30
371         mov             x6,  x0         // dest
372         mov             x5,  x1         // offset array
373         mov             x1,  x2         // coefficient blocks
374         mov             w2,  w3         // stride argument for the callees
375         movrel          x7,  scan8
376         mov             w10, #16        // counts down in steps of 4
377         movrel          x13, X(ff_h264_idct8_dc_add_neon)
378         movrel          x14, X(ff_h264_idct8_add_neon)
379 1:      ldrb            w9,  [x7], #4   // scan8[i], i advances by 4
380         ldrsw           x0,  [x5], #16  // offset[i], i advances by 4
381         ldrb            w9,  [x4, w9, UXTW]     // table byte for this block
382         subs            w9,  w9,  #1
383         b.lt            2f              // table byte was 0: nothing to do
384         ldrsh           w11,  [x1]      // first coefficient of the block
385         add             x0,  x6,  x0    // dest + offset[i]
        // If the table byte was 1 (eq from the subs) compare the first
        // coefficient with 0, otherwise force Z=1. "ne" afterwards therefore
        // means: table byte == 1 and first coefficient != 0.
386         ccmp            w11, #0,  #4,  eq
387         csel            x15, x13, x14, ne       // ne: DC-only routine, else full transform
388         blr             x15
389 2:      subs            w10, w10, #4
390         add             x1,  x1,  #128  // next 8x8 coefficient block
391         b.ne            1b
392         ret             x12
393 endfunc
394
// scan8: 48-byte lookup table. scan8[i] is the byte index that the looping
// functions in this file use into the table they receive in x4.
// Every entry is written as a + b*8, with a in 4..7.
// NOTE(review): this looks like (column + row*8) addressing into a table
// with 8 entries per row - confirm against the C-side definition.
// Entries 0..15 are read by ff_h264_idct_add16_neon and
// ff_h264_idct_add16intra_neon, entries 0/4/8/12 by ff_h264_idct8_add4_neon,
// and entries 16..19 and 32..35 by ff_h264_idct_add8_neon.
395 const   scan8
        // entries 0..15
396         .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
397         .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
398         .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
399         .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        // entries 16..31
400         .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
401         .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
402         .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
403         .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        // entries 32..47
404         .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
405         .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
406         .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
407         .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
408 endconst