git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/h264dsp_neon.S
avutil: remove deprecated AVClass.child_class_next
[ffmpeg] / libavcodec / aarch64 / h264dsp_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "libavutil/aarch64/asm.S"
24 #include "neon.S"
25
// Common entry check for the clipped (non-intra) loop filters.
//
// In:   w2 = alpha, w3 = beta
//       x4 = address of four signed clip bytes (read with one 32-bit load)
// Out:  v24.s[0] = the four clip bytes, w6 = the same 32-bit value
// Clobbers: w6, w8, condition flags
//
// Executes "ret" (returning to the caller of the enclosing function) when
// alpha == 0, when beta == 0, or when all four clip bytes are negative.
// Otherwise execution continues after the macro (label 2).
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // When alpha == 0 the "ne" condition fails and ccmp loads NZCV from
        // its immediate. That immediate must be 4 (Z set) so the b.eq below
        // also catches alpha == 0; with 0 only beta == 0 was caught.
        ccmp            w3,  #0, #4, ne
        mov             v24.s[0], w6
        and             w8,  w6,  w6,  lsl #16
        b.eq            1f                              // alpha == 0 || beta == 0
        ands            w8,  w8,  w8,  lsl #8           // bit 31 = AND of the four sign bits
        b.ge            2f                              // some clip byte is >= 0
1:
        ret
2:
.endm
39
// Clipped luma edge filter core, 16 pixels per invocation.
//
// In:   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//       w2 = alpha, w3 = beta
//       v24.s[0] = four clip bytes (as left by h264_loop_filter_start)
// Out:  v17 = filtered p1, v16 = filtered p0,
//       v0  = filtered q0, v19 = filtered q1
// Clobbers: x7, v4, v20-v24, v28, v30
//
// Jumps to local label 9, which the including function must provide, when
// no lane passes the alpha/beta/clip tests. The instruction order is
// interleaved on purpose; the comments describe each value where it is
// produced.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b                 // uxtl/uxtl/sli/sli: replicate each
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h                 //   clip byte over 4 adjacent lanes
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes whose clip byte is negative
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // drop lanes with negative clip
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // v21 = per-lane filter mask
        shrn            v30.8b,  v21.8h,  #4            // squeeze the mask into 64 bits
        mov             x7,  v30.d[0]
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7,  9f                         // nothing to filter
        and             v17.16b, v17.16b, v21.16b       // p1 update mask
        and             v19.16b, v19.16b, v21.16b       // q1 update mask
        and             v24.16b, v24.16b, v21.16b       // clip, zero on unfiltered lanes
        urhadd          v28.16b, v16.16b, v0.16b        // (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // masks are -1, so subtracting
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + clip
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
        sub             v21.16b, v21.16b, v19.16b       //   them adds 1: v21 = delta bound
        uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - clip
        uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + clip
        umax            v23.16b, v23.16b, v22.16b       // candidate p1, within p1 +/- clip
        uqsub           v22.16b, v2.16b,  v24.16b       // q1 - clip
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b
        umax            v28.16b, v28.16b, v22.16b       // candidate q1, within q1 +/- clip
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // delta = (sum + 4) >> 3
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // p1 out: candidate or original
        bsl             v19.16b, v28.16b, v2.16b        // q1 out: candidate or original
        neg             v23.16b, v21.16b                // negated delta bound
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b       // delta clamped to +/- bound
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned bytes
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm
110
// Clipped luma filter over six 16-byte rows around x0.
//
// In:   x0 = address of row q0, w1 = row stride (sign-extended here),
//       w2 = alpha, w3 = beta, x4 = address of four clip bytes
// Reads rows x0 - 3*stride .. x0 + 2*stride, rewrites rows
// x0 - 2*stride .. x0 + stride.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v2.16b},  [x0], x1             // q1
        ld1             {v4.16b},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1            // back six rows: 3 rows above q0
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // x0 now addresses row p1
        st1             {v17.16b}, [x0], x1             // p1
        st1             {v16.16b}, [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v19.16b}, [x0]                 // q1
9:
        ret
endfunc
134
// Clipped luma filter over 16 rows of 8 bytes starting at x0 - 4.
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta, x4 = address of four clip bytes
// The 16 rows are loaded 8 bytes each and transposed so that every vector
// holds one pixel column; after filtering the four middle columns
// (p1, p0, q0, q1) are transposed back and stored, 4 bytes per row, at
// x0 - 2.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #4
        // rows 0-7 fill the low halves
        ld1             {v6.8b},  [x0], x1
        ld1             {v20.8b}, [x0], x1
        ld1             {v18.8b}, [x0], x1
        ld1             {v16.8b}, [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v4.8b},  [x0], x1
        ld1             {v26.8b}, [x0], x1
        // rows 8-15 fill the high halves
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows just read
        add             x0,  x0,  #2                    // 2 bytes in from the load address
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc
184
185
// Common entry for the intra loop filters.
//
// In:   w1 = row stride, w2 = alpha, w3 = beta
// Out:  x1 = sign-extended stride, v30 = alpha in every byte lane,
//       v31 = beta in every byte lane
// Clobbers: w4
//
// Executes "ret" when alpha and beta are both zero.
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        sxtw            x1,  w1
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm
195
// Intra luma edge filter core, 16 pixels per invocation.
//
// In:   v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2, v3 = q3
//       v30 = alpha, v31 = beta (byte lanes; both are overwritten here)
// Out:  v5, v6, v7 and v0, v1, v2 updated in place on the selected lanes
// Clobbers: x4, v16-v31
//
// Lane conditions, named as in the inline comments:
//   if_1: abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
//   if_2: abs(p0 - q0) < (alpha >> 2) + 2
//   if_3: abs(p2 - p0) < beta
//   if_4: abs(q2 - q0) < beta
// Jumps to local label 9, which the including function must provide, when
// if_1 is false on every lane.
.macro  h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // if_2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // if_1
        shrn            v20.8b,  v19.8h,  #4            // squeeze the mask into 64 bits
        mov             x4,  v20.d[0]
        cbz             x4,  9f                         // nothing to filter

        // v20:v21 = 2*p1 + p0 + q1,  v22:v23 = 2*q1 + q0 + p1  (16-bit lanes)
        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0, short filter
        rshrn           v25.8b,  v22.8h,  #2            // q0, short filter
        rshrn2          v24.16b, v21.8h,  #2            // p0, short filter
        rshrn2          v25.16b, v23.8h,  #2            // q0, short filter

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // if_3
        cmhi            v18.16b, v31.16b, v18.16b       // if_4

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        // p side, long filter
        uaddl           v26.8h,  v5.8b,   v7.8b         // p2 + p0
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0, long filter
        rshrn2          v20.16b, v21.8h,  #3            // p0, long filter
        uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1, long filter
        rshrn2          v21.16b, v27.8h,  #2            // p1, long filter
        uaddl           v28.8h,  v4.8b,   v5.8b         // p3 + p2
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2, long filter
        rshrn2          v19.16b, v29.8h,  #3            // p2, long filter

        // q side, long filter (mirror of the p side)
        uaddl           v26.8h,  v2.8b,   v0.8b         // q2 + q0
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0, long filter
        rshrn2          v22.16b, v23.8h,  #3            // q0, long filter
        uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1, long filter
        rshrn2          v23.16b, v27.8h,  #2            // q1, long filter
        uaddl           v28.8h,  v2.8b,   v3.8b         // q2 + q3
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2, long filter
        rshrn2          v26.16b, v29.8h,  #3            // q2, long filter

        // merge the results into the pixel registers under their masks
        bit             v7.16b,  v24.16b, v30.16b       // p0, short filter
        bit             v0.16b,  v25.16b, v31.16b       // q0, short filter
        bit             v7.16b,  v20.16b, v17.16b       // p0, long filter
        bit             v6.16b,  v21.16b, v17.16b       // p1, long filter
        bit             v5.16b,  v19.16b, v17.16b       // p2, long filter
        bit             v0.16b,  v22.16b, v18.16b       // q0, long filter
        bit             v1.16b,  v23.16b, v18.16b       // q1, long filter
        bit             v2.16b,  v26.16b, v18.16b       // q2, long filter
.endm
307
// Intra luma filter over eight 16-byte rows around x0.
//
// In:   x0 = address of row q0, w1 = row stride, w2 = alpha, w3 = beta
// Reads rows x0 - 4*stride .. x0 + 3*stride, rewrites rows
// x0 - 3*stride .. x0 + 2*stride.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b}, [x0], x1              // q0
        ld1             {v1.16b}, [x0], x1              // q1
        ld1             {v2.16b}, [x0], x1              // q2
        ld1             {v3.16b}, [x0], x1              // q3
        sub             x0,  x0,  x1, lsl #3            // back eight rows, to p3
        ld1             {v4.16b}, [x0], x1              // p3
        ld1             {v5.16b}, [x0], x1              // p2
        ld1             {v6.16b}, [x0], x1              // p1
        ld1             {v7.16b}, [x0]                  // p0 (x0 stays on this row)

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1            // from p0 back to p2
        st1             {v5.16b}, [x0], x1              // p2
        st1             {v6.16b}, [x0], x1              // p1
        st1             {v7.16b}, [x0], x1              // p0
        st1             {v0.16b}, [x0], x1              // q0
        st1             {v1.16b}, [x0], x1              // q1
        st1             {v2.16b}, [x0]                  // q2
9:
        ret
endfunc
333
// Intra luma filter over 16 rows of 8 bytes starting at x0 - 4.
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta
// The 8x16 block is loaded, transposed so each vector holds one pixel
// column (v4..v7 = p3..p0, v0..v3 = q0..q3), filtered, transposed back and
// written out in full (8 bytes per row).
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        // rows 0-7 fill the low halves
        ld1             {v4.8b},  [x0], x1
        ld1             {v5.8b},  [x0], x1
        ld1             {v6.8b},  [x0], x1
        ld1             {v7.8b},  [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v1.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v3.8b},  [x0], x1
        // rows 8-15 fill the high halves
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows just read
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x0], x1
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc
381
// Clipped chroma edge filter core, 8 pixels per invocation.
//
// In:   v18 = p1, v16 = p0, v0 = q0, v2 = q1 (low 8 bytes of each)
//       w2 = alpha, w3 = beta
//       v24.s[0] = four clip bytes (as left by h264_loop_filter_start)
// Out:  v16 = filtered p0, v0 = filtered q0
// Clobbers: x8, v4, v22-v26, v28, v30
//
// Jumps to local label 9, which the including function must provide, when
// no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha
        dup             v23.8b,  w3                     // beta
        uxtl            v24.8h,  v24.8b                 // with the sli below: each clip
        uabd            v26.8b,  v16.8b,  v0.8b         // abs(p0 - q0)
        uabd            v28.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v30.8b,  v2.8b,   v0.8b         // abs(q1 - q0)
        cmhi            v26.8b,  v22.8b,  v26.8b        // < alpha
        cmhi            v28.8b,  v23.8b,  v28.8b        // < beta
        cmhi            v30.8b,  v23.8b,  v30.8b        // < beta
        uxtl            v4.8h,   v0.8b
        and             v26.8b,  v26.8b,  v28.8b
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        and             v26.8b,  v26.8b,  v30.8b        // v26 = per-lane filter mask
        shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
        mov             x8,  v26.d[0]
        sli             v24.8h,  v24.8h,  #8            //   byte covers 2 adjacent lanes
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        cbz             x8,  9f                         // nothing to filter
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        rshrn           v4.8b,   v4.8h,   #3            // delta = (sum + 4) >> 3
        smin            v4.8b,   v4.8b,   v24.8b
        neg             v25.8b,  v24.8b
        smax            v4.8b,   v4.8b,   v25.8b        // delta clamped to +/- clip
        uxtl            v22.8h,  v0.8b
        and             v4.8b,   v4.8b,   v26.8b        // zero delta on unfiltered lanes
        uxtl            v28.8h,  v16.8b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned bytes
        sqxtun          v0.8b,   v22.8h
.endm
414
// Clipped chroma filter over four 8-byte rows around x0.
//
// In:   x0 = address of row q0, w1 = row stride, w2 = alpha, w3 = beta,
//       x4 = address of four clip bytes
// Reads rows x0 - 2*stride .. x0 + stride, rewrites rows x0 - stride and x0.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v0.8b},  [x0], x1              // q0
        ld1             {v2.8b},  [x0]                  // q1 (x0 stays on this row)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1            // from q1 back to p0
        st1             {v16.8b}, [x0], x1              // p0
        st1             {v0.8b},  [x0], x1              // q0
9:
        ret
endfunc
433
// Clipped chroma filter over 8 rows of 4 bytes starting at x0 - 2.
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta, x4 = address of four clip bytes
//
// The label h_loop_filter_chroma420 is a secondary entry point used by
// ff_h264_h_loop_filter_chroma422_neon. At that label x0 must already
// address the first byte to load, x1 must hold the 64-bit row step, and
// v24.s[0], w2 and w3 must be set up as h264_loop_filter_start leaves them.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        // rows 0-3 go to lane s[0], rows 4-7 to lane s[1]
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3            // rewind the 8 rows just read
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:
        ret
endfunc
467
// Clipped chroma filter over 16 rows, done as two passes through the 8-row
// body at h_loop_filter_chroma420 with the row step doubled: the first pass
// starts at x0 (rows 0, 2, ..., 14), the second at x0 + stride
// (rows 1, 3, ..., 15).
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta, x4 = address of four clip bytes
// x7 keeps the return address across the bl, x5 the start of the odd rows.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        sxtw            x1,  w1
        h264_loop_filter_start
        add             x5,  x0,  x1                    // first odd row
        sub             x0,  x0,  #2
        add             x1,  x1,  x1                    // step two rows at a time
        mov             x7,  x30
        bl              h_loop_filter_chroma420         // pass 1
        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6                    // pass 1 overwrote v24; reload clip bytes
        b               h_loop_filter_chroma420         // pass 2, its ret returns to our caller
endfunc
481
// Intra chroma edge filter core, 8 pixels per invocation.
//
// In:   v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes of each)
//       v30 = alpha, v31 = beta (byte lanes, left untouched)
// Out:  v16 = filtered p0, v17 = filtered q0 on the selected lanes
// Clobbers: x2, v4, v6, v20, v22, v24-v28
//
// Jumps to local label 9, which the including function must provide, when
// no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b        // v26 = per-lane filter mask
        mov             x2,  v26.d[0]

        ushll           v4.8h,   v18.8b,  #1            // 2 * p1
        ushll           v6.8h,   v19.8b,  #1            // 2 * q1
        cbz             x2,  9f                         // nothing to filter
        uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h         // 2*p1 + p0 + q1
        add             v22.8h,  v22.8h,  v6.8h         // 2*q1 + q0 + p1
        uqrshrn         v24.8b,  v20.8h,  #2            // rounded >> 2
        uqrshrn         v25.8b,  v22.8h,  #2
        bit             v16.8b,  v24.8b,  v26.8b        // new p0 where the mask is set
        bit             v17.8b,  v25.8b,  v26.8b        // new q0 where the mask is set
.endm
505
// Intra chroma filter over four 8-byte rows around x0.
//
// In:   x0 = address of row q0, w1 = row stride, w2 = alpha, w3 = beta
// Reads rows x0 - 2*stride .. x0 + stride, rewrites rows x0 - stride and x0.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v17.8b}, [x0], x1              // q0
        ld1             {v19.8b}, [x0]                  // q1 (x0 stays on this row)

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1            // from q1 back to p0
        st1             {v16.8b}, [x0], x1              // p0
        st1             {v17.8b}, [x0], x1              // q0

9:
        ret
endfunc
524
// Intra chroma filter over 4 rows.
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta
// Each row is read as 8 bytes from x0 - 2 (through x4). After filtering,
// the p0/q0 byte pair of each row is written at x0 - 1 with a two-register
// lane store.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2                    // load cursor
        sub             x0,  x0,  #1                    // store cursor
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc
547
// Intra chroma filter over 8 rows.
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta
// Rows are read from x0 - 2 (through x4). After filtering, the p0/q0 byte
// pair of each row is written at x0 - 1.
//
// The label h_loop_filter_chroma420_intra is a secondary entry point used
// by ff_h264_h_loop_filter_chroma422_intra_neon. At that label x4 (load
// cursor), x0 (store cursor), x1 (64-bit row step) and v30/v31 (alpha/beta)
// must already be set up.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2                    // load cursor
        sub             x0,  x0,  #1                    // store cursor
h_loop_filter_chroma420_intra:
        // rows 0-3: 8-byte loads; the upper 4 bytes are replaced just below
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        // rows 4-7 go to lane s[1]
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc
579
// Intra chroma filter over 16 rows, done as two passes through the 8-row
// body at h_loop_filter_chroma420_intra: rows 0-7 first, then rows 8-15.
//
// In:   x0 = address of the q0 pixel in the first row, w1 = row stride,
//       w2 = alpha, w3 = beta
// x7 keeps the return address across the bl. x5 remembers the start of
// row 8 for the second store cursor; the load cursor x4 simply carries on
// from where the first pass left it.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2                    // load cursor
        add             x5,  x0,  x1, lsl #3            // row 8
        sub             x0,  x0,  #1                    // store cursor
        mov             x7,  x30
        bl              h_loop_filter_chroma420_intra   // pass 1
        sub             x0,  x5,  #1
        mov             x30, x7
        b               h_loop_filter_chroma420_intra   // pass 2, its ret returns to our caller
endfunc
591
// Bi-weighted blend loop body for 16-byte-wide rows, two rows per iteration.
//
// Macro arguments: macs = multiply-accumulate mnemonic applied to the rows
// read from x1, macd = the one applied to the rows read from x0 (the "2"
// forms are derived by appending 2).
//
// In:   x0, x1 = the two input row pointers, x2 = row stride,
//       x7 = output row pointer, w3 = row count (stepped by 2),
//       w5, w6 = byte weights for the x0 and x1 rows,
//       v16 = initial accumulator value (8 x 16-bit),
//       v18 = per-lane shift applied with sshl
// Ends with ret.
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,   v20.8b
        \macd\()2       v6.8h,   v0.16b,  v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,   v22.8b
        \macs\()2       v6.8h,   v1.16b,  v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,   v28.8b
        \macd\()2       v26.8h,  v0.16b,  v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,   v30.8b
        \macs\()2       v26.8h,  v1.16b,  v30.16b
        sshl            v4.8h,   v4.8h,   v18.8h
        sshl            v6.8h,   v6.8h,   v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h,  v18.8h
        sshl            v26.8h,  v26.8h,  v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b                // re-seed accumulators for the next pair
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
.endm
627
// Bi-weighted blend loop body for 8-byte-wide rows, two rows per iteration.
//
// Macro arguments: macs = multiply-accumulate mnemonic applied to the rows
// read from x1, macd = the one applied to the rows read from x0.
//
// In:   x0, x1 = the two input row pointers, x2 = row stride,
//       x7 = output row pointer, w3 = row count (stepped by 2),
//       w5, w6 = byte weights for the x0 and x1 rows,
//       v16 = initial accumulator value (8 x 16-bit),
//       v18 = per-lane shift applied with sshl
// Ends with ret.
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,   v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,   v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,   v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,   v7.8b
        sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-seed accumulators for the next pair
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
.endm
653
// Bi-weighted blend loop body for 4-byte-wide rows. Two rows share one
// 8-byte vector; an iteration handles four rows, and the tail at label 2
// finishes when only two rows were left.
//
// Macro arguments: macs = multiply-accumulate mnemonic applied to the rows
// read from x1, macd = the one applied to the rows read from x0.
//
// In:   x0, x1 = the two input row pointers, x2 = row stride,
//       x7 = output row pointer, w3 = row count (stepped by 4),
//       w5, w6 = byte weights for the x0 and x1 rows,
//       v16 = initial accumulator value (8 x 16-bit),
//       v18 = per-lane shift applied with sshl
// Ends with ret on both paths.
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,   v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,   v5.8b
        b.lt            2f                              // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,   v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,   v7.8b
        sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-seed accumulators for the next group
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
2:      sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
691
// Emits ff_biweight_h264_pixels_<w>_neon for block width w.
//
// Register use on entry, as read by the code below:
//   x0 = first input block, also the destination (copied to x7)
//   x1 = second input block
//   w2 = row stride (sign-extended to x2)
//   w3 = row count
//   w4 = shift amount; v18 is set to its bitwise complement, i.e. -(w4 + 1)
//   w5 = weight applied to the x0 rows, w6 = weight applied to the x1 rows
//   w7 = offset; the accumulator seed in v16 is ((w7 + 1) | 1) << w4
//
// w8 = (w5 >> 31) ^ (w6 >> 30), with logical shifts, picks one of four
// variants. Negative weights are negated and the matching multiply
// instruction is switched from umlal to umlsl:
//   0 -> neither weight negated
//   1 -> w5 negated
//   2 -> both negated
//   otherwise -> w6 negated
// Each biweight_<w> expansion ends in ret, so the variants never fall
// through into one another.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b
        dup             v16.8h,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
724
// Weighted-prediction loop body for 16-byte-wide rows, two rows per
// iteration.
//
// Macro argument: add = mnemonic that combines v16 with the product
// (the callers in this file pass add, sub, shadd or shsub).
//
// In:   x0 = input row pointer, x5 = output row pointer, x1 = row stride,
//       w2 = row count (stepped by 2), w4 = byte weight,
//       v16 = offset term (8 x 16-bit), v18 = per-lane shift for srshl
// Ends with ret.
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,   v20.8b
        umull2          v6.8h,   v0.16b,  v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,   v28.8b
        umull2          v26.8h,  v0.16b,  v28.16b
        \add            v4.8h,   v16.8h,  v4.8h
        srshl           v4.8h,   v4.8h,   v18.8h
        \add            v6.8h,   v16.8h,  v6.8h
        srshl           v6.8h,   v6.8h,   v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h,  v24.8h
        srshl           v24.8h,  v24.8h,  v18.8h
        \add            v26.8h,  v16.8h,  v26.8h
        srshl           v26.8h,  v26.8h,  v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
.endm
751
// Weighted-prediction loop body for 8-byte-wide rows, two rows per
// iteration.
//
// Macro argument: add = mnemonic that combines v16 with the product
// (the callers in this file pass add, sub, shadd or shsub).
//
// In:   x0 = input row pointer, x5 = output row pointer, x1 = row stride,
//       w2 = row count (stepped by 2), w4 = byte weight,
//       v16 = offset term (8 x 16-bit), v18 = per-lane shift for srshl
// Ends with ret.
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,   v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,   v6.8b
        \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h,  v20.8h
        srshl           v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
.endm
770
// Weighted-prediction loop body for 4-byte-wide rows. Two rows share one
// 8-byte vector; an iteration handles four rows, and the tail at label 2
// finishes when only two rows were left.
//
// Macro argument: add = mnemonic that combines v16 with the product
// (the callers in this file pass add, sub, shadd or shsub).
//
// In:   x0 = input row pointer, x5 = output row pointer, x1 = row stride,
//       w2 = row count (stepped by 4), w4 = byte weight,
//       v16 = offset term (8 x 16-bit), v18 = per-lane shift for srshl
// Ends with ret on both paths.
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,   v4.8b
        b.lt            2f                              // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,   v6.8b
        \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h,  v20.8h
        srshl           v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
2:      \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
800
// Emits ff_weight_h264_pixels_<w>_neon for block width w.
//
// Register use on entry, as read by the code below:
//   x0 = block pointer, read and written in place (copied to x5 for stores)
//   w1 = row stride (sign-extended to x1)
//   w2 = row count
//   w3 = shift amount
//   w4 = weight; if negative it is negated and the subtracting form is used
//   w5 = offset; v16 holds w5 << w3
//
// Two shift strategies, chosen by the cmp w3, #1 at the top (none of the
// instructions before the b.le changes the flags):
//   w3 > 1:  halving add/sub (shadd/shsub), then srshl by 1 - w3
//   w3 <= 1: plain add/sub, then srshl by -w3
// Each weight_<w> expansion ends in ret, so the variants never fall through
// into one another. Each "b.lt 10f" resolves to the nearest following
// label 10.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3                    // 1 - w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3                         // -w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4