/* libavcodec/aarch64/h264dsp_neon.S (FFmpeg) */
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "libavutil/aarch64/asm.S"
24 #include "neon.S"
25
// Common entry sequence for the tc0-based (non-intra) loop filters.
// In:  w2 = alpha, w3 = beta, x4 = pointer to four packed int8 tc0 values.
// Returns from the enclosing function early (via "1: ret") when
// alpha == 0, beta == 0, or all four tc0 values are negative; otherwise
// falls through at "2:" with the tc0 bytes parked in v24.S[0].
// Clobbers: w6, flags.
.macro  h264_loop_filter_start
        cmp             w2,  #0                 // alpha == 0?
        ldr             w6,  [x4]               // load tc0[0..3] as one 32-bit word
        ccmp            w3,  #0, #0, ne         // if alpha != 0, also test beta == 0
        mov             v24.S[0], w6            // stash tc0 bytes for the filter body
        and             w6,  w6,  w6,  lsl #16  // AND the sign bits of the tc0 bytes...
        b.eq            1f                      // alpha == 0 || beta == 0 -> nothing to do
        ands            w6,  w6,  w6,  lsl #8   // ...result negative only if all tc0 < 0
        b.ge            2f                      // some tc0 >= 0 -> run the filter
1:
        ret
2:
.endm
39
// H.264 normal (tc0-clipped) deblock of one 16-pixel luma edge.
// In:  v20=p2 v18=p1 v16=p0 | v0=q0 v2=q1 v4=q2 (16 bytes each),
//      w2 = alpha, w3 = beta, v24.S[0] = tc0 bytes (set by
//      h264_loop_filter_start).
// Out: v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
// Branches to the caller's "9:" label when no lane passes the filter
// condition. Clobbers x7 and most SIMD temporaries.
.macro  h264_loop_filter_luma
        dup             v22.16B, w2                     // alpha
        uxtl            v24.8H,  v24.8B                 // widen tc0 bytes...
        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8            // ...and replicate each tc0
        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16           // into its 4 edge pixels
        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
        dup             v22.16B, w3                     // beta
        cmlt            v23.16B, v24.16B, #0            // tc0 < 0 -> lane disabled
        cmhi            v28.16B, v22.16B, v28.16B       // < beta
        cmhi            v30.16B, v22.16B, v30.16B       // < beta
        bic             v21.16B, v21.16B, v23.16B       // drop lanes with tc0 < 0
        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
        and             v21.16B, v21.16B, v30.16B      // < beta
        shrn            v30.8b,  v21.8h,  #4            // compress mask to 64 bits
        mov             x7, v30.d[0]
        cmhi            v17.16B, v22.16B, v17.16B       // < beta
        cmhi            v19.16B, v22.16B, v19.16B       // < beta
        cbz             x7,  9f                         // no lane to filter -> bail out
        and             v17.16B, v17.16B, v21.16B       // p-side strong condition
        and             v19.16B, v19.16B, v21.16B       // q-side strong condition
        and             v24.16B, v24.16B, v21.16B       // tc0 masked to active lanes
        urhadd          v28.16B, v16.16B,  v0.16B       // (p0 + q0 + 1) >> 1
        sub             v21.16B, v24.16B, v17.16B       // tc = tc0 (+1 per side cond)
        uqadd           v23.16B, v18.16B, v24.16B       // p1 + tc0 (upper clip)
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B
        uhadd           v28.16B,  v4.16B, v28.16B
        umin            v23.16B, v23.16B, v20.16B       // clip p1 candidate
        uqsub           v22.16B, v18.16B, v24.16B       // p1 - tc0 (lower clip)
        uqadd           v4.16B,   v2.16B, v24.16B
        umax            v23.16B, v23.16B, v22.16B       // new p1 (pre-select)
        uqsub           v22.16B,  v2.16B, v24.16B
        umin            v28.16B,  v4.16B, v28.16B
        uxtl            v4.8H,    v0.8B
        umax            v28.16B, v28.16B, v22.16B       // new q1 (pre-select)
        uxtl2           v20.8H,   v0.16B
        usubw           v4.8H,    v4.8H,  v16.8B        // q0 - p0 (16-bit)
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,    v4.8H,  #2            // 4*(q0 - p0)
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,    v4.8H,  v18.8B        // + p1
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,    v4.8H,   v2.8B        // - q1
        usubw2          v20.8H,  v20.8H,   v2.16B
        rshrn           v4.8B,    v4.8H,  #3            // delta = (... + 4) >> 3
        rshrn2          v4.16B,  v20.8H,  #3
        bsl             v17.16B, v23.16B, v18.16B       // select new/old p1
        bsl             v19.16B, v28.16B,  v2.16B       // select new/old q1
        neg             v23.16B, v21.16B                // -tc
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,   v4.16B, v21.16B       // clamp delta to [-tc, tc]
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,   v4.16B, v23.16B
        uxtl            v22.8H,   v0.8B
        uxtl2           v24.8H,   v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H                 // new p0, saturated to u8
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H                 // new q0, saturated to u8
        sqxtun2         v0.16B,  v24.8H
.endm
110
// Deblock a horizontal luma edge (vertical filter direction), 16 px wide.
// In: x0 = pix (first q0 row), w1 = stride, w2 = alpha, w3 = beta,
//     x4 = tc0 pointer (consumed by h264_loop_filter_start).
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start                  // early return if nothing to do
        sxtw            x1,  w1                 // sign-extend stride for addressing

        ld1             {v0.16B},  [x0], x1     // q0
        ld1             {v2.16B},  [x0], x1     // q1
        ld1             {v4.16B},  [x0], x1     // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1    // rewind 6 rows: now at p2
        ld1             {v20.16B},  [x0], x1    // p2
        ld1             {v18.16B},  [x0], x1    // p1
        ld1             {v16.16B},  [x0], x1    // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1    // back up 2 rows: now at p1
        st1             {v17.16B},  [x0], x1    // new p1
        st1             {v16.16B}, [x0], x1     // new p0
        st1             {v0.16B},  [x0], x1     // new q0
        st1             {v19.16B}, [x0]         // new q1
9:
        ret
endfunc
134
// Deblock a vertical luma edge (horizontal filter direction), 16 px tall.
// In: x0 = pix (edge column), w1 = stride, w2 = alpha, w3 = beta,
//     x4 = tc0 pointer. Loads 16 rows of 8 columns around the edge,
// transposes so rows become registers, filters, then transposes back the
// four modified lines (p1 p0 q0 q1) and stores them as 32-bit columns.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start                  // early return if nothing to do
        sxtw            x1,  w1

        sub             x0,  x0,  #4            // start 4 columns left of the edge
        ld1             {v6.8B},  [x0], x1      // rows 0-7 into low halves
        ld1             {v20.8B}, [x0], x1
        ld1             {v18.8B}, [x0], x1
        ld1             {v16.8B}, [x0], x1
        ld1             {v0.8B},  [x0], x1
        ld1             {v2.8B},  [x0], x1
        ld1             {v4.8B},  [x0], x1
        ld1             {v26.8B}, [x0], x1
        ld1             {v6.D}[1],  [x0], x1    // rows 8-15 into high halves
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4    // rewind the 16 loaded rows
        add             x0,  x0,  #2            // column of p1 (edge - 2)
        st1             {v17.S}[0],  [x0], x1   // store p1 p0 q0 q1 per row
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1],  [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2],  [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3],  [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1
9:
        ret
endfunc
184
185
// Common entry sequence for the intra (strong) loop filters.
// In:  w1 = stride, w2 = alpha, w3 = beta.
// Returns from the enclosing function when alpha and beta are both 0;
// otherwise sign-extends the stride and splats alpha -> v30, beta -> v31.
// Clobbers: w4.
.macro h264_loop_filter_start_intra
    orr             w4,  w2,  w3
    cbnz            w4,  1f                     // anything to filter?
    ret                                         // alpha == beta == 0 -> done
1:
    sxtw            x1,  w1
    dup             v30.16b, w2                // alpha
    dup             v31.16b, w3                // beta
.endm
195
// H.264 intra (strong) deblock of one 16-pixel luma edge.
// In:  v4=p3 v5=p2 v6=p1 v7=p0 | v0=q0 v1=q1 v2=q2 v3=q3 (16 bytes each),
//      v30 = alpha splat, v31 = beta splat (from h264_loop_filter_start_intra).
// Out (in place): v5/v6/v7 = new p2/p1/p0, v0/v1/v2 = new q0/q1/q2.
// Branches to the caller's "9:" label when no lane passes the filter
// condition. Clobbers x4 and most SIMD temporaries (incl. v30/v31).
.macro h264_loop_filter_luma_intra
    uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
    uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
    uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
    cmhi            v19.16b, v30.16b, v16.16b       // < alpha
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta

    movi            v29.16b, #2
    ushr            v30.16b, v30.16b, #2            // alpha >> 2
    add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
    cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

    and             v19.16b, v19.16b, v17.16b       // if_1: all three thresholds
    and             v19.16b, v19.16b, v18.16b
    shrn            v20.8b,  v19.8h,  #4            // compress mask to 64 bits
    mov             x4, v20.d[0]
    cbz             x4, 9f                          // no lane to filter -> bail out

    // 2*p1 + p0 + q1 and 2*q1 + q0 + p1 (16-bit intermediates)
    ushll           v20.8h,  v6.8b,   #1
    ushll           v22.8h,  v1.8b,   #1
    ushll2          v21.8h,  v6.16b,  #1
    ushll2          v23.8h,  v1.16b,  #1
    uaddw           v20.8h,  v20.8h,  v7.8b
    uaddw           v22.8h,  v22.8h,  v0.8b
    uaddw2          v21.8h,  v21.8h,  v7.16b
    uaddw2          v23.8h,  v23.8h,  v0.16b
    uaddw           v20.8h,  v20.8h,  v1.8b
    uaddw           v22.8h,  v22.8h,  v6.8b
    uaddw2          v21.8h,  v21.8h,  v1.16b
    uaddw2          v23.8h,  v23.8h,  v6.16b

    rshrn           v24.8b,  v20.8h,  #2 // p0'_1
    rshrn           v25.8b,  v22.8h,  #2 // q0'_1
    rshrn2          v24.16b, v21.8h,  #2 // p0'_1
    rshrn2          v25.16b, v23.8h,  #2 // q0'_1

    uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
    uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta

    and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
    and             v18.16b, v16.16b, v18.16b  // if_2 && if_4

    not             v30.16b, v17.16b
    not             v31.16b, v18.16b

    and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
    and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

    and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
    and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

    //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
    uaddl           v26.8h,  v5.8b,   v7.8b         // p2 + p0
    uaddl2          v27.8h,  v5.16b,  v7.16b
    uaddw           v26.8h,  v26.8h,  v0.8b         // + q0
    uaddw2          v27.8h,  v27.8h,  v0.16b
    add             v20.8h,  v20.8h,  v26.8h        // reuse 2*p1+p0+q1 sum
    add             v21.8h,  v21.8h,  v27.8h
    uaddw           v20.8h,  v20.8h,  v0.8b
    uaddw2          v21.8h,  v21.8h,  v0.16b
    rshrn           v20.8b,  v20.8h,  #3 // p0'_2
    rshrn2          v20.16b, v21.8h,  #3 // p0'_2
    uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
    uaddw2          v27.8h,  v27.8h,  v6.16b
    rshrn           v21.8b,  v26.8h,  #2 // p1'_2
    rshrn2          v21.16b, v27.8h,  #2 // p1'_2
    uaddl           v28.8h,  v4.8b,   v5.8b         // p3 + p2
    uaddl2          v29.8h,  v4.16b,  v5.16b
    shl             v28.8h,  v28.8h,  #1            // 2*(p3 + p2)
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v19.8b,  v28.8h,  #3 // p2'_2
    rshrn2          v19.16b, v29.8h,  #3 // p2'_2

    //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
    uaddl           v26.8h,  v2.8b,   v0.8b         // q2 + q0
    uaddl2          v27.8h,  v2.16b,  v0.16b
    uaddw           v26.8h,  v26.8h,  v7.8b         // + p0
    uaddw2          v27.8h,  v27.8h,  v7.16b
    add             v22.8h,  v22.8h,  v26.8h        // reuse 2*q1+q0+p1 sum
    add             v23.8h,  v23.8h,  v27.8h
    uaddw           v22.8h,  v22.8h,  v7.8b
    uaddw2          v23.8h,  v23.8h,  v7.16b
    rshrn           v22.8b,  v22.8h,  #3 // q0'_2
    rshrn2          v22.16b, v23.8h,  #3 // q0'_2
    uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
    uaddw2          v27.8h,  v27.8h,  v1.16b
    rshrn           v23.8b,  v26.8h,  #2 // q1'_2
    rshrn2          v23.16b, v27.8h,  #2 // q1'_2
    uaddl           v28.8h,  v2.8b,   v3.8b         // q2 + q3
    uaddl2          v29.8h,  v2.16b,  v3.16b
    shl             v28.8h,  v28.8h,  #1            // 2*(q2 + q3)
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v26.8b,  v28.8h,  #3 // q2'_2
    rshrn2          v26.16b, v29.8h,  #3 // q2'_2

    // Select filtered outputs per lane according to the condition masks.
    bit             v7.16b,  v24.16b, v30.16b  // p0'_1
    bit             v0.16b,  v25.16b, v31.16b  // q0'_1
    bit             v7.16b, v20.16b,  v17.16b  // p0'_2
    bit             v6.16b, v21.16b,  v17.16b  // p1'_2
    bit             v5.16b, v19.16b,  v17.16b  // p2'_2
    bit             v0.16b, v22.16b,  v18.16b  // q0'_2
    bit             v1.16b, v23.16b,  v18.16b  // q1'_2
    bit             v2.16b, v26.16b,  v18.16b  // q2'_2
.endm
307
// Intra (strong) deblock of a horizontal luma edge, 16 px wide.
// In: x0 = pix (first q0 row), w1 = stride, w2 = alpha, w3 = beta.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
    h264_loop_filter_start_intra                // early return if alpha|beta == 0

    ld1             {v0.16b},  [x0], x1 // q0
    ld1             {v1.16b},  [x0], x1 // q1
    ld1             {v2.16b},  [x0], x1 // q2
    ld1             {v3.16b},  [x0], x1 // q3
    sub             x0,  x0,  x1, lsl #3        // rewind 8 rows: now at p3
    ld1             {v4.16b},  [x0], x1 // p3
    ld1             {v5.16b},  [x0], x1 // p2
    ld1             {v6.16b},  [x0], x1 // p1
    ld1             {v7.16b},  [x0]     // p0

    h264_loop_filter_luma_intra

    sub             x0,  x0,  x1, lsl #1        // back up 2 rows: now at p2
    st1             {v5.16b}, [x0], x1  // p2
    st1             {v6.16b}, [x0], x1  // p1
    st1             {v7.16b}, [x0], x1  // p0
    st1             {v0.16b}, [x0], x1  // q0
    st1             {v1.16b}, [x0], x1  // q1
    st1             {v2.16b}, [x0]      // q2
9:
    ret
endfunc
333
// Intra (strong) deblock of a vertical luma edge, 16 px tall.
// In: x0 = pix (edge column), w1 = stride, w2 = alpha, w3 = beta.
// Loads 16 rows x 8 columns around the edge, transposes, filters,
// transposes back and stores all 8 columns of every row.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
    h264_loop_filter_start_intra                // early return if alpha|beta == 0

    sub             x0,  x0,  #4                // start 4 columns left of the edge
    ld1             {v4.8b},  [x0], x1          // rows 0-7 into low halves
    ld1             {v5.8b},  [x0], x1
    ld1             {v6.8b},  [x0], x1
    ld1             {v7.8b},  [x0], x1
    ld1             {v0.8b},  [x0], x1
    ld1             {v1.8b},  [x0], x1
    ld1             {v2.8b},  [x0], x1
    ld1             {v3.8b},  [x0], x1
    ld1             {v4.d}[1],  [x0], x1        // rows 8-15 into high halves
    ld1             {v5.d}[1],  [x0], x1
    ld1             {v6.d}[1],  [x0], x1
    ld1             {v7.d}[1],  [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v1.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    ld1             {v3.d}[1],  [x0], x1

    transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    h264_loop_filter_luma_intra

    transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    sub             x0,  x0,  x1, lsl #4        // rewind the 16 loaded rows
    st1             {v4.8b},  [x0], x1          // write back all 8 columns/row
    st1             {v5.8b},  [x0], x1
    st1             {v6.8b},  [x0], x1
    st1             {v7.8b},  [x0], x1
    st1             {v0.8b},  [x0], x1
    st1             {v1.8b},  [x0], x1
    st1             {v2.8b},  [x0], x1
    st1             {v3.8b},  [x0], x1
    st1             {v4.d}[1],  [x0], x1
    st1             {v5.d}[1],  [x0], x1
    st1             {v6.d}[1],  [x0], x1
    st1             {v7.d}[1],  [x0], x1
    st1             {v0.d}[1],  [x0], x1
    st1             {v1.d}[1],  [x0], x1
    st1             {v2.d}[1],  [x0], x1
    st1             {v3.d}[1],  [x0], x1
9:
    ret
endfunc
381
// H.264 tc0-clipped deblock of one 8-pixel chroma edge.
// In:  v18=p1 v16=p0 | v0=q0 v2=q1 (8 bytes each), w2 = alpha, w3 = beta,
//      v24.S[0] = tc0 bytes (from h264_loop_filter_start).
// Out: v16 = new p0, v0 = new q0.
// Branches to the caller's "9:" label when no lane passes; clobbers x2.
.macro  h264_loop_filter_chroma
        dup             v22.8B, w2              // alpha
        dup             v23.8B, w3              // beta
        uxtl            v24.8H, v24.8B          // widen tc0 bytes
        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
        cmhi            v28.8B, v23.8B, v28.8B  // < beta
        cmhi            v30.8B, v23.8B, v30.8B  // < beta
        uxtl            v4.8H,  v0.8B
        and             v26.8B, v26.8B, v28.8B
        usubw           v4.8H,  v4.8H,  v16.8B  // q0 - p0 (16-bit)
        and             v26.8B, v26.8B, v30.8B  // combined filter mask
        shl             v4.8H,  v4.8H,  #2      // 4*(q0 - p0)
        mov             x2,  v26.d[0]
        sli             v24.8H, v24.8H, #8      // replicate tc0 per pixel pair
        uaddw           v4.8H,  v4.8H,  v18.8B  // + p1
        cbz             x2,  9f                 // no lane to filter -> bail out
        usubw           v4.8H,  v4.8H,  v2.8B   // - q1
        rshrn           v4.8B,  v4.8H,  #3      // delta = (... + 4) >> 3
        smin            v4.8B,  v4.8B,  v24.8B  // clamp delta to [-tc, tc]
        neg             v25.8B, v24.8B
        smax            v4.8B,  v4.8B,  v25.8B
        uxtl            v22.8H, v0.8B
        and             v4.8B,  v4.8B,  v26.8B  // zero delta in masked-off lanes
        uxtl            v28.8H, v16.8B
        saddw           v28.8H, v28.8H, v4.8B   // p0 + delta
        ssubw           v22.8H, v22.8H, v4.8B   // q0 - delta
        sqxtun          v16.8B, v28.8H          // new p0, saturated to u8
        sqxtun          v0.8B,  v22.8H          // new q0, saturated to u8
.endm
414
// Deblock a horizontal chroma edge (vertical filter direction), 8 px wide.
// In: x0 = pix (first q0 row), w1 = stride, w2 = alpha, w3 = beta,
//     x4 = tc0 pointer.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                  // early return if nothing to do
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1    // back up 2 rows: now at p1
        ld1             {v18.8B}, [x0], x1      // p1
        ld1             {v16.8B}, [x0], x1      // p0
        ld1             {v0.8B},  [x0], x1      // q0
        ld1             {v2.8B},  [x0]          // q1

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // back to p0 row
        st1             {v16.8B}, [x0], x1      // new p0
        st1             {v0.8B},  [x0], x1      // new q0
9:
        ret
endfunc
433
// Deblock a vertical chroma edge (horizontal filter direction), 8 px tall.
// In: x0 = pix (edge column), w1 = stride, w2 = alpha, w3 = beta,
//     x4 = tc0 pointer. Loads 8 rows of 4 columns, transposes, filters,
// transposes back and stores the same 4 columns.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                  // early return if nothing to do
        sxtw            x1,  w1

        sub             x0,  x0,  #2            // start 2 columns left of the edge
        ld1             {v18.S}[0], [x0], x1    // rows 0-3
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1    // rows 4-7
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // rewind the 8 loaded rows
        st1             {v18.S}[0], [x0], x1    // write back p1 p0 q0 q1 per row
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1
9:
        ret
endfunc
466
467
// H.264 intra (strong) deblock of one 8-pixel chroma edge.
// In:  v18=p1 v16=p0 v17=q0 v19=q1 (8 bytes each),
//      v30 = alpha splat, v31 = beta splat.
// Out: v16 = new p0, v17 = new q0 (lanes selected by the filter mask).
// Branches to the caller's "9:" label when no lane passes; clobbers x2.
.macro h264_loop_filter_chroma_intra
    uabd            v26.8b, v16.8b, v17.8b  // abs(p0 - q0)
    uabd            v27.8b, v18.8b, v16.8b  // abs(p1 - p0)
    uabd            v28.8b, v19.8b, v17.8b  // abs(q1 - q0)
    cmhi            v26.8b, v30.8b, v26.8b  // < alpha
    cmhi            v27.8b, v31.8b, v27.8b  // < beta
    cmhi            v28.8b, v31.8b, v28.8b  // < beta
    and             v26.8b, v26.8b, v27.8b
    and             v26.8b, v26.8b, v28.8b  // combined filter mask
    mov             x2, v26.d[0]

    ushll           v4.8h,   v18.8b,  #1    // 2*p1
    ushll           v6.8h,   v19.8b,  #1    // 2*q1
    cbz             x2, 9f                  // no lane to filter -> bail out
    uaddl           v20.8h,  v16.8b,  v19.8b // p0 + q1
    uaddl           v22.8h,  v17.8b,  v18.8b // q0 + p1
    add             v20.8h,  v20.8h,  v4.8h  // 2*p1 + p0 + q1
    add             v22.8h,  v22.8h,  v6.8h  // 2*q1 + q0 + p1
    uqrshrn         v24.8b,  v20.8h,  #2     // new p0 candidate
    uqrshrn         v25.8b,  v22.8h,  #2     // new q0 candidate
    bit             v16.8b, v24.8b, v26.8b   // select per lane
    bit             v17.8b, v25.8b, v26.8b
.endm
491
// Intra deblock of a horizontal chroma edge, 8 px wide.
// In: x0 = pix (first q0 row), w1 = stride, w2 = alpha, w3 = beta.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
    h264_loop_filter_start_intra                // early return if alpha|beta == 0

    sub             x0,  x0,  x1, lsl #1        // back up 2 rows: now at p1
    ld1             {v18.8b}, [x0], x1          // p1
    ld1             {v16.8b}, [x0], x1          // p0
    ld1             {v17.8b}, [x0], x1          // q0
    ld1             {v19.8b}, [x0]              // q1

    h264_loop_filter_chroma_intra

    sub             x0,  x0,  x1, lsl #1        // back to p0 row
    st1             {v16.8b}, [x0], x1          // new p0
    st1             {v17.8b}, [x0], x1          // new q0

9:
    ret
endfunc
510
// Intra deblock of a vertical chroma edge, MBAFF variant: only 4 rows.
// In: x0 = pix (edge column), w1 = stride, w2 = alpha, w3 = beta.
// Loads 4 rows of 4 columns around the edge (x4 = edge - 2), transposes,
// filters, then stores the two modified columns (p0, q0) interleaved at
// edge - 1 for the 4 rows.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
    h264_loop_filter_start_intra                // early return if alpha|beta == 0

    sub             x4,  x0,  #2                // load pointer: 2 cols left of edge
    sub             x0,  x0,  #1                // store pointer: p0 column
    ld1             {v18.8b}, [x4], x1
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b}, [x4], x1
    ld1             {v19.8b}, [x4], x1

    transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2             {v16.b,v17.b}[0], [x0], x1  // p0,q0 pairs for rows 0-3
    st2             {v16.b,v17.b}[1], [x0], x1
    st2             {v16.b,v17.b}[2], [x0], x1
    st2             {v16.b,v17.b}[3], [x0], x1

9:
    ret
endfunc
533
// Intra deblock of a vertical chroma edge, 8 px tall.
// In: x0 = pix (edge column), w1 = stride, w2 = alpha, w3 = beta.
// Loads 8 rows of 4 columns around the edge, transposes, filters, then
// stores the two modified columns (p0, q0) interleaved at edge - 1.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
    h264_loop_filter_start_intra                // early return if alpha|beta == 0

    sub             x4,  x0,  #2                // load pointer: 2 cols left of edge
    sub             x0,  x0,  #1                // store pointer: p0 column
    ld1             {v18.8b}, [x4], x1          // rows 0-3
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b}, [x4], x1
    ld1             {v19.8b}, [x4], x1
    ld1             {v18.s}[1], [x4], x1        // rows 4-7
    ld1             {v16.s}[1], [x4], x1
    ld1             {v17.s}[1], [x4], x1
    ld1             {v19.s}[1], [x4]

    transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2             {v16.b,v17.b}[0], [x0], x1  // p0,q0 pairs for all 8 rows
    st2             {v16.b,v17.b}[1], [x0], x1
    st2             {v16.b,v17.b}[2], [x0], x1
    st2             {v16.b,v17.b}[3], [x0], x1
    st2             {v16.b,v17.b}[4], [x0], x1
    st2             {v16.b,v17.b}[5], [x0], x1
    st2             {v16.b,v17.b}[6], [x0], x1
    st2             {v16.b,v17.b}[7], [x0], x1

9:
    ret
endfunc
564
565
// Bi-weighted prediction loop, 16-pixel-wide block, two rows per iteration.
// In (from biweight_func): x0 = src/dst, x1 = second source, x2 = stride,
//   w3 = height, w5/w6 = weights (made non-negative by the caller),
//   v16 = packed offset splat, v18 = ~log2_denom (negative sshl amount,
//   i.e. arithmetic shift right), x7 = destination pointer.
// \macd applies v0 (w5's weight) and \macs applies v1 (w6's weight);
// each is umlal or umlsl depending on the original weight signs.
.macro  biweight_16     macs, macd
        dup             v0.16B,  w5
        dup             v1.16B,  w6
        mov             v4.16B,  v16.16B        // init accumulators with offset
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2            // two rows per pass
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B        // second row's accumulators
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H // >> (log2_denom + 1)
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H          // saturate to u8
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B        // re-init for next iteration
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b
        ret
.endm
601
// Bi-weighted prediction loop, 8-pixel-wide block, two rows per iteration.
// Same register contract as biweight_16 but on 8-byte rows.
.macro  biweight_8      macs, macd
        dup             v0.8B,  w5
        dup             v1.8B,  w6
        mov             v2.16B,  v16.16B        // init accumulators with offset
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2            // two rows per pass
        ld1             {v4.8B}, [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.8B}, [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H  // >> (log2_denom + 1)
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B        // re-init for next iteration
        st1             {v2.8B}, [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.8B}, [x7], x2
        b.ne            1b
        ret
.endm
627
// Bi-weighted prediction loop, 4-pixel-wide block, four rows per iteration
// (two rows packed per 8-byte vector); handles a 2-row tail at "2:".
// Same register contract as biweight_16.
.macro  biweight_4      macs, macd
        dup             v0.8B,  w5
        dup             v1.8B,  w6
        mov             v2.16B, v16.16B         // init accumulators with offset
        mov             v20.16B,v16.16B
1:      subs            w3,  w3,  #4            // four rows per pass
        ld1             {v4.S}[0], [x0], x2     // rows 0-1 packed into one vector
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        b.lt            2f                      // height was only 2 -> tail
        ld1             {v6.S}[0], [x0], x2     // rows 2-3
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H  // >> (log2_denom + 1)
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B        // re-init for next iteration
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,  v2.8H,  v18.8H  // 2-row tail: shift, narrow, store
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm
665
// Generate ff_biweight_h264_pixels_\w\()_neon.
// In: x0 = dst/src0, x1 = src1, w2 = stride, w3 = height, w4 = log2_denom,
//     w5/w6 = the two weights, w7 = offset.
// v16 = ((offset + 1) | 1) << log2_denom (rounding folded into the offset),
// v18 = ~log2_denom, so sshl by v18 is an arithmetic shift right by
// log2_denom + 1. w8 encodes the sign pattern of (w5, w6) and dispatches
// to one of four umlal/umlsl combinations, negating the weights so the
// multiply operands are non-negative.
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31           // sign(w5) in bit 0
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30  // mix in sign(w6) as bit 1
        orr             w7,  w7,  #1            // (offset + 1) | 1
        dup             v18.8H,   w4
        lsl             w7,  w7,  w4            // << log2_denom
        not             v18.16B,  v18.16B       // ~log2_denom = -(log2_denom + 1)
        dup             v16.8H,   w7
        mov             x7,  x0                 // separate store pointer
        cbz             w8,  10f                // both weights >= 0
        subs            w8,  w8,  #1
        b.eq            20f                     // w5 < 0 only
        subs            w8,  w8,  #1
        b.eq            30f                     // both weights < 0
        b               40f                     // w6 < 0 only
10:     biweight_\w     umlal, umlal
20:     neg             w5, w5
        biweight_\w     umlal, umlsl
30:     neg             w5, w5
        neg             w6, w6
        biweight_\w     umlsl, umlsl
40:     neg             w6, w6
        biweight_\w     umlsl, umlal
endfunc
.endm
694
        biweight_func   16              // ff_biweight_h264_pixels_16_neon
        biweight_func   8               // ff_biweight_h264_pixels_8_neon
        biweight_func   4               // ff_biweight_h264_pixels_4_neon
698
// Weighted prediction loop, 16-pixel-wide block, two rows per iteration.
// In (from weight_func): x0 = src, x1 = stride, w2 = height,
//   w4 = |weight|, v16 = offset term splat, v18 = srshl shift splat,
//   x5 = destination pointer. \add is shadd/shsub (halving, for
//   log2_denom > 1) or add/sub, chosen by the caller per weight sign.
.macro  weight_16       add
        dup             v0.16B,  w4
1:      subs            w2,  w2,  #2            // two rows per pass
        ld1             {v20.16B}, [x0], x1
        umull           v4.8H,   v0.8B,  v20.8B // weight * pixel
        umull2          v6.8H,   v0.16B, v20.16B
        ld1             {v28.16B}, [x0], x1
        umull           v24.8H,  v0.8B,  v28.8B
        umull2          v26.8H,  v0.16B, v28.16B
        \add            v4.8H,   v16.8H, v4.8H  // combine with offset term
        srshl           v4.8H,   v4.8H,  v18.8H // rounding shift right
        \add            v6.8H,   v16.8H, v6.8H
        srshl           v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H          // saturate to u8
        sqxtun2         v4.16B,  v6.8H
        \add            v24.8H,  v16.8H, v24.8H
        srshl           v24.8H,  v24.8H, v18.8H
        \add            v26.8H,  v16.8H, v26.8H
        srshl           v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        st1             {v4.16B},  [x5], x1
        st1             {v24.16B}, [x5], x1
        b.ne            1b
        ret
.endm
725
// Weighted prediction loop, 8-pixel-wide block, two rows per iteration.
// Same register contract as weight_16 but on 8-byte rows.
.macro  weight_8        add
        dup             v0.8B,  w4
1:      subs            w2,  w2,  #2            // two rows per pass
        ld1             {v4.8B}, [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B   // weight * pixel
        ld1             {v6.8B}, [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H,  v2.8H  // combine with offset term
        srshl           v2.8H,  v2.8H,  v18.8H  // rounding shift right
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        \add            v20.8H, v16.8H,  v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.8B}, [x5], x1
        st1             {v4.8B}, [x5], x1
        b.ne            1b
        ret
.endm
744
// Weighted prediction loop, 4-pixel-wide block, four rows per iteration
// (two rows packed per 8-byte vector); handles a 2-row tail at "2:".
// Same register contract as weight_16.
// Fix: register lane specifier normalized to the file's uppercase
// convention (v20.8h -> v20.8H); assembles identically.
.macro  weight_4        add
        dup             v0.8B,  w4
1:      subs            w2,  w2,  #4            // four rows per pass
        ld1             {v4.S}[0], [x0], x1     // rows 0-1 packed into one vector
        ld1             {v4.S}[1], [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B   // weight * pixel
        b.lt            2f                      // height was only 2 -> tail
        ld1             {v6.S}[0], [x0], x1     // rows 2-3
        ld1             {v6.S}[1], [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H,  v2.8H  // combine with offset term
        srshl           v2.8H,  v2.8H,  v18.8H  // rounding shift right
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        \add            v20.8H, v16.8H,  v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        st1             {v4.S}[0], [x5], x1
        st1             {v4.S}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8H,  v16.8H,  v2.8H  // 2-row tail
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        ret
.endm
774
// Generate ff_weight_h264_pixels_\w\()_neon.
// In: x0 = block, w1 = stride, w2 = height, w3 = log2_denom, w4 = weight,
//     w5 = offset. v16 = offset << log2_denom.
// Two shift regimes: for log2_denom > 1 the per-pixel sum is formed with
// halving shadd/shsub and srshl by (1 - log2_denom); for log2_denom <= 1
// a plain add/sub and srshl by -log2_denom are used (presumably the
// halving form avoids 16-bit overflow — behavior taken from the code).
// Negative weight selects the subtracting variant with w4 negated.
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3            // offset << log2_denom
        dup             v16.8H,  w5
        mov             x5,  x0                 // separate store pointer
        b.le            20f                     // log2_denom <= 1 path
        sub             w6,  w6,  w3            // 1 - log2_denom (negative shift)
        dup             v18.8H,  w6
        cmp             w4, #0
        b.lt            10f                     // weight < 0 -> subtracting variant
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3                 // -log2_denom
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f                     // weight < 0 -> subtracting variant
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm
800
        weight_func     16              // ff_weight_h264_pixels_16_neon
        weight_func     8               // ff_weight_h264_pixels_8_neon
        weight_func     4               // ff_weight_h264_pixels_4_neon