]> git.sesse.net Git - x264/blob - common/aarch64/deblock-a.S
dafd8291467104e40f62d52221671fdf68b2f504
[x264] / common / aarch64 / deblock-a.S
1 /*****************************************************************************
2  * deblock-a.S: aarch64 deblocking
3  *****************************************************************************
4  * Copyright (C) 2009-2015 x264 project
5  *
6  * Authors: Mans Rullgard <mans@mansr.com>
7  *          Janne Grunau <janne-x264@jannau.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at licensing@x264.com.
25  *****************************************************************************/
26
27 #include "asm.S"
28
29 .macro h264_loop_filter_start
30     cmp             w2,  #0
31     ldr             w6,  [x4]
32     ccmp            w3,  #0, #0, ne
33     mov             v24.s[0], w6
34     and             w8,  w6,  w6,  lsl #16
35     b.eq            1f
36     ands            w8,  w8,  w8,  lsl #8
37     b.ge            2f
38 1:
39     ret
40 2:
41 .endm
42
// h264_loop_filter_luma -- filter 16 byte lanes across one luma edge
// (non-intra case).
// Inputs:   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2 (one byte
//           per lane), w2 = alpha, w3 = beta, v24.s[0] = the four packed
//           bytes stored by h264_loop_filter_start (presumably tc0[0..3];
//           called tc0 in the comments below).
// Outputs:  v17 = p1', v16 = p0', v0 = q0', v19 = q1'.
// Clobbers: v4, v20-v24, v28, v30.
// Several independent computations are interleaved and scratch registers are
// reused (v4, v20-v24, v28), so the instruction order must be kept when
// editing.
43 .macro h264_loop_filter_luma
44     dup             v22.16b, w2                     // alpha
45     uxtl            v24.8h,  v24.8b                 // uxtl/sli sequence: replicate each tc0 byte into 4 lanes
46     uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
47     uxtl            v24.4s,  v24.4h
48     uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
49     sli             v24.8h,  v24.8h,  #8
50     uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
51     sli             v24.4s,  v24.4s,  #16           // v24 = tc0[0] x4, tc0[1] x4, tc0[2] x4, tc0[3] x4
52     cmhi            v21.16b, v22.16b, v21.16b       // < alpha
53     dup             v22.16b, w3                     // beta
54     cmlt            v23.16b, v24.16b, #0            // lanes whose tc0 byte is negative
55     cmhi            v28.16b, v22.16b, v28.16b       // < beta
56     cmhi            v30.16b, v22.16b, v30.16b       // < beta
57     bic             v21.16b, v21.16b, v23.16b       // never filter lanes with negative tc0
58     uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
59     and             v21.16b, v21.16b, v28.16b
60     uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
61     cmhi            v17.16b, v22.16b, v17.16b       // < beta
62     and             v21.16b, v21.16b, v30.16b       // v21 = final per-lane filter mask
63     cmhi            v19.16b, v22.16b, v19.16b       // < beta
64     and             v17.16b, v17.16b, v21.16b       // mask: update p1
65     and             v19.16b, v19.16b, v21.16b       // mask: update q1
66     and             v24.16b, v24.16b, v21.16b       // tc0, zeroed on lanes that are not filtered
67     urhadd          v28.16b, v16.16b,  v0.16b       // avg = (p0 + q0 + 1) >> 1
68     sub             v21.16b, v24.16b, v17.16b       // tc = tc0 + 1 where the p1 mask (-1) is set
69     uqadd           v23.16b, v18.16b, v24.16b       // p1 + tc0, saturating
70     uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
71     sub             v21.16b, v21.16b, v19.16b       // tc += 1 where the q1 mask is set
72     uhadd           v28.16b,  v4.16b, v28.16b       // (q2 + avg) >> 1; q2 in v4 is dead after this
73     umin            v23.16b, v23.16b, v20.16b
74     uqsub           v22.16b, v18.16b, v24.16b       // p1 - tc0, saturating
75     uqadd           v4.16b,   v2.16b, v24.16b       // q1 + tc0, saturating
76     umax            v23.16b, v23.16b, v22.16b       // candidate p1' clamped to [p1 - tc0, p1 + tc0]
77     uqsub           v22.16b,  v2.16b, v24.16b       // q1 - tc0, saturating
78     umin            v28.16b,  v4.16b, v28.16b
79     uxtl            v4.8h,    v0.8b                 // delta, low 8 lanes: start from q0 (16 bit)
80     umax            v28.16b, v28.16b, v22.16b       // candidate q1' clamped to [q1 - tc0, q1 + tc0]
81     uxtl2           v20.8h,   v0.16b                // delta, high 8 lanes
82     usubw           v4.8h,    v4.8h,  v16.8b        // q0 - p0
83     usubw2          v20.8h,  v20.8h,  v16.16b
84     shl             v4.8h,    v4.8h,  #2            // (q0 - p0) * 4
85     shl             v20.8h,  v20.8h,  #2
86     uaddw           v4.8h,    v4.8h,  v18.8b        // + p1
87     uaddw2          v20.8h,  v20.8h,  v18.16b
88     usubw           v4.8h,    v4.8h,   v2.8b        // - q1
89     usubw2          v20.8h,  v20.8h,   v2.16b
90     rshrn           v4.8b,    v4.8h,  #3            // delta = (... + 4) >> 3, narrowed to bytes
91     rshrn2          v4.16b,  v20.8h,  #3
92     bsl             v17.16b, v23.16b, v18.16b       // v17 = p1' where its mask is set, else p1
93     bsl             v19.16b, v28.16b,  v2.16b       // v19 = q1' where its mask is set, else q1
94     neg             v23.16b, v21.16b                // -tc
95     uxtl            v28.8h,  v16.8b                 // widen p0 (low lanes)
96     smin            v4.16b,   v4.16b, v21.16b       // delta = min(delta, tc)
97     uxtl2           v21.8h,  v16.16b                // widen p0 (high lanes); tc in v21 is dead now
98     smax            v4.16b,   v4.16b, v23.16b       // delta = max(delta, -tc)
99     uxtl            v22.8h,   v0.8b                 // widen q0
100     uxtl2           v24.8h,   v0.16b
101     saddw           v28.8h,  v28.8h,  v4.8b        // p0 + delta
102     saddw2          v21.8h,  v21.8h,  v4.16b
103     ssubw           v22.8h,  v22.8h,  v4.8b        // q0 - delta
104     ssubw2          v24.8h,  v24.8h,  v4.16b
105     sqxtun          v16.8b,  v28.8h                // saturate to unsigned bytes: p0'
106     sqxtun2         v16.16b, v21.8h
107     sqxtun          v0.8b,   v22.8h                // q0'
108     sqxtun2         v0.16b,  v24.8h
109 .endm
110
// x264_deblock_v_luma_neon -- non-intra luma filter over 16 bytes per row,
// with the p/q samples taken from consecutive rows.
// Register use as read from the code: x0 = address of the q0 row, x1 = byte
// distance between rows, w2 = alpha, w3 = beta, x4 = address of the four
// packed bytes read by h264_loop_filter_start.
// Reads rows x0 - 3*x1 .. x0 + 2*x1 and rewrites rows x0 - 2*x1 .. x0 + x1.
111 function x264_deblock_v_luma_neon, export=1
112     h264_loop_filter_start                          // may return early
113
114     ld1             {v0.16b},  [x0], x1             // q0
115     ld1             {v2.16b},  [x0], x1             // q1
116     ld1             {v4.16b},  [x0], x1             // q2
117     sub             x0,  x0,  x1, lsl #2            // the two subs step back 6 rows, to p2
118     sub             x0,  x0,  x1, lsl #1
119     ld1             {v20.16b},  [x0], x1            // p2
120     ld1             {v18.16b},  [x0], x1            // p1
121     ld1             {v16.16b},  [x0], x1            // p0; x0 is back at the q0 row
122
123     h264_loop_filter_luma
124
125     sub             x0,  x0,  x1, lsl #1            // back to the p1 row
126     st1             {v17.16b}, [x0], x1             // p1'
127     st1             {v16.16b}, [x0], x1             // p0'
128     st1             {v0.16b},  [x0], x1             // q0'
129     st1             {v19.16b}, [x0]                 // q1'
130
131     ret
132 endfunc
133
// x264_deblock_h_luma_neon -- non-intra luma filter where the p/q samples of
// one lane lie next to each other within a row; 16 rows are processed.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2/w3/x4 as consumed by
// h264_loop_filter_start.
// Loads 8 bytes (x0 - 4 .. x0 + 3) from each of 16 rows, transposes them so
// that every vector holds one column, filters, transposes the four modified
// columns back and stores 4 bytes (x0 - 2 .. x0 + 1) per row.
// transpose_8x16.b / transpose_4x16.b are macros defined outside this file
// section (presumably asm.S); their exact lane layout is assumed from the
// way operands are passed here -- confirm there.
134 function x264_deblock_h_luma_neon, export=1
135     h264_loop_filter_start                          // may return early
136
137     sub             x0,  x0,  #4                    // start 4 bytes before the edge
138     ld1             {v6.8b},  [x0], x1              // rows 0-7 fill the low halves
139     ld1             {v20.8b}, [x0], x1
140     ld1             {v18.8b}, [x0], x1
141     ld1             {v16.8b}, [x0], x1
142     ld1             {v0.8b},  [x0], x1
143     ld1             {v2.8b},  [x0], x1
144     ld1             {v4.8b},  [x0], x1
145     ld1             {v26.8b}, [x0], x1
146     ld1             {v6.d}[1],  [x0], x1            // rows 8-15 fill the high halves
147     ld1             {v20.d}[1], [x0], x1
148     ld1             {v18.d}[1], [x0], x1
149     ld1             {v16.d}[1], [x0], x1
150     ld1             {v0.d}[1],  [x0], x1
151     ld1             {v2.d}[1],  [x0], x1
152     ld1             {v4.d}[1],  [x0], x1
153     ld1             {v26.d}[1], [x0], x1            // x0 now points 16 rows further down
154
155     transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23  // presumably -> p3,p2,p1,p0,q0,q1,q2,q3; v21/v23 scratch
156
157     h264_loop_filter_luma
158
159     transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27  // p1', p0', q0', q1' back to row order
160
161     sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows
162     add             x0,  x0,  #2                    // first modified byte (p1) of row 0
163     st1             {v17.s}[0],  [x0], x1           // 4 bytes per row; rows cycle through v17, v16, v0, v19
164     st1             {v16.s}[0], [x0], x1
165     st1             {v0.s}[0],  [x0], x1
166     st1             {v19.s}[0], [x0], x1
167     st1             {v17.s}[1],  [x0], x1
168     st1             {v16.s}[1], [x0], x1
169     st1             {v0.s}[1],  [x0], x1
170     st1             {v19.s}[1], [x0], x1
171     st1             {v17.s}[2],  [x0], x1
172     st1             {v16.s}[2], [x0], x1
173     st1             {v0.s}[2],  [x0], x1
174     st1             {v19.s}[2], [x0], x1
175     st1             {v17.s}[3],  [x0], x1
176     st1             {v16.s}[3], [x0], x1
177     st1             {v0.s}[3],  [x0], x1
178     st1             {v19.s}[3], [x0], x1
179
180     ret
181 endfunc
182
// h264_loop_filter_start_intra -- shared entry check for the intra filters.
// Executes `ret` (returning from the expanding function) when w2 and w3 are
// both zero; otherwise broadcasts w2 into every byte of v30 (alpha) and w3
// into every byte of v31 (beta).
// Clobbers: w4, condition flags, v30, v31.
183 .macro h264_loop_filter_start_intra
184     orr             w4,  w2,  w3                    // zero only if both inputs are zero
185     cmp             w4,  #0
186     b.ne            1f
187     ret
188 1:
189     dup             v30.16b, w2                // alpha
190     dup             v31.16b, w3                // beta
191 .endm
192
// h264_loop_filter_luma_intra -- filter 16 byte lanes across one luma edge
// (intra case).
// Inputs:   v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2,
//           v3 = q3 (one byte per lane); v30 = alpha and v31 = beta in every
//           byte lane, as set up by h264_loop_filter_start_intra.
// Outputs:  v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2), updated in
//           place on the lanes selected by the masks below.
// Clobbers: x4, v16-v31.
// Branches to local label 9, which the expanding function has to provide,
// when no lane passes the alpha/beta tests.
// Condition names used in the comments:
//   if_1 = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta
//   if_2 = abs(p0-q0) < (alpha >> 2) + 2
//   if_3 = abs(p2-p0) < beta          if_4 = abs(q2-q0) < beta
193 .macro h264_loop_filter_luma_intra
194     uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
195     uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
196     uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
197     cmhi            v19.16b, v30.16b, v16.16b       // < alpha
198     cmhi            v17.16b, v31.16b, v17.16b       // < beta
199     cmhi            v18.16b, v31.16b, v18.16b       // < beta
200
201     movi            v29.16b, #2
202     ushr            v30.16b, v30.16b, #2            // alpha >> 2
203     add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
204     cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2   (if_2)
205
206     and             v19.16b, v19.16b, v17.16b
207     and             v19.16b, v19.16b, v18.16b       // v19 = if_1
208     shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
209     mov             x4, v20.d[0]
210     cbz             x4, 9f                          // nothing to filter on any lane
211
212     ushll           v20.8h,  v6.8b,   #1            // 2*p1
213     ushll           v22.8h,  v1.8b,   #1            // 2*q1
214     ushll2          v21.8h,  v6.16b,  #1
215     ushll2          v23.8h,  v1.16b,  #1
216     uaddw           v20.8h,  v20.8h,  v7.8b         // + p0
217     uaddw           v22.8h,  v22.8h,  v0.8b         // + q0
218     uaddw2          v21.8h,  v21.8h,  v7.16b
219     uaddw2          v23.8h,  v23.8h,  v0.16b
220     uaddw           v20.8h,  v20.8h,  v1.8b         // + q1 -> 2*p1 + p0 + q1
221     uaddw           v22.8h,  v22.8h,  v6.8b         // + p1 -> 2*q1 + q0 + p1
222     uaddw2          v21.8h,  v21.8h,  v1.16b
223     uaddw2          v23.8h,  v23.8h,  v6.16b
224
225     rshrn           v24.8b,  v20.8h,  #2 // p0'_1
226     rshrn           v25.8b,  v22.8h,  #2 // q0'_1
227     rshrn2          v24.16b, v21.8h,  #2 // p0'_1
228     rshrn2          v25.16b, v23.8h,  #2 // q0'_1
229
230     uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
231     uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
232     cmhi            v17.16b, v31.16b, v17.16b       // < beta
233     cmhi            v18.16b, v31.16b, v18.16b       // < beta
234
235     and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
236     and             v18.16b, v16.16b, v18.16b  // if_2 && if_4
237
238     not             v30.16b, v17.16b                // alpha/beta vectors are no longer needed
239     not             v31.16b, v18.16b
240
241     and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
242     and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)
243
244     and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
245     and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4
246
247     //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
248     uaddl           v26.8h,  v5.8b,   v7.8b         // p2 + p0
249     uaddl2          v27.8h,  v5.16b,  v7.16b
250     uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
251     uaddw2          v27.8h,  v27.8h,  v0.16b
252     add             v20.8h,  v20.8h,  v26.8h        // 2*p1 + 2*p0 + q1 + p2 + q0
253     add             v21.8h,  v21.8h,  v27.8h
254     uaddw           v20.8h,  v20.8h,  v0.8b         // + q0
255     uaddw2          v21.8h,  v21.8h,  v0.16b
256     rshrn           v20.8b,  v20.8h,  #3 // p0'_2
257     rshrn2          v20.16b, v21.8h,  #3 // p0'_2
258     uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
259     uaddw2          v27.8h,  v27.8h,  v6.16b
260     rshrn           v21.8b,  v26.8h,  #2 // p1'_2
261     rshrn2          v21.16b, v27.8h,  #2 // p1'_2
262     uaddl           v28.8h,  v4.8b,   v5.8b         // p3 + p2
263     uaddl2          v29.8h,  v4.16b,  v5.16b
264     shl             v28.8h,  v28.8h,  #1            // 2*(p3 + p2)
265     shl             v29.8h,  v29.8h,  #1
266     add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
267     add             v29.8h,  v29.8h,  v27.8h
268     rshrn           v19.8b,  v28.8h,  #3 // p2'_2   (if_1 in v19 is no longer needed)
269     rshrn2          v19.16b, v29.8h,  #3 // p2'_2
270
271     //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
272     uaddl           v26.8h,  v2.8b,   v0.8b         // q2 + q0
273     uaddl2          v27.8h,  v2.16b,  v0.16b
274     uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
275     uaddw2          v27.8h,  v27.8h,  v7.16b
276     add             v22.8h,  v22.8h,  v26.8h        // 2*q1 + 2*q0 + p1 + q2 + p0
277     add             v23.8h,  v23.8h,  v27.8h
278     uaddw           v22.8h,  v22.8h,  v7.8b         // + p0
279     uaddw2          v23.8h,  v23.8h,  v7.16b
280     rshrn           v22.8b,  v22.8h,  #3 // q0'_2
281     rshrn2          v22.16b, v23.8h,  #3 // q0'_2
282     uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
283     uaddw2          v27.8h,  v27.8h,  v1.16b
284     rshrn           v23.8b,  v26.8h,  #2 // q1'_2
285     rshrn2          v23.16b, v27.8h,  #2 // q1'_2
286     uaddl           v28.8h,  v2.8b,   v3.8b         // q2 + q3
287     uaddl2          v29.8h,  v2.16b,  v3.16b
288     shl             v28.8h,  v28.8h,  #1            // 2*(q2 + q3)
289     shl             v29.8h,  v29.8h,  #1
290     add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
291     add             v29.8h,  v29.8h,  v27.8h
292     rshrn           v26.8b,  v28.8h,  #3 // q2'_2
293     rshrn2          v26.16b, v29.8h,  #3 // q2'_2
294
    // All candidates were computed from the unmodified inputs; only now are
    // the inputs overwritten, lane by lane, under the masks.
295     bit             v7.16b,  v24.16b, v30.16b  // p0'_1
296     bit             v0.16b,  v25.16b, v31.16b  // q0'_1
297     bit             v7.16b, v20.16b,  v17.16b  // p0'_2
298     bit             v6.16b, v21.16b,  v17.16b  // p1'_2
299     bit             v5.16b, v19.16b,  v17.16b  // p2'_2
300     bit             v0.16b, v22.16b,  v18.16b  // q0'_2
301     bit             v1.16b, v23.16b,  v18.16b  // q1'_2
302     bit             v2.16b, v26.16b,  v18.16b  // q2'_2
303 .endm
304
// x264_deblock_v_luma_intra_neon -- intra luma filter over 16 bytes per row,
// with the p/q samples taken from consecutive rows.
// Register use as read from the code: x0 = address of the q0 row, x1 = byte
// distance between rows, w2 = alpha, w3 = beta.
// Reads rows x0 - 4*x1 .. x0 + 3*x1 and rewrites rows x0 - 3*x1 .. x0 + 2*x1.
// Label 9 is the exit taken by h264_loop_filter_luma_intra when no lane
// needs filtering; the stores are skipped in that case.
305 function x264_deblock_v_luma_intra_neon, export=1
306     h264_loop_filter_start_intra                    // may return early
307
308     ld1             {v0.16b},  [x0], x1 // q0
309     ld1             {v1.16b},  [x0], x1 // q1
310     ld1             {v2.16b},  [x0], x1 // q2
311     ld1             {v3.16b},  [x0], x1 // q3
312     sub             x0,  x0,  x1, lsl #3            // 8 rows back: the p3 row
313     ld1             {v4.16b},  [x0], x1 // p3
314     ld1             {v5.16b},  [x0], x1 // p2
315     ld1             {v6.16b},  [x0], x1 // p1
316     ld1             {v7.16b},  [x0]     // p0
317
318     h264_loop_filter_luma_intra
319
320     sub             x0,  x0,  x1, lsl #1            // from the p0 row back to the p2 row
321     st1             {v5.16b}, [x0], x1  // p2
322     st1             {v6.16b}, [x0], x1  // p1
323     st1             {v7.16b}, [x0], x1  // p0
324     st1             {v0.16b}, [x0], x1  // q0
325     st1             {v1.16b}, [x0], x1  // q1
326     st1             {v2.16b}, [x0]      // q2
327 9:
328     ret
329 endfunc
330
// x264_deblock_h_luma_intra_neon -- intra luma filter where the p/q samples
// of one lane lie next to each other within a row; 16 rows are processed.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2 = alpha, w3 = beta.
// Loads 8 bytes (x0 - 4 .. x0 + 3) from each of 16 rows, transposes, filters,
// transposes back and stores the same 8 bytes per row.
// transpose_8x16.b is a macro defined outside this file section (presumably
// asm.S); the column order p3..q3 after it is assumed from the operand order
// -- confirm there.
// Label 9 is the exit taken by h264_loop_filter_luma_intra when no lane
// needs filtering; nothing is stored in that case.
331 function x264_deblock_h_luma_intra_neon, export=1
332     h264_loop_filter_start_intra                    // may return early
333
334     sub             x0,  x0,  #4                    // start 4 bytes before the edge
335     ld1             {v4.8b},  [x0], x1              // rows 0-7 fill the low halves
336     ld1             {v5.8b},  [x0], x1
337     ld1             {v6.8b},  [x0], x1
338     ld1             {v7.8b},  [x0], x1
339     ld1             {v0.8b},  [x0], x1
340     ld1             {v1.8b},  [x0], x1
341     ld1             {v2.8b},  [x0], x1
342     ld1             {v3.8b},  [x0], x1
343     ld1             {v4.d}[1],  [x0], x1            // rows 8-15 fill the high halves
344     ld1             {v5.d}[1],  [x0], x1
345     ld1             {v6.d}[1],  [x0], x1
346     ld1             {v7.d}[1],  [x0], x1
347     ld1             {v0.d}[1],  [x0], x1
348     ld1             {v1.d}[1],  [x0], x1
349     ld1             {v2.d}[1],  [x0], x1
350     ld1             {v3.d}[1],  [x0], x1
351
352     transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23  // rows -> columns p3..q3; v21/v23 scratch
353
354     h264_loop_filter_luma_intra
355
356     transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23  // columns -> rows again
357
358     sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows
359     st1             {v4.8b},  [x0], x1
360     st1             {v5.8b},  [x0], x1
361     st1             {v6.8b},  [x0], x1
362     st1             {v7.8b},  [x0], x1
363     st1             {v0.8b},  [x0], x1
364     st1             {v1.8b},  [x0], x1
365     st1             {v2.8b},  [x0], x1
366     st1             {v3.8b},  [x0], x1
367     st1             {v4.d}[1],  [x0], x1
368     st1             {v5.d}[1],  [x0], x1
369     st1             {v6.d}[1],  [x0], x1
370     st1             {v7.d}[1],  [x0], x1
371     st1             {v0.d}[1],  [x0], x1
372     st1             {v1.d}[1],  [x0], x1
373     st1             {v2.d}[1],  [x0], x1
374     st1             {v3.d}[1],  [x0], x1
375 9:
376     ret
377 endfunc
378
// h264_loop_filter_chroma -- filter 16 byte lanes across one chroma edge
// (non-intra case); only p0 and q0 are modified.
// Inputs:   v18 = p1, v16 = p0, v0 = q0, v2 = q1 (one byte per lane),
//           w2 = alpha, w3 = beta, v24.s[0] = the four packed bytes stored by
//           h264_loop_filter_start (used as the clamp bound tc, each byte
//           replicated into four lanes).
// Outputs:  v16 = p0', v0 = q0'.
// Clobbers: v4, v5, v22-v26, v28-v30.
// No per-lane sign test is applied to the bytes in v24 in this macro.
379 .macro h264_loop_filter_chroma
380     dup             v22.16b, w2              // alpha
381     uxtl            v24.8h,  v24.8b          // uxtl/sli sequence: replicate each tc byte 4x
382     uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
383     uxtl            v4.8h,   v0.8b             // delta: start from q0 (16 bit)
384     uxtl2           v5.8h,   v0.16b
385     uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
386     usubw           v4.8h,   v4.8h,   v16.8b   // q0 - p0
387     usubw2          v5.8h,   v5.8h,   v16.16b
388     sli             v24.8h,  v24.8h,  #8
389     shl             v4.8h,   v4.8h,   #2       // (q0 - p0) * 4
390     shl             v5.8h,   v5.8h,   #2
391     uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
392     uxtl            v24.4s,  v24.4h
393     uaddw           v4.8h,   v4.8h,   v18.8b   // + p1
394     uaddw2          v5.8h,   v5.8h,   v18.16b
395     cmhi            v26.16b, v22.16b, v26.16b  // < alpha
396     usubw           v4.8h,   v4.8h,   v2.8b    // - q1
397     usubw2          v5.8h,   v5.8h,   v2.16b
398     sli             v24.4s,  v24.4s,  #16      // v24 = tc[0] x4, tc[1] x4, tc[2] x4, tc[3] x4
399     dup             v22.16b, w3              // beta
400     rshrn           v4.8b,   v4.8h,   #3       // delta = (... + 4) >> 3, narrowed to bytes
401     rshrn2          v4.16b,  v5.8h,   #3
402     cmhi            v28.16b, v22.16b, v28.16b  // < beta
403     cmhi            v30.16b, v22.16b, v30.16b  // < beta
404     smin            v4.16b,  v4.16b,  v24.16b  // delta = min(delta, tc)
405     neg             v25.16b, v24.16b           // -tc
406     and             v26.16b, v26.16b, v28.16b
407     smax            v4.16b,  v4.16b,  v25.16b  // delta = max(delta, -tc)
408     and             v26.16b, v26.16b, v30.16b  // v26 = final per-lane filter mask
409     uxtl            v22.8h,  v0.8b             // widen q0
410     uxtl2           v23.8h,  v0.16b
411     and             v4.16b,  v4.16b,  v26.16b  // delta = 0 on lanes that are not filtered
412     uxtl            v28.8h,  v16.8b            // widen p0
413     uxtl2           v29.8h,  v16.16b
414     saddw           v28.8h,  v28.8h,  v4.8b    // p0 + delta
415     saddw2          v29.8h,  v29.8h,  v4.16b
416     ssubw           v22.8h,  v22.8h,  v4.8b    // q0 - delta
417     ssubw2          v23.8h,  v23.8h,  v4.16b
418     sqxtun          v16.8b,  v28.8h            // saturate to unsigned bytes: p0'
419     sqxtun          v0.8b,   v22.8h            // q0'
420     sqxtun2         v16.16b, v29.8h
421     sqxtun2         v0.16b,  v23.8h
422 .endm
423
// x264_deblock_v_chroma_neon -- non-intra chroma filter over 16 bytes per
// row, with the p/q samples taken from consecutive rows.
// Register use as read from the code: x0 = address of the q0 row, x1 = byte
// distance between rows, w2/w3/x4 as consumed by h264_loop_filter_start.
// Reads rows x0 - 2*x1 .. x0 + x1 and rewrites rows x0 - x1 and x0.
424 function x264_deblock_v_chroma_neon, export=1
425     h264_loop_filter_start                          // may return early
426
427     sub             x0,  x0,  x1, lsl #1            // back to the p1 row
428     ld1             {v18.16b}, [x0], x1             // p1
429     ld1             {v16.16b}, [x0], x1             // p0
430     ld1             {v0.16b},  [x0], x1             // q0
431     ld1             {v2.16b},  [x0]                 // q1
432
433     h264_loop_filter_chroma
434
435     sub             x0,  x0,  x1, lsl #1            // from the q1 row back to the p0 row
436     st1             {v16.16b}, [x0], x1             // p0'
437     st1             {v0.16b},  [x0], x1             // q0'
438
439     ret
440 endfunc
441
// x264_deblock_h_chroma_neon -- non-intra chroma filter where the p/q samples
// lie next to each other within a row; 8 rows of 8 bytes are processed.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2/w3/x4 as consumed by
// h264_loop_filter_start.
// The label deblock_h_chroma is a second entry point used by
// x264_deblock_h_chroma_422_neon. Code entering there must already have
// x0 reduced by 4, v24.s[0] loaded, and w2/w3 holding alpha/beta; it leaves
// through the final `ret`.
// transpose4x8.h is a macro defined outside this file section (presumably
// asm.S); it is used here to turn the 16-bit pairs of each row into the
// p1/p0/q0/q1 vectors and back -- confirm the layout there.
442 function x264_deblock_h_chroma_neon, export=1
443     h264_loop_filter_start                          // may return early
444
445     sub             x0,  x0,  #4                    // start 4 bytes before the edge
446 deblock_h_chroma:
447     ld1             {v18.d}[0], [x0], x1            // rows 0-3 fill the low halves
448     ld1             {v16.d}[0], [x0], x1
449     ld1             {v0.d}[0],  [x0], x1
450     ld1             {v2.d}[0],  [x0], x1
451     ld1             {v18.d}[1], [x0], x1            // rows 4-7 fill the high halves
452     ld1             {v16.d}[1], [x0], x1
453     ld1             {v0.d}[1],  [x0], x1
454     ld1             {v2.d}[1],  [x0], x1
455
456     transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31  // rows -> p1, p0, q0, q1; v28-v31 scratch
457
458     h264_loop_filter_chroma
459
460     transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31  // back to row order
461
462     sub             x0,  x0,  x1, lsl #3            // rewind the 8 rows
463     st1             {v18.d}[0], [x0], x1
464     st1             {v16.d}[0], [x0], x1
465     st1             {v0.d}[0],  [x0], x1
466     st1             {v2.d}[0],  [x0], x1
467     st1             {v18.d}[1], [x0], x1
468     st1             {v16.d}[1], [x0], x1
469     st1             {v0.d}[1],  [x0], x1
470     st1             {v2.d}[1],  [x0], x1
471
472     ret
473 endfunc
474
// x264_deblock_h_chroma_422_neon -- runs the 8-row code at deblock_h_chroma
// twice over 16 rows: x1 is doubled, so the first pass covers rows
// 0, 2, .., 14 (starting at x0) and the second pass rows 1, 3, .., 15
// (starting at x5 = x0 + original x1).
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2/w3/x4 as consumed by
// h264_loop_filter_start.
// Clobbers x5 and x7 in addition to what deblock_h_chroma uses.
475 function x264_deblock_h_chroma_422_neon, export=1
476     add             x5,  x0,  x1                    // row 1, start of the second pass
477     sub             x0,  x0,  #4                    // offset expected at deblock_h_chroma
478     add             x1,  x1,  x1                    // step over every second row
479     h264_loop_filter_start                          // may return early; leaves the packed bytes in w6
480     mov             x7,  x30                        // bl below overwrites the return address
481     bl              deblock_h_chroma                // pass 1; its ret comes back here
482     mov             x30, x7
483     sub             x0,  x5,  #4
484     mov             v24.s[0], w6                    // v24 was overwritten by the filter macro
485     b               deblock_h_chroma                // pass 2 as a tail call; its ret returns to our caller
486 endfunc
487
// h264_loop_filter_chroma8 -- 8-lane version of the non-intra chroma filter;
// only p0 and q0 are modified.
// Inputs:   v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes), w2 = alpha,
//           w3 = beta, v24.s[0] = the four packed bytes stored by
//           h264_loop_filter_start (used as the clamp bound tc, each byte
//           replicated into two lanes).
// Outputs:  v16 = p0', v17 = q0' (low 8 bytes).
// Clobbers: v4, v22, v24-v26, v28, v30.
488 .macro h264_loop_filter_chroma8
489     dup             v22.8b,  w2                 // alpha
490     uxtl            v24.8h,  v24.8b             // with the sli below: replicate each tc byte 2x
491     uabd            v26.8b,  v16.8b,  v17.8b    // abs(p0 - q0)
492     uxtl            v4.8h,   v17.8b             // delta: start from q0 (16 bit)
493     uabd            v28.8b,  v18.8b,  v16.8b    // abs(p1 - p0)
494     usubw           v4.8h,   v4.8h,   v16.8b    // q0 - p0
495     sli             v24.8h,  v24.8h,  #8        // low 8 bytes = tc[0] x2, tc[1] x2, tc[2] x2, tc[3] x2
496     shl             v4.8h,   v4.8h,   #2        // (q0 - p0) * 4
497     uabd            v30.8b,  v19.8b,  v17.8b    // abs(q1 - q0)
498     uaddw           v4.8h,   v4.8h,   v18.8b    // + p1
499     cmhi            v26.8b,  v22.8b,  v26.8b    // < alpha
500     usubw           v4.8h,   v4.8h,   v19.8b    // - q1
501     dup             v22.8b,  w3                 // beta
502     rshrn           v4.8b,   v4.8h,   #3        // delta = (... + 4) >> 3, narrowed to bytes
503     cmhi            v28.8b,  v22.8b,  v28.8b    // < beta
504     cmhi            v30.8b,  v22.8b,  v30.8b    // < beta
505     smin            v4.8b,   v4.8b,   v24.8b    // delta = min(delta, tc)
506     neg             v25.8b,  v24.8b             // -tc
507     and             v26.8b,  v26.8b,  v28.8b
508     smax            v4.8b,   v4.8b,   v25.8b    // delta = max(delta, -tc)
509     and             v26.8b,  v26.8b,  v30.8b    // v26 = final per-lane filter mask
510     uxtl            v22.8h,  v17.8b             // widen q0
511     and             v4.8b,   v4.8b,   v26.8b    // delta = 0 on lanes that are not filtered
512     uxtl            v28.8h,  v16.8b             // widen p0
513     saddw           v28.8h,  v28.8h,  v4.8b     // p0 + delta
514     ssubw           v22.8h,  v22.8h,  v4.8b     // q0 - delta
515     sqxtun          v16.8b,  v28.8h             // saturate to unsigned bytes: p0'
516     sqxtun          v17.8b,  v22.8h             // q0'
517 .endm
518
// x264_deblock_h_chroma_mbaff_neon -- non-intra chroma filter across an edge
// inside each row, processing 4 rows of 8 bytes.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2/w3/x4 as consumed by
// h264_loop_filter_start.
// Loads bytes x0 - 4 .. x0 + 3 of each row and rewrites bytes x0 - 2 ..
// x0 + 1 (one 16-bit element of p0' followed by one of q0') per row.
// transpose4x4.h is a macro defined outside this file section (presumably
// asm.S) -- confirm its lane layout there.
519 function x264_deblock_h_chroma_mbaff_neon, export=1
520     h264_loop_filter_start                          // may return early; [x4] has been read by now
521
522     sub             x4,  x0,  #4                    // x4 reused as the load pointer
523     sub             x0,  x0,  #2                    // x0 becomes the store pointer
524
525     ld1             {v18.8b}, [x4], x1
526     ld1             {v16.8b}, [x4], x1
527     ld1             {v17.8b},  [x4], x1
528     ld1             {v19.8b},  [x4]
529
530     transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31  // rows -> p1, p0, q0, q1; v28-v31 scratch
531
532     h264_loop_filter_chroma8
533
534     st2             {v16.h,v17.h}[0], [x0], x1      // element n of p0' and q0' -> row n
535     st2             {v16.h,v17.h}[1], [x0], x1
536     st2             {v16.h,v17.h}[2], [x0], x1
537     st2             {v16.h,v17.h}[3], [x0]
538
539     ret
540 endfunc
541
// h264_loop_filter_chroma_intra -- intra chroma filter; only p0 and q0 are
// modified.
// Parameter width (default 16): with 16, the new values are computed for
//           all 16 byte lanes; with any other value only the low 8 lanes
//           are computed (the *2 instructions are left out).
// Inputs:   v18 = p1, v16 = p0, v17 = q0, v19 = q1; v30 = alpha and
//           v31 = beta in every byte lane, as set up by
//           h264_loop_filter_start_intra.
// Outputs:  v16 = p0' = (2*p1 + p0 + q1 + 2) >> 2 and
//           v17 = q0' = (2*q1 + q0 + p1 + 2) >> 2 on lanes where
//           abs(p0-q0) < alpha, abs(p1-p0) < beta and abs(q1-q0) < beta.
// Clobbers: v4, v6, v20, v22, v24-v28 (plus v5, v7, v21, v23 for width 16).
// The compares and the final bit instructions always operate on all 16
// lanes, so with a width other than 16 the upper 8 bytes of v16/v17 must not
// be used by the caller afterwards.
542 .macro h264_loop_filter_chroma_intra width=16
543     uabd            v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
544     uabd            v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
545     uabd            v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
546     cmhi            v26.16b, v30.16b, v26.16b  // < alpha
547     cmhi            v27.16b, v31.16b, v27.16b  // < beta
548     cmhi            v28.16b, v31.16b, v28.16b  // < beta
549     and             v26.16b, v26.16b, v27.16b
550     and             v26.16b, v26.16b, v28.16b  // v26 = final per-lane filter mask
551
552     ushll           v4.8h,   v18.8b,  #1       // 2*p1
553     ushll           v6.8h,   v19.8b,  #1       // 2*q1
554 .ifc \width, 16
555     ushll2          v5.8h,   v18.16b, #1
556     ushll2          v7.8h,   v19.16b, #1
557     uaddl2          v21.8h,  v16.16b, v19.16b
558     uaddl2          v23.8h,  v17.16b, v18.16b
559 .endif
560     uaddl           v20.8h,  v16.8b,  v19.8b   // p0 + q1
561     uaddl           v22.8h,  v17.8b,  v18.8b   // q0 + p1
562     add             v20.8h,  v20.8h,  v4.8h     // mlal?
563     add             v22.8h,  v22.8h,  v6.8h
564 .ifc \width, 16
565     add             v21.8h,  v21.8h,  v5.8h
566     add             v23.8h,  v23.8h,  v7.8h
567 .endif
568     uqrshrn         v24.8b,  v20.8h,  #2       // rounded >> 2, narrowed: p0'
569     uqrshrn         v25.8b,  v22.8h,  #2       // q0'
570 .ifc \width, 16
571     uqrshrn2        v24.16b, v21.8h,  #2
572     uqrshrn2        v25.16b, v23.8h,  #2
573 .endif
574     bit             v16.16b, v24.16b, v26.16b  // take the new value where the mask is set
575     bit             v17.16b, v25.16b, v26.16b
576 .endm
577
// x264_deblock_v_chroma_intra_neon -- intra chroma filter over 16 bytes per
// row, with the p/q samples taken from consecutive rows.
// Register use as read from the code: x0 = address of the q0 row, x1 = byte
// distance between rows, w2 = alpha, w3 = beta.
// Reads rows x0 - 2*x1 .. x0 + x1 and rewrites rows x0 - x1 and x0.
578 function x264_deblock_v_chroma_intra_neon, export=1
579     h264_loop_filter_start_intra                    // may return early
580
581     sub             x0,  x0,  x1, lsl #1            // back to the p1 row
582     ld1             {v18.16b}, [x0], x1             // p1
583     ld1             {v16.16b}, [x0], x1             // p0
584     ld1             {v17.16b}, [x0], x1             // q0
585     ld1             {v19.16b}, [x0]                 // q1
586
587     h264_loop_filter_chroma_intra
588
589     sub             x0,  x0,  x1, lsl #1            // from the q1 row back to the p0 row
590     st1             {v16.16b}, [x0], x1             // p0'
591     st1             {v17.16b}, [x0], x1             // q0'
592
593     ret
594 endfunc
595
// x264_deblock_h_chroma_intra_mbaff_neon -- intra chroma filter across an
// edge inside each row, processing 4 rows of 8 bytes.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2 = alpha, w3 = beta.
// Loads bytes x0 - 4 .. x0 + 3 of each row and rewrites bytes x0 - 2 ..
// x0 + 1 per row. Only the low 8 bytes of v16/v17 are computed (width=8) and
// only elements 0-3 of them are stored.
// transpose4x4.h is a macro defined outside this file section (presumably
// asm.S) -- confirm its lane layout there.
596 function x264_deblock_h_chroma_intra_mbaff_neon, export=1
597     h264_loop_filter_start_intra                    // may return early
598
599     sub             x4,  x0,  #4                    // x4 = load pointer
600     sub             x0,  x0,  #2                    // x0 = store pointer
601     ld1             {v18.8b}, [x4], x1
602     ld1             {v16.8b}, [x4], x1
603     ld1             {v17.8b}, [x4], x1
604     ld1             {v19.8b}, [x4], x1
605
606     transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29  // rows -> p1, p0, q0, q1; v26-v29 scratch
607
608     h264_loop_filter_chroma_intra width=8
609
610     st2             {v16.h,v17.h}[0], [x0], x1      // element n of p0' and q0' -> row n
611     st2             {v16.h,v17.h}[1], [x0], x1
612     st2             {v16.h,v17.h}[2], [x0], x1
613     st2             {v16.h,v17.h}[3], [x0], x1
614
615     ret
616 endfunc
617
// x264_deblock_h_chroma_intra_neon -- intra chroma filter across an edge
// inside each row, processing 8 rows of 8 bytes.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2 = alpha, w3 = beta.
// Loads bytes x0 - 4 .. x0 + 3 of each row and rewrites bytes x0 - 2 ..
// x0 + 1 per row.
// transpose4x8.h is a macro defined outside this file section (presumably
// asm.S) -- confirm its lane layout there.
618 function x264_deblock_h_chroma_intra_neon, export=1
619     h264_loop_filter_start_intra                    // may return early
620
621     sub             x4,  x0,  #4                    // x4 = load pointer
622     sub             x0,  x0,  #2                    // x0 = store pointer
623     ld1             {v18.d}[0], [x4], x1            // rows 0-3 fill the low halves
624     ld1             {v16.d}[0], [x4], x1
625     ld1             {v17.d}[0], [x4], x1
626     ld1             {v19.d}[0], [x4], x1
627     ld1             {v18.d}[1], [x4], x1            // rows 4-7 fill the high halves
628     ld1             {v16.d}[1], [x4], x1
629     ld1             {v17.d}[1], [x4], x1
630     ld1             {v19.d}[1], [x4], x1
631
632     transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29  // rows -> p1, p0, q0, q1; v26-v29 scratch
633
634     h264_loop_filter_chroma_intra
635
636     st2             {v16.h,v17.h}[0], [x0], x1      // element n of p0' and q0' -> row n
637     st2             {v16.h,v17.h}[1], [x0], x1
638     st2             {v16.h,v17.h}[2], [x0], x1
639     st2             {v16.h,v17.h}[3], [x0], x1
640     st2             {v16.h,v17.h}[4], [x0], x1
641     st2             {v16.h,v17.h}[5], [x0], x1
642     st2             {v16.h,v17.h}[6], [x0], x1
643     st2             {v16.h,v17.h}[7], [x0], x1
644
645     ret
646 endfunc
647
// x264_deblock_h_chroma_422_intra_neon -- intra chroma filter across an edge
// inside each row, processing 16 rows as two consecutive batches of 8.
// Register use as read from the code: x0 = address of the first byte on the
// q side in row 0, x1 = byte distance between rows, w2 = alpha, w3 = beta.
// Loads bytes x0 - 4 .. x0 + 3 of each row and rewrites bytes x0 - 2 ..
// x0 + 1 per row.
// The second batch relies on the post-increments of the first one: after 8
// loads x4 points at row 8, and after 8 stores x0 does too. v30/v31
// (alpha/beta) are not modified by h264_loop_filter_chroma_intra and are
// reused for the second batch.
// transpose4x8.h is a macro defined outside this file section (presumably
// asm.S) -- confirm its lane layout there.
648 function x264_deblock_h_chroma_422_intra_neon, export=1
649     h264_loop_filter_start_intra                    // may return early
650
651     sub             x4,  x0,  #4                    // x4 = load pointer
652     sub             x0,  x0,  #2                    // x0 = store pointer
653     ld1             {v18.d}[0], [x4], x1            // rows 0-7
654     ld1             {v16.d}[0], [x4], x1
655     ld1             {v17.d}[0], [x4], x1
656     ld1             {v19.d}[0], [x4], x1
657     ld1             {v18.d}[1], [x4], x1
658     ld1             {v16.d}[1], [x4], x1
659     ld1             {v17.d}[1], [x4], x1
660     ld1             {v19.d}[1], [x4], x1
661
662     transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29  // rows -> p1, p0, q0, q1; v26-v29 scratch
663
664     h264_loop_filter_chroma_intra
665
666     st2             {v16.h,v17.h}[0], [x0], x1      // element n of p0' and q0' -> row n
667     st2             {v16.h,v17.h}[1], [x0], x1
668     st2             {v16.h,v17.h}[2], [x0], x1
669     st2             {v16.h,v17.h}[3], [x0], x1
670     st2             {v16.h,v17.h}[4], [x0], x1
671     st2             {v16.h,v17.h}[5], [x0], x1
672     st2             {v16.h,v17.h}[6], [x0], x1
673     st2             {v16.h,v17.h}[7], [x0], x1
674
675     ld1             {v18.d}[0], [x4], x1            // rows 8-15
676     ld1             {v16.d}[0], [x4], x1
677     ld1             {v17.d}[0], [x4], x1
678     ld1             {v19.d}[0], [x4], x1
679     ld1             {v18.d}[1], [x4], x1
680     ld1             {v16.d}[1], [x4], x1
681     ld1             {v17.d}[1], [x4], x1
682     ld1             {v19.d}[1], [x4], x1
683
684     transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
685
686     h264_loop_filter_chroma_intra
687
688     st2             {v16.h,v17.h}[0], [x0], x1
689     st2             {v16.h,v17.h}[1], [x0], x1
690     st2             {v16.h,v17.h}[2], [x0], x1
691     st2             {v16.h,v17.h}[3], [x0], x1
692     st2             {v16.h,v17.h}[4], [x0], x1
693     st2             {v16.h,v17.h}[5], [x0], x1
694     st2             {v16.h,v17.h}[6], [x0], x1
695     st2             {v16.h,v17.h}[7], [x0], x1
696
697     ret
698 endfunc
699
700 //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
701 //                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
702 //                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
703 //                                uint8_t bs[2][8][4], int mvy_limit,
704 //                                int bframe )
//
// x264_deblock_strength_neon
// Arguments, taking the C prototype above in AAPCS64 order:
//   x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe
// v4 and v5 accumulate, per byte position, a non-zero value wherever the
// compared ref bytes differ or a motion-vector difference reaches its
// threshold. After the loop they are reduced to 0/1, combined with the nnz
// test (0/2) by taking the maximum, and written as 16 bytes each:
// v5 to bs + 32 and v4, byte-transposed through transpose_table, to bs + 0.
// The code between `bframe:` and `b.eq bframe` runs once, and a second time
// only if w5 == 1 on entry; x1 (40 bytes per pass) and x2 (160 bytes per
// pass) simply keep advancing, so the second pass reads the data that
// follows the first.
// `unzip` and `movrel` are macros defined outside this file section
// (presumably asm.S); the neighbour layout produced by the ext/unzip
// shuffles is not spelled out here -- confirm against the C version.
705 function x264_deblock_strength_neon, export=1
706     movi        v4.16b, #0                  // accumulator, stored to bs + 0 at the end
707     lsl         w4,  w4,  #8
708     add         x3,  x3,  #32               // first store goes to bs + 32
709     sub         w4,  w4,  #(1<<8)-3         // = ((mvy_limit - 1) << 8) + 3
710     movi        v5.16b, #0                  // accumulator, stored to bs + 32 at the end
711     dup         v6.8h,  w4                  // byte thresholds per 16-bit lane: 3 (low), mvy_limit - 1 (high)
712     mov         x6,  #-32                   // post-index step from bs + 32 back to bs
713
714 bframe:
715     // load bytes ref
716     add         x2,  x2,  #16               // skip the first 16 bytes of this pass's mv data
717     ld1        {v31.d}[1], [x1], #8         // first 8 ref bytes -> high half of v31
718     ld1        {v1.16b}, [x1], #16
719     movi        v0.16b,  #0
720     ld1        {v2.16b}, [x1], #16
721     ext         v3.16b,  v0.16b,  v1.16b,  #15  // v3[i] = v1[i-1], v3[0] = 0
722     ext         v0.16b,  v0.16b,  v2.16b,  #15  // v0[i] = v2[i-1], v0[0] = 0
723     unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
724     unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
725     ext         v21.16b, v31.16b, v22.16b, #12  // last 4 bytes of v31 followed by v22[0..11]
726
727     eor         v0.16b,  v20.16b, v22.16b   // non-zero where the compared ref bytes differ
728     eor         v1.16b,  v21.16b, v22.16b
729     orr         v4.16b,  v4.16b,  v0.16b
730     orr         v5.16b,  v5.16b,  v1.16b
731
732     ld1        {v21.8h}, [x2], #16      // mv + 0x10
733     ld1        {v19.8h}, [x2], #16      // mv + 0x20
734     ld1        {v22.8h}, [x2], #16      // mv + 0x30
735     ld1        {v18.8h}, [x2], #16      // mv + 0x40
736     ld1        {v23.8h}, [x2], #16      // mv + 0x50
737     ext         v19.16b, v19.16b, v22.16b, #12  // last 4 bytes of the previous vector + first 12 of this one
738     ext         v18.16b, v18.16b, v23.16b, #12
739     sabd        v0.8h,   v22.8h,  v19.8h        // abs difference of the 16-bit components
740     ld1        {v19.8h}, [x2], #16      // mv + 0x60
741     sabd        v1.8h,   v23.8h,  v18.8h
742     ld1        {v24.8h}, [x2], #16      // mv + 0x70
743     uqxtn       v0.8b,   v0.8h                  // saturate each difference to one byte
744     ld1        {v18.8h}, [x2], #16      // mv + 0x80
745     ld1        {v25.8h}, [x2], #16      // mv + 0x90
746     uqxtn2      v0.16b,  v1.8h
747     ext         v19.16b, v19.16b, v24.16b, #12
748     ext         v18.16b, v18.16b, v25.16b, #12
749     sabd        v1.8h,   v24.8h,  v19.8h
750     sabd        v2.8h,   v25.8h,  v18.8h
751     uqxtn       v1.8b,   v1.8h
752     uqxtn2      v1.16b,  v2.8h
753
754     uqsub       v0.16b,  v0.16b,  v6.16b        // non-zero where a difference exceeds its byte threshold
755     uqsub       v1.16b,  v1.16b,  v6.16b
756     uqxtn       v0.8b,   v0.8h                  // fold each byte pair into one byte (non-zero if either was)
757     uqxtn2      v0.16b,  v1.8h
758
759     sabd        v1.8h,   v22.8h,  v23.8h        // differences between the vectors loaded 0x20 apart
760     orr         v4.16b,  v4.16b,  v0.16b
761
762     sabd        v0.8h,   v21.8h,  v22.8h
763     sabd        v2.8h,   v23.8h,  v24.8h
764     sabd        v3.8h,   v24.8h,  v25.8h
765     uqxtn       v0.8b,   v0.8h
766     uqxtn2      v0.16b,  v1.8h
767     uqxtn       v1.8b,   v2.8h
768     uqxtn2      v1.16b,  v3.8h
769
770     uqsub       v0.16b,  v0.16b,  v6.16b
771     uqsub       v1.16b,  v1.16b,  v6.16b
772     uqxtn       v0.8b,   v0.8h
773     uqxtn2      v0.16b,  v1.8h
774     subs        w5,  w5,  #1
775     orr         v5.16b,  v5.16b,  v0.16b
776     b.eq        bframe                          // taken only when w5 was 1 before the subs
777
778     movi        v6.16b, #1                      // thresholds are no longer needed; v6 = 1 in every byte
779     // load bytes nnz
780     ld1        {v31.d}[1], [x0], #8
781     ld1        {v1.16b}, [x0], #16
782     movi        v0.16b,  #0
783     ld1        {v2.16b}, [x0], #16
784     ext         v3.16b,  v0.16b,  v1.16b,  #15  // same shuffle as for the ref bytes above
785     ext         v0.16b,  v0.16b,  v2.16b,  #15
786     unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
787     unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
788     ext         v21.16b, v31.16b, v22.16b, #12
789
790     movrel      x7,  transpose_table
791     ld1        {v7.16b}, [x7]
792     orr         v0.16b,  v20.16b, v22.16b       // non-zero if either compared nnz byte is non-zero
793     orr         v1.16b,  v21.16b, v22.16b
794     umin        v0.16b,  v0.16b,  v6.16b
795     umin        v1.16b,  v1.16b,  v6.16b
796     umin        v4.16b,  v4.16b,  v6.16b        // mv ? 1 : 0
797     umin        v5.16b,  v5.16b,  v6.16b
798     add         v0.16b,  v0.16b,  v0.16b        // nnz ? 2 : 0
799     add         v1.16b,  v1.16b,  v1.16b
800     umax        v4.16b,  v4.16b,  v0.16b        // final strength: 0, 1 or 2
801     umax        v5.16b,  v5.16b,  v1.16b
802     tbl         v6.16b, {v4.16b}, v7.16b        // byte-transpose v4 as a 4x4 matrix
803     st1        {v5.16b}, [x3], x6       // bs[1]
804     st1        {v6.16b}, [x3]           // bs[0]
805     ret
806 endfunc
807
// transpose_table -- 16 byte indices for the tbl instruction in
// x264_deblock_strength_neon: output byte 4*r + c takes input byte 4*c + r,
// i.e. a 16-byte vector is transposed as a 4x4 byte matrix.
808 const transpose_table
809     .byte 0, 4,  8, 12
810     .byte 1, 5,  9, 13
811     .byte 2, 6, 10, 14
812     .byte 3, 7, 11, 15
813 endconst