]> git.sesse.net Git - x264/blob - common/aarch64/deblock-a.S
e051a6a142fa8fe7e7f9738a1331c4266ecec600
[x264] / common / aarch64 / deblock-a.S
/*****************************************************************************
 * deblock.S: aarch64 deblocking
 *****************************************************************************
 * Copyright (C) 2009-2015 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
26
27 #include "asm.S"
28
// h264_loop_filter_start: entry check shared by the non-intra filters.
//
// In:    w2 = alpha, w3 = beta
//        x4 = address of a 32-bit word (presumably the four int8 tc0
//             values -- confirm against the C prototype)
// Out:   v24.s[0] = the word loaded from [x4]
// Clobb: w6, NZCV
//
// Executes `ret` in the expanding function when alpha == 0, when
// beta == 0, or when all four bytes of the loaded word have their sign
// bit set.  Otherwise execution continues after local label 2.
.macro h264_loop_filter_start
    cmp             w2,  #0                     // Z = (alpha == 0)
    ldr             w6,  [x4]
    // alpha != 0: flags = compare(beta, 0).
    // alpha == 0: flags = NZCV immediate.  It must be #4 (Z set) for the
    // b.eq below to catch alpha == 0 as well; with #0 that case fell
    // through and ran the whole filter with an empty lane mask.
    ccmp            w3,  #0, #4, ne
    mov             v24.s[0], w6
    and             w6,  w6,  w6,  lsl #16      // non-flag-setting: b.eq still sees cmp/ccmp
    b.eq            1f
    ands            w6,  w6,  w6,  lsl #8       // bit 31 = AND of the four byte sign bits
    b.ge            2f                          // at least one byte is >= 0: go filter
1:
    ret
2:
.endm
42
// h264_loop_filter_luma: clipped luma edge filter on 16 pixels at once.
//
// In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//        w2 = alpha, w3 = beta
//        v24.s[0] = four signed clip bytes (tc); each one is applied to
//                   four adjacent byte lanes
// Out:   v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1
// Clobb: v4, v20-v24, v28, v30
//
// The instruction stream interleaves several independent computations;
// the order is kept exactly as scheduled.
.macro h264_loop_filter_luma
    dup             v22.16b, w2                     // alpha
    uxtl            v24.8h,  v24.8b
    uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
    uxtl            v24.4s,  v24.4h                 // one tc byte per 32-bit lane
    uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
    sli             v24.8h,  v24.8h,  #8
    uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
    sli             v24.4s,  v24.4s,  #16           // each tc byte now fills 4 byte lanes
    cmhi            v21.16b, v22.16b, v21.16b       // < alpha
    dup             v22.16b, w3                     // beta
    cmlt            v23.16b, v24.16b, #0            // lanes whose tc is negative
    cmhi            v28.16b, v22.16b, v28.16b       // < beta
    cmhi            v30.16b, v22.16b, v30.16b       // < beta
    bic             v21.16b, v21.16b, v23.16b       // never filter lanes with tc < 0
    uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
    and             v21.16b, v21.16b, v28.16b
    uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
    cmhi            v17.16b, v22.16b, v17.16b       // < beta
    and             v21.16b, v21.16b, v30.16b       // v21 = lanes to filter
    cmhi            v19.16b, v22.16b, v19.16b       // < beta
    and             v17.16b, v17.16b, v21.16b       // lanes where p1 is rewritten
    and             v19.16b, v19.16b, v21.16b       // lanes where q1 is rewritten
    and             v24.16b, v24.16b, v21.16b       // tc, zero in unfiltered lanes
    urhadd          v28.16b, v16.16b,  v0.16b       // (p0 + q0 + 1) >> 1
    sub             v21.16b, v24.16b, v17.16b       // tc + 1 where v17 is set (mask = -1)
    uqadd           v23.16b, v18.16b, v24.16b       // p1 + tc
    uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
    sub             v21.16b, v21.16b, v19.16b       // ... + 1 where v19 is set
    uhadd           v28.16b,  v4.16b, v28.16b       // (q2 + avg) >> 1
    umin            v23.16b, v23.16b, v20.16b
    uqsub           v22.16b, v18.16b, v24.16b       // p1 - tc
    uqadd           v4.16b,   v2.16b, v24.16b       // q1 + tc
    umax            v23.16b, v23.16b, v22.16b       // p1 candidate clamped to p1 +/- tc
    uqsub           v22.16b,  v2.16b, v24.16b       // q1 - tc
    umin            v28.16b,  v4.16b, v28.16b
    uxtl            v4.8h,    v0.8b
    umax            v28.16b, v28.16b, v22.16b       // q1 candidate clamped to q1 +/- tc
    uxtl2           v20.8h,   v0.16b
    usubw           v4.8h,    v4.8h,  v16.8b        // q0 - p0
    usubw2          v20.8h,  v20.8h,  v16.16b
    shl             v4.8h,    v4.8h,  #2            // (q0 - p0) * 4
    shl             v20.8h,  v20.8h,  #2
    uaddw           v4.8h,    v4.8h,  v18.8b        // + p1
    uaddw2          v20.8h,  v20.8h,  v18.16b
    usubw           v4.8h,    v4.8h,   v2.8b        // - q1
    usubw2          v20.8h,  v20.8h,   v2.16b
    rshrn           v4.8b,    v4.8h,  #3            // delta = (sum + 4) >> 3
    rshrn2          v4.16b,  v20.8h,  #3
    bsl             v17.16b, v23.16b, v18.16b       // new p1 where selected, else p1
    bsl             v19.16b, v28.16b,  v2.16b       // new q1 where selected, else q1
    neg             v23.16b, v21.16b
    uxtl            v28.8h,  v16.8b
    smin            v4.16b,   v4.16b, v21.16b       // clamp delta to [-v21, v21]
    uxtl2           v21.8h,  v16.16b
    smax            v4.16b,   v4.16b, v23.16b
    uxtl            v22.8h,   v0.8b
    uxtl2           v24.8h,   v0.16b
    saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
    saddw2          v21.8h,  v21.8h,  v4.16b
    ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
    ssubw2          v24.8h,  v24.8h,  v4.16b
    sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned bytes
    sqxtun2         v16.16b, v21.8h
    sqxtun          v0.8b,   v22.8h
    sqxtun2         v0.16b,  v24.8h
.endm
110
// x264_deblock_v_luma_neon: filter a 16-pixel-wide luma edge that lies
// between two rows.
//
// Register use as seen in this function:
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// Reads the three rows below the edge (q0..q2) and the three above
// (p2..p0); writes back p1, p0, q0 and q1.
// Clobbers everything the two macros clobber, plus x0.
function x264_deblock_v_luma_neon, export=1
    h264_loop_filter_start

    ld1             {v0.16b},  [x0], x1         // q0
    ld1             {v2.16b},  [x0], x1         // q1
    ld1             {v4.16b},  [x0], x1         // q2
    sub             x0,  x0,  x1, lsl #2        // back 4 + 2 = 6 rows:
    sub             x0,  x0,  x1, lsl #1        //   x0 = start - 3 * stride
    ld1             {v20.16b},  [x0], x1        // p2
    ld1             {v18.16b},  [x0], x1        // p1
    ld1             {v16.16b},  [x0], x1        // p0

    h264_loop_filter_luma

    sub             x0,  x0,  x1, lsl #1        // x0 = p1 row
    st1             {v17.16b}, [x0], x1         // p1
    st1             {v16.16b}, [x0], x1         // p0
    st1             {v0.16b},  [x0], x1         // q0
    st1             {v19.16b}, [x0]             // q1

    ret
endfunc
133
// x264_deblock_h_luma_neon: filter a luma edge that lies between two
// columns, over 16 rows.
//
// Register use as seen in this function:
//   x0 = address of the first pixel right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// Loads 8 bytes (4 on each side of the edge) from each of 16 rows,
// transposes them into the register layout expected by
// h264_loop_filter_luma, filters, transposes the four result vectors
// back and stores 4 bytes per row starting 2 bytes left of the edge.
// transpose_8x16.b / transpose_4x16.b are macros defined outside this
// file section; v21, v23, v25, v27 are passed to them as scratch.
function x264_deblock_h_luma_neon, export=1
    h264_loop_filter_start

    sub             x0,  x0,  #4
    // rows 0-7 go to the low halves ...
    ld1             {v6.8b},  [x0], x1
    ld1             {v20.8b}, [x0], x1
    ld1             {v18.8b}, [x0], x1
    ld1             {v16.8b}, [x0], x1
    ld1             {v0.8b},  [x0], x1
    ld1             {v2.8b},  [x0], x1
    ld1             {v4.8b},  [x0], x1
    ld1             {v26.8b}, [x0], x1
    // ... rows 8-15 to the high halves of the same registers
    ld1             {v6.d}[1],  [x0], x1
    ld1             {v20.d}[1], [x0], x1
    ld1             {v18.d}[1], [x0], x1
    ld1             {v16.d}[1], [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    ld1             {v4.d}[1],  [x0], x1
    ld1             {v26.d}[1], [x0], x1

    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

    h264_loop_filter_luma

    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27

    sub             x0,  x0,  x1, lsl #4        // rewind the 16 rows just read
    add             x0,  x0,  #2                // start - 4 + 2: first of the 4 bytes written per row
    st1             {v17.s}[0],  [x0], x1
    st1             {v16.s}[0], [x0], x1
    st1             {v0.s}[0],  [x0], x1
    st1             {v19.s}[0], [x0], x1
    st1             {v17.s}[1],  [x0], x1
    st1             {v16.s}[1], [x0], x1
    st1             {v0.s}[1],  [x0], x1
    st1             {v19.s}[1], [x0], x1
    st1             {v17.s}[2],  [x0], x1
    st1             {v16.s}[2], [x0], x1
    st1             {v0.s}[2],  [x0], x1
    st1             {v19.s}[2], [x0], x1
    st1             {v17.s}[3],  [x0], x1
    st1             {v16.s}[3], [x0], x1
    st1             {v0.s}[3],  [x0], x1
    st1             {v19.s}[3], [x0], x1

    ret
endfunc
182
// h264_loop_filter_start_intra: entry check shared by the intra filters.
//
// In:    w2 = alpha, w3 = beta
// Out:   v30 = alpha in every byte lane, v31 = beta in every byte lane
// Clobb: w4 (NZCV is left untouched)
//
// Executes `ret` in the expanding function only when alpha and beta are
// both zero.
.macro h264_loop_filter_start_intra
    orr             w4,  w2,  w3
    cbnz            w4,  1f                    // anything non-zero: go filter
    ret
1:
    dup             v30.16b, w2                // alpha
    dup             v31.16b, w3                // beta
.endm
192
// h264_loop_filter_luma_intra: intra luma edge filter on 16 pixels.
//
// In:    v4 = p3, v5 = p2, v6 = p1, v7 = p0
//        v0 = q0, v1 = q1, v2 = q2, v3 = q3
//        v30 = alpha, v31 = beta (byte-replicated)
// Out:   v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in
//        place, lane by lane, where the conditions hold
// Clobb: x4, v16-v31
//
// Branches to local label 9 (which the expanding function must provide)
// when no lane passes the alpha/beta test.
//
// Per-lane conditions, named as in the comments below:
//   if_1 = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
//   if_2 = |p0-q0| < (alpha >> 2) + 2
//   if_3 = |p2-p0| < beta,  if_4 = |q2-q0| < beta
// "_1" results are the 3-tap values, "_2" results the stronger ones.
.macro h264_loop_filter_luma_intra
    uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
    uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
    uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
    cmhi            v19.16b, v30.16b, v16.16b       // < alpha
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta

    movi            v29.16b, #2
    ushr            v30.16b, v30.16b, #2            // alpha >> 2
    add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
    cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

    and             v19.16b, v19.16b, v17.16b
    and             v19.16b, v19.16b, v18.16b       // v19 = if_1
    shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
    mov             x4, v20.d[0]
    cbz             x4, 9f                          // no lane to filter

    // v20/v21 = 2*p1 + p0 + q1,  v22/v23 = 2*q1 + q0 + p1 (16-bit lanes)
    ushll           v20.8h,  v6.8b,   #1
    ushll           v22.8h,  v1.8b,   #1
    ushll2          v21.8h,  v6.16b,  #1
    ushll2          v23.8h,  v1.16b,  #1
    uaddw           v20.8h,  v20.8h,  v7.8b
    uaddw           v22.8h,  v22.8h,  v0.8b
    uaddw2          v21.8h,  v21.8h,  v7.16b
    uaddw2          v23.8h,  v23.8h,  v0.16b
    uaddw           v20.8h,  v20.8h,  v1.8b
    uaddw           v22.8h,  v22.8h,  v6.8b
    uaddw2          v21.8h,  v21.8h,  v1.16b
    uaddw2          v23.8h,  v23.8h,  v6.16b

    rshrn           v24.8b,  v20.8h,  #2 // p0'_1
    rshrn           v25.8b,  v22.8h,  #2 // q0'_1
    rshrn2          v24.16b, v21.8h,  #2 // p0'_1
    rshrn2          v25.16b, v23.8h,  #2 // q0'_1

    uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
    uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta

    and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
    and             v18.16b, v16.16b, v18.16b  // if_2 && if_4

    not             v30.16b, v17.16b
    not             v31.16b, v18.16b

    and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
    and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

    and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
    and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

    //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
    uaddl           v26.8h,  v5.8b,   v7.8b
    uaddl2          v27.8h,  v5.16b,  v7.16b
    uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
    uaddw2          v27.8h,  v27.8h,  v0.16b
    add             v20.8h,  v20.8h,  v26.8h
    add             v21.8h,  v21.8h,  v27.8h
    uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
    uaddw2          v21.8h,  v21.8h,  v0.16b
    rshrn           v20.8b,  v20.8h,  #3 // p0'_2
    rshrn2          v20.16b, v21.8h,  #3 // p0'_2
    uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
    uaddw2          v27.8h,  v27.8h,  v6.16b
    rshrn           v21.8b,  v26.8h,  #2 // p1'_2
    rshrn2          v21.16b, v27.8h,  #2 // p1'_2
    uaddl           v28.8h,  v4.8b,   v5.8b
    uaddl2          v29.8h,  v4.16b,  v5.16b
    shl             v28.8h,  v28.8h,  #1            // 2 * (p3 + p2)
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v19.8b,  v28.8h,  #3 // p2'_2
    rshrn2          v19.16b, v29.8h,  #3 // p2'_2

    //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
    uaddl           v26.8h,  v2.8b,   v0.8b
    uaddl2          v27.8h,  v2.16b,  v0.16b
    uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
    uaddw2          v27.8h,  v27.8h,  v7.16b
    add             v22.8h,  v22.8h,  v26.8h
    add             v23.8h,  v23.8h,  v27.8h
    uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
    uaddw2          v23.8h,  v23.8h,  v7.16b
    rshrn           v22.8b,  v22.8h,  #3 // q0'_2
    rshrn2          v22.16b, v23.8h,  #3 // q0'_2
    uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
    uaddw2          v27.8h,  v27.8h,  v1.16b
    rshrn           v23.8b,  v26.8h,  #2 // q1'_2
    rshrn2          v23.16b, v27.8h,  #2 // q1'_2
    uaddl           v28.8h,  v2.8b,   v3.8b
    uaddl2          v29.8h,  v2.16b,  v3.16b
    shl             v28.8h,  v28.8h,  #1            // 2 * (q2 + q3)
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v26.8b,  v28.8h,  #3 // q2'_2
    rshrn2          v26.16b, v29.8h,  #3 // q2'_2

    // insert each candidate only in the lanes selected by its mask
    bit             v7.16b,  v24.16b, v30.16b  // p0'_1
    bit             v0.16b,  v25.16b, v31.16b  // q0'_1
    bit             v7.16b, v20.16b,  v17.16b  // p0'_2
    bit             v6.16b, v21.16b,  v17.16b  // p1'_2
    bit             v5.16b, v19.16b,  v17.16b  // p2'_2
    bit             v0.16b, v22.16b,  v18.16b  // q0'_2
    bit             v1.16b, v23.16b,  v18.16b  // q1'_2
    bit             v2.16b, v26.16b,  v18.16b  // q2'_2
.endm
304
// x264_deblock_v_luma_intra_neon: intra-filter a 16-pixel-wide luma edge
// that lies between two rows.
//
// Register use as seen in this function:
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// Reads four rows on each side of the edge and writes back p2..q2.
// Label 9 is the target h264_loop_filter_luma_intra branches to when no
// pixel needs filtering; nothing is stored in that case.
function x264_deblock_v_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    ld1             {v0.16b},  [x0], x1 // q0
    ld1             {v1.16b},  [x0], x1 // q1
    ld1             {v2.16b},  [x0], x1 // q2
    ld1             {v3.16b},  [x0], x1 // q3
    sub             x0,  x0,  x1, lsl #3    // from start + 4 rows to start - 4 rows
    ld1             {v4.16b},  [x0], x1 // p3
    ld1             {v5.16b},  [x0], x1 // p2
    ld1             {v6.16b},  [x0], x1 // p1
    ld1             {v7.16b},  [x0]     // p0

    h264_loop_filter_luma_intra

    sub             x0,  x0,  x1, lsl #1    // x0 was left on the p0 row; step to p2
    st1             {v5.16b}, [x0], x1  // p2
    st1             {v6.16b}, [x0], x1  // p1
    st1             {v7.16b}, [x0], x1  // p0
    st1             {v0.16b}, [x0], x1  // q0
    st1             {v1.16b}, [x0], x1  // q1
    st1             {v2.16b}, [x0]      // q2
9:
    ret
endfunc
330
// x264_deblock_h_luma_intra_neon: intra-filter a luma edge that lies
// between two columns, over 16 rows.
//
// Register use as seen in this function:
//   x0 = address of the first pixel right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta
// Loads 8 bytes (4 on each side of the edge) from each of 16 rows,
// transposes, filters, transposes back with the same macro and rewrites
// all 8 bytes of every row.  transpose_8x16.b is a macro defined outside
// this file section; v21 and v23 are passed to it as scratch.
// Label 9 is the no-op exit used by h264_loop_filter_luma_intra.
function x264_deblock_h_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  #4
    // rows 0-7 in the low halves
    ld1             {v4.8b},  [x0], x1
    ld1             {v5.8b},  [x0], x1
    ld1             {v6.8b},  [x0], x1
    ld1             {v7.8b},  [x0], x1
    ld1             {v0.8b},  [x0], x1
    ld1             {v1.8b},  [x0], x1
    ld1             {v2.8b},  [x0], x1
    ld1             {v3.8b},  [x0], x1
    // rows 8-15 in the high halves
    ld1             {v4.d}[1],  [x0], x1
    ld1             {v5.d}[1],  [x0], x1
    ld1             {v6.d}[1],  [x0], x1
    ld1             {v7.d}[1],  [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v1.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    ld1             {v3.d}[1],  [x0], x1

    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    h264_loop_filter_luma_intra

    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    sub             x0,  x0,  x1, lsl #4        // rewind the 16 rows just read
    st1             {v4.8b},  [x0], x1
    st1             {v5.8b},  [x0], x1
    st1             {v6.8b},  [x0], x1
    st1             {v7.8b},  [x0], x1
    st1             {v0.8b},  [x0], x1
    st1             {v1.8b},  [x0], x1
    st1             {v2.8b},  [x0], x1
    st1             {v3.8b},  [x0], x1
    st1             {v4.d}[1],  [x0], x1
    st1             {v5.d}[1],  [x0], x1
    st1             {v6.d}[1],  [x0], x1
    st1             {v7.d}[1],  [x0], x1
    st1             {v0.d}[1],  [x0], x1
    st1             {v1.d}[1],  [x0], x1
    st1             {v2.d}[1],  [x0], x1
    st1             {v3.d}[1],  [x0], x1
9:
    ret
endfunc
378
// h264_loop_filter_chroma: clipped chroma edge filter on 16 byte lanes.
//
// In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1
//        w2 = alpha, w3 = beta
//        v24.s[0] = four signed clip bytes (tc); each one is applied to
//                   four adjacent byte lanes
// Out:   v16 = new p0, v0 = new q0
// Clobb: v4, v5, v22-v26, v28-v30
//
// Only p0 and q0 change.  Lanes failing the alpha/beta test get a zero
// delta and therefore keep their value.
.macro h264_loop_filter_chroma
    dup             v22.16b, w2              // alpha
    uxtl            v24.8h,  v24.8b
    uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
    uxtl            v4.8h,   v0.8b
    uxtl2           v5.8h,   v0.16b
    uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
    usubw           v4.8h,   v4.8h,   v16.8b   // q0 - p0
    usubw2          v5.8h,   v5.8h,   v16.16b
    sli             v24.8h,  v24.8h,  #8       // each tc byte doubled
    shl             v4.8h,   v4.8h,   #2       // (q0 - p0) * 4
    shl             v5.8h,   v5.8h,   #2
    uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
    uxtl            v24.4s,  v24.4h
    uaddw           v4.8h,   v4.8h,   v18.8b   // + p1
    uaddw2          v5.8h,   v5.8h,   v18.16b
    cmhi            v26.16b, v22.16b, v26.16b  // < alpha
    usubw           v4.8h,   v4.8h,   v2.8b    // - q1
    usubw2          v5.8h,   v5.8h,   v2.16b
    sli             v24.4s,  v24.4s,  #16      // each tc byte now fills 4 byte lanes
    dup             v22.16b, w3              // beta
    rshrn           v4.8b,   v4.8h,   #3       // delta = (sum + 4) >> 3
    rshrn2          v4.16b,  v5.8h,   #3
    cmhi            v28.16b, v22.16b, v28.16b  // < beta
    cmhi            v30.16b, v22.16b, v30.16b  // < beta
    smin            v4.16b,  v4.16b,  v24.16b  // clamp delta to [-tc, tc]
    neg             v25.16b, v24.16b
    and             v26.16b, v26.16b, v28.16b
    smax            v4.16b,  v4.16b,  v25.16b
    and             v26.16b, v26.16b, v30.16b  // v26 = lanes to filter
    uxtl            v22.8h,  v0.8b
    uxtl2           v23.8h,  v0.16b
    and             v4.16b,  v4.16b,  v26.16b  // zero delta in unfiltered lanes
    uxtl            v28.8h,  v16.8b
    uxtl2           v29.8h,  v16.16b
    saddw           v28.8h,  v28.8h,  v4.8b    // p0 + delta
    saddw2          v29.8h,  v29.8h,  v4.16b
    ssubw           v22.8h,  v22.8h,  v4.8b    // q0 - delta
    ssubw2          v23.8h,  v23.8h,  v4.16b
    sqxtun          v16.8b,  v28.8h            // saturate back to unsigned bytes
    sqxtun          v0.8b,   v22.8h
    sqxtun2         v16.16b, v29.8h
    sqxtun2         v0.16b,  v23.8h
.endm
423
// x264_deblock_v_chroma_neon: filter a 16-byte-wide chroma edge that
// lies between two rows.
//
// Register use as seen in this function:
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// Reads p1, p0, q0, q1 and writes back p0 and q0.
function x264_deblock_v_chroma_neon, export=1
    h264_loop_filter_start

    sub             x0,  x0,  x1, lsl #1        // x0 = p1 row
    ld1             {v18.16b}, [x0], x1         // p1
    ld1             {v16.16b}, [x0], x1         // p0
    ld1             {v0.16b},  [x0], x1         // q0
    ld1             {v2.16b},  [x0]             // q1 (x0 stays on this row)

    h264_loop_filter_chroma

    sub             x0,  x0,  x1, lsl #1        // back to the p0 row
    st1             {v16.16b}, [x0], x1         // p0
    st1             {v0.16b},  [x0], x1         // q0

    ret
endfunc
441
// x264_deblock_h_chroma_neon: filter a chroma edge that lies between two
// columns, over 8 rows.
//
// Register use as seen in this function:
//   x0 = address of the first byte right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// Loads 8 bytes per row starting 4 bytes left of the edge, transposes
// them as 16-bit units (transpose4x8.h is defined outside this file
// section; v28-v31 are its scratch), filters, transposes back and
// rewrites all 8 bytes of every row.
//
// deblock_h_chroma is a secondary entry point used by
// x264_deblock_h_chroma_422_neon.  It expects x0 already moved 4 bytes
// left and v24.s[0] already loaded, and it skips the early-exit checks.
function x264_deblock_h_chroma_neon, export=1
    h264_loop_filter_start

    sub             x0,  x0,  #4
deblock_h_chroma:
    ld1             {v18.d}[0], [x0], x1
    ld1             {v16.d}[0], [x0], x1
    ld1             {v0.d}[0],  [x0], x1
    ld1             {v2.d}[0],  [x0], x1
    ld1             {v18.d}[1], [x0], x1
    ld1             {v16.d}[1], [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1

    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31

    h264_loop_filter_chroma

    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31

    sub             x0,  x0,  x1, lsl #3        // rewind the 8 rows just read
    st1             {v18.d}[0], [x0], x1
    st1             {v16.d}[0], [x0], x1
    st1             {v0.d}[0],  [x0], x1
    st1             {v2.d}[0],  [x0], x1
    st1             {v18.d}[1], [x0], x1
    st1             {v16.d}[1], [x0], x1
    st1             {v0.d}[1],  [x0], x1
    st1             {v2.d}[1],  [x0], x1

    ret
endfunc
474
// x264_deblock_h_chroma_422_neon: 16-row variant built from two passes
// of the 8-row h_chroma filter with a doubled stride.
//
// Pass 1 calls x264_deblock_h_chroma_neon on the rows starting at x0
// (rows 0, 2, 4, ...).  Pass 2 tail-branches to deblock_h_chroma with
// x0 = original x0 + stride - 4 (rows 1, 3, 5, ...).
//
// Relies on pass 1 leaving x1, w2, w3, x4, x5 and x7 untouched.
// w6 / v24.s[0] are reloaded from [x4] because pass 1 overwrites them.
// Pass 2 enters after the early-exit checks, so it always runs the
// filter even if pass 1 returned early.
function x264_deblock_h_chroma_422_neon, export=1
    add             x5,  x0,  x1                // first row of pass 2
    add             x1,  x1,  x1                // stride * 2
    mov             x7,  x30                    // keep the return address across bl
    bl              X(x264_deblock_h_chroma_neon)
    ldr             w6,  [x4]
    mov             x30, x7
    sub             x0,  x5,  #4
    mov             v24.s[0], w6
    b               deblock_h_chroma
endfunc
486
// h264_loop_filter_chroma8: 8-lane (64-bit) version of the clipped
// chroma edge filter.
//
// In:    v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes)
//        w2 = alpha, w3 = beta
//        v24.s[0] = four signed clip bytes (tc); each one is applied to
//                   two adjacent byte lanes
// Out:   v16 = new p0, v17 = new q0
// Clobb: v4, v22, v24-v26, v28, v30
.macro h264_loop_filter_chroma8
    dup             v22.8b,  w2                 // alpha
    uxtl            v24.8h,  v24.8b
    uabd            v26.8b,  v16.8b,  v17.8b    // abs(p0 - q0)
    uxtl            v4.8h,   v17.8b
    uabd            v28.8b,  v18.8b,  v16.8b    // abs(p1 - p0)
    usubw           v4.8h,   v4.8h,   v16.8b    // q0 - p0
    sli             v24.8h,  v24.8h,  #8        // each tc byte now fills 2 byte lanes
    shl             v4.8h,   v4.8h,   #2        // (q0 - p0) * 4
    uabd            v30.8b,  v19.8b,  v17.8b    // abs(q1 - q0)
    uaddw           v4.8h,   v4.8h,   v18.8b    // + p1
    cmhi            v26.8b,  v22.8b,  v26.8b    // < alpha
    usubw           v4.8h,   v4.8h,   v19.8b    // - q1
    dup             v22.8b,  w3                 // beta
    rshrn           v4.8b,   v4.8h,   #3        // delta = (sum + 4) >> 3
    cmhi            v28.8b,  v22.8b,  v28.8b    // < beta
    cmhi            v30.8b,  v22.8b,  v30.8b    // < beta
    smin            v4.8b,   v4.8b,   v24.8b    // clamp delta to [-tc, tc]
    neg             v25.8b,  v24.8b
    and             v26.8b,  v26.8b,  v28.8b
    smax            v4.8b,   v4.8b,   v25.8b
    and             v26.8b,  v26.8b,  v30.8b    // v26 = lanes to filter
    uxtl            v22.8h,  v17.8b
    and             v4.8b,   v4.8b,   v26.8b    // zero delta in unfiltered lanes
    uxtl            v28.8h,  v16.8b
    saddw           v28.8h,  v28.8h,  v4.8b     // p0 + delta
    ssubw           v22.8h,  v22.8h,  v4.8b     // q0 - delta
    sqxtun          v16.8b,  v28.8h             // saturate back to unsigned bytes
    sqxtun          v17.8b,  v22.8h
.endm
517
// x264_deblock_h_chroma_mbaff_neon: filter a chroma edge that lies
// between two columns, over 4 rows.
//
// Register use as seen in this function:
//   x0 = address of the first byte right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
//        (x4 is reused as the load pointer afterwards)
// Loads 8 bytes per row starting 4 bytes left of the edge, transposes
// them as 16-bit units (transpose4x4.h is defined outside this file
// section; v28-v31 are its scratch) and filters.  Only p0 and q0 are
// written back, as one 16-bit unit each per row, starting 2 bytes left
// of the edge.
function x264_deblock_h_chroma_mbaff_neon, export=1
    h264_loop_filter_start

    sub             x4,  x0,  #4                // load pointer
    sub             x0,  x0,  #2                // store pointer

    ld1             {v18.8b}, [x4], x1
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b},  [x4], x1
    ld1             {v19.8b},  [x4]

    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31

    h264_loop_filter_chroma8

    // st2 interleaves lane n of v16 (p0) and v17 (q0): 4 bytes per row
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0]

    ret
endfunc
540
// h264_loop_filter_chroma_intra: intra chroma edge filter.
//
// Macro argument:
//   width = 16 (default) processes all 16 byte lanes; any other value
//           skips the high-half arithmetic, so only the low 8 lanes of
//           the results are meaningful.
// In:    v18 = p1, v16 = p0, v17 = q0, v19 = q1
//        v30 = alpha, v31 = beta (byte-replicated)
// Out:   v16 = new p0, v17 = new q0, replaced only in lanes where
//        |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta
// Clobb: v4-v7, v20-v28 (v5, v7, v21, v23 only when width is 16)
//
//   new p0 = (2*p1 + p0 + q1 + 2) >> 2
//   new q0 = (2*q1 + q0 + p1 + 2) >> 2
.macro h264_loop_filter_chroma_intra width=16
    uabd            v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
    uabd            v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
    uabd            v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
    cmhi            v26.16b, v30.16b, v26.16b  // < alpha
    cmhi            v27.16b, v31.16b, v27.16b  // < beta
    cmhi            v28.16b, v31.16b, v28.16b  // < beta
    and             v26.16b, v26.16b, v27.16b
    and             v26.16b, v26.16b, v28.16b  // v26 = lanes to filter

    ushll           v4.8h,   v18.8b,  #1       // 2 * p1
    ushll           v6.8h,   v19.8b,  #1       // 2 * q1
.ifc \width, 16
    ushll2          v5.8h,   v18.16b, #1
    ushll2          v7.8h,   v19.16b, #1
    uaddl2          v21.8h,  v16.16b, v19.16b
    uaddl2          v23.8h,  v17.16b, v18.16b
.endif
    uaddl           v20.8h,  v16.8b,  v19.8b   // p0 + q1
    uaddl           v22.8h,  v17.8b,  v18.8b   // q0 + p1
    add             v20.8h,  v20.8h,  v4.8h     // mlal?
    add             v22.8h,  v22.8h,  v6.8h
.ifc \width, 16
    add             v21.8h,  v21.8h,  v5.8h
    add             v23.8h,  v23.8h,  v7.8h
.endif
    uqrshrn         v24.8b,  v20.8h,  #2       // rounded >> 2, narrowed to bytes
    uqrshrn         v25.8b,  v22.8h,  #2
.ifc \width, 16
    uqrshrn2        v24.16b, v21.8h,  #2
    uqrshrn2        v25.16b, v23.8h,  #2
.endif
    bit             v16.16b, v24.16b, v26.16b  // insert only where the mask is set
    bit             v17.16b, v25.16b, v26.16b
.endm
576
// x264_deblock_v_chroma_intra_neon: intra-filter a 16-byte-wide chroma
// edge that lies between two rows.
//
// Register use as seen in this function:
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// Reads p1, p0, q0, q1 and writes back p0 and q0.
function x264_deblock_v_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  x1, lsl #1        // x0 = p1 row
    ld1             {v18.16b}, [x0], x1         // p1
    ld1             {v16.16b}, [x0], x1         // p0
    ld1             {v17.16b}, [x0], x1         // q0
    ld1             {v19.16b}, [x0]             // q1 (x0 stays on this row)

    h264_loop_filter_chroma_intra

    sub             x0,  x0,  x1, lsl #1        // back to the p0 row
    st1             {v16.16b}, [x0], x1         // p0
    st1             {v17.16b}, [x0], x1         // q0

    ret
endfunc
594
// x264_deblock_h_chroma_intra_mbaff_neon: intra-filter a chroma edge
// that lies between two columns, over 4 rows.
//
// Register use as seen in this function:
//   x0 = address of the first byte right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta
// Loads 8 bytes per row starting 4 bytes left of the edge, transposes
// them as 16-bit units (transpose4x4.h is defined outside this file
// section; v26-v29 are its scratch) and runs the 8-lane variant of the
// intra filter.  Only p0 and q0 are written back, as one 16-bit unit
// each per row, starting 2 bytes left of the edge.
function x264_deblock_h_chroma_intra_mbaff_neon, export=1
    h264_loop_filter_start_intra

    sub             x4,  x0,  #4                // load pointer
    sub             x0,  x0,  #2                // store pointer
    ld1             {v18.8b}, [x4], x1
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b}, [x4], x1
    ld1             {v19.8b}, [x4], x1

    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra width=8

    // st2 interleaves lane n of v16 (p0) and v17 (q0): 4 bytes per row
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1

    ret
endfunc
616
// x264_deblock_h_chroma_intra_neon: intra-filter a chroma edge that lies
// between two columns, over 8 rows.
//
// Register use as seen in this function:
//   x0 = address of the first byte right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta
// Loads 8 bytes per row starting 4 bytes left of the edge, transposes
// them as 16-bit units (transpose4x8.h is defined outside this file
// section; v26-v29 are its scratch) and filters.  Only p0 and q0 are
// written back, as one 16-bit unit each per row, starting 2 bytes left
// of the edge.
function x264_deblock_h_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x4,  x0,  #4                // load pointer
    sub             x0,  x0,  #2                // store pointer
    ld1             {v18.d}[0], [x4], x1
    ld1             {v16.d}[0], [x4], x1
    ld1             {v17.d}[0], [x4], x1
    ld1             {v19.d}[0], [x4], x1
    ld1             {v18.d}[1], [x4], x1
    ld1             {v16.d}[1], [x4], x1
    ld1             {v17.d}[1], [x4], x1
    ld1             {v19.d}[1], [x4], x1

    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    // st2 interleaves lane n of v16 (p0) and v17 (q0): 4 bytes per row
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    st2             {v16.h,v17.h}[4], [x0], x1
    st2             {v16.h,v17.h}[5], [x0], x1
    st2             {v16.h,v17.h}[6], [x0], x1
    st2             {v16.h,v17.h}[7], [x0], x1

    ret
endfunc
646
// x264_deblock_h_chroma_422_intra_neon: intra-filter a chroma edge that
// lies between two columns, over 16 rows.
//
// Register use as seen in this function:
//   x0 = address of the first byte right of the edge, x1 = row stride
//   w2 = alpha, w3 = beta
// Same processing as the 8-row h_chroma_intra filter, performed twice
// back to back: the load pointer (x4) and the store pointer (x0) simply
// keep advancing, so the second half covers rows 8-15.
// v30/v31 (alpha/beta) set up by h264_loop_filter_start_intra are not
// modified by h264_loop_filter_chroma_intra and are reused as is.
function x264_deblock_h_chroma_422_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x4,  x0,  #4                // load pointer
    sub             x0,  x0,  #2                // store pointer

    // rows 0-7
    ld1             {v18.d}[0], [x4], x1
    ld1             {v16.d}[0], [x4], x1
    ld1             {v17.d}[0], [x4], x1
    ld1             {v19.d}[0], [x4], x1
    ld1             {v18.d}[1], [x4], x1
    ld1             {v16.d}[1], [x4], x1
    ld1             {v17.d}[1], [x4], x1
    ld1             {v19.d}[1], [x4], x1

    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    st2             {v16.h,v17.h}[4], [x0], x1
    st2             {v16.h,v17.h}[5], [x0], x1
    st2             {v16.h,v17.h}[6], [x0], x1
    st2             {v16.h,v17.h}[7], [x0], x1

    // rows 8-15
    ld1             {v18.d}[0], [x4], x1
    ld1             {v16.d}[0], [x4], x1
    ld1             {v17.d}[0], [x4], x1
    ld1             {v19.d}[0], [x4], x1
    ld1             {v18.d}[1], [x4], x1
    ld1             {v16.d}[1], [x4], x1
    ld1             {v17.d}[1], [x4], x1
    ld1             {v19.d}[1], [x4], x1

    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    st2             {v16.h,v17.h}[4], [x0], x1
    st2             {v16.h,v17.h}[5], [x0], x1
    st2             {v16.h,v17.h}[6], [x0], x1
    st2             {v16.h,v17.h}[7], [x0], x1

    ret
endfunc
698
//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
//                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
//                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
//                                uint8_t bs[2][8][4], int mvy_limit,
//                                int bframe )
//
// Argument registers (AAPCS64 order of the prototype above):
//   x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe
//
// Result per byte written to bs: 2 where either compared nnz byte is
// non-zero, else 1 where the compared ref bytes differ or an mv
// component difference exceeds its limit, else 0.
// v4 accumulates the flags stored as bs[0], v5 those stored as bs[1].
// One 16-byte vector is stored to each of bs[0] and bs[1].
//
// unzip, movrel, function/endfunc are macros defined outside this file
// section.  Clobbers x1-x3, x6, x7, w4, w5, v0-v7, v18-v25, v31, NZCV.
function x264_deblock_strength_neon, export=1
    movi        v4.16b, #0
    lsl         w4,  w4,  #8
    add         x3,  x3,  #32               // x3 = &bs[1] (bs[0] is 8*4 = 32 bytes)
    sub         w4,  w4,  #(1<<8)-3         // w4 = ((mvy_limit - 1) << 8) + 3
    movi        v5.16b, #0
    dup         v6.8h,  w4                  // per halfword: low byte 3, high byte mvy_limit - 1
    mov         x6,  #-32                   // post-index step from bs[1] back to bs[0]

    // Loop body: runs once, and a second time when bframe == 1 (see the
    // subs/b.eq at the end).  x1 and x2 keep advancing across iterations.
bframe:
    // load bytes ref
    add         x2,  x2,  #16
    ld1        {v31.d}[1], [x1], #8
    ld1        {v1.16b}, [x1], #16
    movi        v0.16b,  #0
    ld1        {v2.16b}, [x1], #16
    ext         v3.16b,  v0.16b,  v1.16b,  #15  // same data shifted up by one byte
    ext         v0.16b,  v0.16b,  v2.16b,  #15
    unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
    unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
    ext         v21.16b, v31.16b, v22.16b, #12  // top 4 bytes of v31, then 12 bytes of v22

    eor         v0.16b,  v20.16b, v22.16b       // non-zero where the two ref bytes differ
    eor         v1.16b,  v21.16b, v22.16b
    orr         v4.16b,  v4.16b,  v0.16b
    orr         v5.16b,  v5.16b,  v1.16b

    ld1        {v21.8h}, [x2], #16      // mv + 0x10
    ld1        {v19.8h}, [x2], #16      // mv + 0x20
    ld1        {v22.8h}, [x2], #16      // mv + 0x30
    ld1        {v18.8h}, [x2], #16      // mv + 0x40
    ld1        {v23.8h}, [x2], #16      // mv + 0x50
    ext         v19.16b, v19.16b, v22.16b, #12
    ext         v18.16b, v18.16b, v23.16b, #12
    sabd        v0.8h,   v22.8h,  v19.8h        // |mv component difference|, 16-bit
    ld1        {v19.8h}, [x2], #16      // mv + 0x60
    sabd        v1.8h,   v23.8h,  v18.8h
    ld1        {v24.8h}, [x2], #16      // mv + 0x70
    uqxtn       v0.8b,   v0.8h                  // saturate each difference to a byte
    ld1        {v18.8h}, [x2], #16      // mv + 0x80
    ld1        {v25.8h}, [x2], #16      // mv + 0x90
    uqxtn2      v0.16b,  v1.8h
    ext         v19.16b, v19.16b, v24.16b, #12
    ext         v18.16b, v18.16b, v25.16b, #12
    sabd        v1.8h,   v24.8h,  v19.8h
    sabd        v2.8h,   v25.8h,  v18.8h
    uqxtn       v1.8b,   v1.8h
    uqxtn2      v1.16b,  v2.8h

    uqsub       v0.16b,  v0.16b,  v6.16b        // non-zero where difference > limit
    uqsub       v1.16b,  v1.16b,  v6.16b
    uqxtn       v0.8b,   v0.8h                  // merge each byte pair into one flag byte
    uqxtn2      v0.16b,  v1.8h

    sabd        v1.8h,   v22.8h,  v23.8h
    orr         v4.16b,  v4.16b,  v0.16b

    sabd        v0.8h,   v21.8h,  v22.8h
    sabd        v2.8h,   v23.8h,  v24.8h
    sabd        v3.8h,   v24.8h,  v25.8h
    uqxtn       v0.8b,   v0.8h
    uqxtn2      v0.16b,  v1.8h
    uqxtn       v1.8b,   v2.8h
    uqxtn2      v1.16b,  v3.8h

    uqsub       v0.16b,  v0.16b,  v6.16b
    uqsub       v1.16b,  v1.16b,  v6.16b
    uqxtn       v0.8b,   v0.8h
    uqxtn2      v0.16b,  v1.8h
    subs        w5,  w5,  #1
    orr         v5.16b,  v5.16b,  v0.16b
    b.eq        bframe                          // taken once, only if bframe was 1

    movi        v6.16b, #1
    // load bytes nnz
    ld1        {v31.d}[1], [x0], #8
    ld1        {v1.16b}, [x0], #16
    movi        v0.16b,  #0
    ld1        {v2.16b}, [x0], #16
    ext         v3.16b,  v0.16b,  v1.16b,  #15
    ext         v0.16b,  v0.16b,  v2.16b,  #15
    unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
    unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
    ext         v21.16b, v31.16b, v22.16b, #12

    movrel      x7,  transpose_table
    ld1        {v7.16b}, [x7]
    orr         v0.16b,  v20.16b, v22.16b       // non-zero if either nnz byte is
    orr         v1.16b,  v21.16b, v22.16b
    umin        v0.16b,  v0.16b,  v6.16b
    umin        v1.16b,  v1.16b,  v6.16b
    umin        v4.16b,  v4.16b,  v6.16b        // mv ? 1 : 0
    umin        v5.16b,  v5.16b,  v6.16b
    add         v0.16b,  v0.16b,  v0.16b        // nnz ? 2 : 0
    add         v1.16b,  v1.16b,  v1.16b
    umax        v4.16b,  v4.16b,  v0.16b
    umax        v5.16b,  v5.16b,  v1.16b
    tbl         v6.16b, {v4.16b}, v7.16b        // reorder bytes through transpose_table
    st1        {v5.16b}, [x3], x6       // bs[1]
    st1        {v6.16b}, [x3]           // bs[0]
    ret
endfunc
806
// Byte indices for the tbl in x264_deblock_strength_neon: output byte
// 4*r + c takes input byte 4*c + r, i.e. a 4x4 byte-matrix transpose.
// const/endconst are macros defined outside this file section.
const transpose_table
    .byte 0, 4,  8, 12
    .byte 1, 5,  9, 13
    .byte 2, 6, 10, 14
    .byte 3, 7, 11, 15
endconst