1 /*****************************************************************************
2 * deblock.S: aarch64 deblocking
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: Mans Rullgard <mans@mansr.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Shared prologue for the non-intra deblock entry points.
// Folds the four per-edge tc0/bS bytes held in w6 so that a single
// flags-setting AND can detect the "no edge needs filtering" case.
// NOTE(review): the load of w6, the early-exit branch and the .endm are
// not visible in this chunk -- the macro body below is partial; confirm
// against the upstream file before editing.
29 .macro h264_loop_filter_start
34 and w6, w6, w6, lsl #16
36 ands w6, w6, w6, lsl #8 // sets flags; presumably consumed by an elided early-exit branch -- TODO confirm
// Normal (bS < 4) H.264 luma deblock filter, 16 pixels per invocation.
// Register contract as used below:
//   v20 = p2, v18 = p1, v16 = p0 | v0 = q0, v2 = q1, v4 = q2
//   w2 = alpha, w3 = beta, v24 = packed per-edge tc0 bytes.
// Results: v17 = filtered p1, v16 = filtered p0,
//          v0  = filtered q0, v19 = filtered q1 (selected via bsl/masks).
// NOTE(review): gaps in the embedded line numbers (45, 47, 79, 81, 84,
// 94-95, 97, 99-100, 107) show that some instructions are elided from
// this view (e.g. the shl of the low half and the tc negation); do not
// treat this body as complete.
43 .macro h264_loop_filter_luma
44 dup v22.16b, w2 // alpha
46 uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
48 uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
49 sli v24.8h, v24.8h, #8 // broadcast tc0 bytes, step 1
50 uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
51 sli v24.4s, v24.4s, #16 // broadcast tc0 bytes, step 2 (1 byte -> 4 lanes)
52 cmhi v21.16b, v22.16b, v21.16b // < alpha
53 dup v22.16b, w3 // beta
54 cmlt v23.16b, v24.16b, #0 // lanes with tc0 < 0 (filtering disabled)
55 cmhi v28.16b, v22.16b, v28.16b // < beta
56 cmhi v30.16b, v22.16b, v30.16b // < beta
57 bic v21.16b, v21.16b, v23.16b // drop lanes where tc0 < 0
58 uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
59 and v21.16b, v21.16b, v28.16b
60 uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
61 cmhi v17.16b, v22.16b, v17.16b // < beta
62 and v21.16b, v21.16b, v30.16b // v21 = base filter-enable mask
63 cmhi v19.16b, v22.16b, v19.16b // < beta
64 and v17.16b, v17.16b, v21.16b // p-side strong mask (p1 gets filtered)
65 and v19.16b, v19.16b, v21.16b // q-side strong mask (q1 gets filtered)
66 and v24.16b, v24.16b, v21.16b // tc0 restricted to active lanes
67 urhadd v28.16b, v16.16b, v0.16b // (p0 + q0 + 1) >> 1
68 sub v21.16b, v24.16b, v17.16b // tc += 1 where p-side mask set (mask is 0/-1)
69 uqadd v23.16b, v18.16b, v24.16b // p1 + tc0 (upper clip bound)
70 uhadd v20.16b, v20.16b, v28.16b // p1' candidate: (p2 + avg) >> 1
71 sub v21.16b, v21.16b, v19.16b // tc += 1 where q-side mask set -> final tc
72 uhadd v28.16b, v4.16b, v28.16b // q1' candidate: (q2 + avg) >> 1
73 umin v23.16b, v23.16b, v20.16b // clamp p1' to p1 +/- tc0 ...
74 uqsub v22.16b, v18.16b, v24.16b // p1 - tc0 (lower clip bound)
75 uqadd v4.16b, v2.16b, v24.16b // q1 + tc0
76 umax v23.16b, v23.16b, v22.16b // ... v23 = clipped p1'
77 uqsub v22.16b, v2.16b, v24.16b // q1 - tc0
78 umin v28.16b, v4.16b, v28.16b // clamp q1' ...
80 umax v28.16b, v28.16b, v22.16b // ... v28 = clipped q1'
82 usubw v4.8h, v4.8h, v16.8b // delta = ((q0-p0)<<2 + p1 - q1 + 4) >> 3 (widened halves)
83 usubw2 v20.8h, v20.8h, v16.16b
85 shl v20.8h, v20.8h, #2 // NOTE(review): matching shl of v4 (line 84) elided from this view
86 uaddw v4.8h, v4.8h, v18.8b
87 uaddw2 v20.8h, v20.8h, v18.16b
88 usubw v4.8h, v4.8h, v2.8b
89 usubw2 v20.8h, v20.8h, v2.16b
90 rshrn v4.8b, v4.8h, #3
91 rshrn2 v4.16b, v20.8h, #3 // v4 = raw delta, per byte
92 bsl v17.16b, v23.16b, v18.16b // v17 = p-side mask ? clipped p1' : p1
93 bsl v19.16b, v28.16b, v2.16b // v19 = q-side mask ? clipped q1' : q1
96 smin v4.16b, v4.16b, v21.16b // clamp delta to +tc
98 smax v4.16b, v4.16b, v23.16b // clamp delta to -tc (negation presumably in elided line 97 -- TODO confirm)
101 saddw v28.8h, v28.8h, v4.8b // p0 += delta (widened; uxtl setup elided)
102 saddw2 v21.8h, v21.8h, v4.16b
103 ssubw v22.8h, v22.8h, v4.8b // q0 -= delta
104 ssubw2 v24.8h, v24.8h, v4.16b
105 sqxtun v16.8b, v28.8h // narrow with unsigned saturation -> new p0
106 sqxtun2 v16.16b, v21.8h
108 sqxtun2 v0.16b, v24.8h // new q0 (low-half sqxtun at line 107 elided from view)
// void deblock_v_luma_neon( pix (x0), stride (x1), alpha (w2), beta (w3), tc0 )
// Filters a horizontal luma edge: loads three rows on each side of the
// edge, runs the normal-strength filter, and writes back the changed rows.
// NOTE(review): embedded numbering gaps (113, 122, 124, 129+) indicate
// elided lines, including the apparent store of the new q1 (v19) and the
// trailing ret/endfunc -- this body is partial.
111 function x264_deblock_v_luma_neon, export=1
112 h264_loop_filter_start
114 ld1 {v0.16b}, [x0], x1 // q0 row
115 ld1 {v2.16b}, [x0], x1 // q1 row
116 ld1 {v4.16b}, [x0], x1 // q2 row
117 sub x0, x0, x1, lsl #2 // rewind 4+2 = 6 rows, back to p2
118 sub x0, x0, x1, lsl #1
119 ld1 {v20.16b}, [x0], x1 // p2 row
120 ld1 {v18.16b}, [x0], x1 // p1 row
121 ld1 {v16.16b}, [x0], x1 // p0 row
123 h264_loop_filter_luma
125 sub x0, x0, x1, lsl #1 // back to the p1 row
126 st1 {v17.16b}, [x0], x1 // new p1
127 st1 {v16.16b}, [x0], x1 // new p0
128 st1 {v0.16b}, [x0], x1 // new q0
// void deblock_h_luma_neon( pix (x0), stride (x1), alpha (w2), beta (w3), tc0 )
// Filters a vertical luma edge: gathers 16 rows of 8 pixels straddling the
// edge, transposes so columns become vectors, filters, transposes the four
// modified columns (p1 p0 q0 q1) back, and scatters 4 bytes per row.
// NOTE(review): the usual pre-load pointer adjustment (e.g. x0 -= 4) and
// the post-filter re-adjustment fall in elided lines (136-137, 160/162),
// as do ret/endfunc -- body is partial.
134 function x264_deblock_h_luma_neon, export=1
135 h264_loop_filter_start
138 ld1 {v6.8b}, [x0], x1 // rows 0-7, low halves
139 ld1 {v20.8b}, [x0], x1
140 ld1 {v18.8b}, [x0], x1
141 ld1 {v16.8b}, [x0], x1
142 ld1 {v0.8b}, [x0], x1
143 ld1 {v2.8b}, [x0], x1
144 ld1 {v4.8b}, [x0], x1
145 ld1 {v26.8b}, [x0], x1
146 ld1 {v6.d}[1], [x0], x1 // rows 8-15, high halves
147 ld1 {v20.d}[1], [x0], x1
148 ld1 {v18.d}[1], [x0], x1
149 ld1 {v16.d}[1], [x0], x1
150 ld1 {v0.d}[1], [x0], x1
151 ld1 {v2.d}[1], [x0], x1
152 ld1 {v4.d}[1], [x0], x1
153 ld1 {v26.d}[1], [x0], x1
// After the transpose: v6=p3 v20=p2 v18=p1 v16=p0 v0=q0 v2=q1 v4=q2 v26=q3
155 transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
157 h264_loop_filter_luma
// Turn the four modified columns back into per-row 4-byte groups.
159 transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
161 sub x0, x0, x1, lsl #4 // rewind the 16 rows read above
163 st1 {v17.s}[0], [x0], x1 // one 4-byte group (p1 p0 q0 q1) per row
164 st1 {v16.s}[0], [x0], x1
165 st1 {v0.s}[0], [x0], x1
166 st1 {v19.s}[0], [x0], x1
167 st1 {v17.s}[1], [x0], x1
168 st1 {v16.s}[1], [x0], x1
169 st1 {v0.s}[1], [x0], x1
170 st1 {v19.s}[1], [x0], x1
171 st1 {v17.s}[2], [x0], x1
172 st1 {v16.s}[2], [x0], x1
173 st1 {v0.s}[2], [x0], x1
174 st1 {v19.s}[2], [x0], x1
175 st1 {v17.s}[3], [x0], x1
176 st1 {v16.s}[3], [x0], x1
177 st1 {v0.s}[3], [x0], x1
178 st1 {v19.s}[3], [x0], x1
// Shared prologue for the intra (bS = 4) deblock entry points: broadcast
// the alpha/beta thresholds into v30/v31 for the compare instructions.
// NOTE(review): lines 184-188 and the .endm are elided from this view.
183 .macro h264_loop_filter_start_intra
189 dup v30.16b, w2 // alpha
190 dup v31.16b, w3 // beta
// Strong (intra, bS = 4) H.264 luma filter, 16 pixels per invocation.
// Register contract as used below:
//   v4 = p3, v5 = p2, v6 = p1, v7 = p0 | v0 = q0, v1 = q1, v2 = q2, v3 = q3
//   v30 = alpha (splatted), v31 = beta (splatted).
// Produces the 3-tap ("_1") and 5-tap ("_2") candidates and merges them
// into p2..q2 with bit-selects at the end.
// NOTE(review): embedded numbering gaps show elided lines, e.g. the
// constant #2 that v29 must hold at line 203, the "any lane set?" transfer
// consuming v20 after line 208, and the .endm -- body is partial.
193 .macro h264_loop_filter_luma_intra
194 uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
195 uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
196 uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
197 cmhi v19.16b, v30.16b, v16.16b // < alpha
198 cmhi v17.16b, v31.16b, v17.16b // < beta
199 cmhi v18.16b, v31.16b, v18.16b // < beta
202 ushr v30.16b, v30.16b, #2 // alpha >> 2
203 add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 -- v29 presumably preloaded with 2 in an elided line
204 cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
206 and v19.16b, v19.16b, v17.16b
207 and v19.16b, v19.16b, v18.16b // v19 = if_1 (basic filter condition)
208 shrn v20.8b, v19.8h, #4 // narrow mask for an early-exit test (consumer elided)
// 3-tap candidates: p0'_1 = (2*p1 + p0 + q1 + 2) >> 2, q0'_1 symmetric.
212 ushll v20.8h, v6.8b, #1
213 ushll v22.8h, v1.8b, #1
214 ushll2 v21.8h, v6.16b, #1
215 ushll2 v23.8h, v1.16b, #1
216 uaddw v20.8h, v20.8h, v7.8b
217 uaddw v22.8h, v22.8h, v0.8b
218 uaddw2 v21.8h, v21.8h, v7.16b
219 uaddw2 v23.8h, v23.8h, v0.16b
220 uaddw v20.8h, v20.8h, v1.8b
221 uaddw v22.8h, v22.8h, v6.8b
222 uaddw2 v21.8h, v21.8h, v1.16b
223 uaddw2 v23.8h, v23.8h, v6.16b
225 rshrn v24.8b, v20.8h, #2 // p0'_1
226 rshrn v25.8b, v22.8h, #2 // q0'_1
227 rshrn2 v24.16b, v21.8h, #2 // p0'_1
228 rshrn2 v25.16b, v23.8h, #2 // q0'_1
230 uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
231 uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
232 cmhi v17.16b, v31.16b, v17.16b // < beta
233 cmhi v18.16b, v31.16b, v18.16b // < beta
235 and v17.16b, v16.16b, v17.16b // if_2 && if_3
236 and v18.16b, v16.16b, v18.16b // if_2 && if_4
// v30/v31 negated-mask setup (lines 237-240) elided from this view.
241 and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
242 and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
244 and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
245 and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
247 //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
248 uaddl v26.8h, v5.8b, v7.8b // p2 + p0
249 uaddl2 v27.8h, v5.16b, v7.16b
250 uaddw v26.8h, v26.8h, v0.8b // + q0
251 uaddw2 v27.8h, v27.8h, v0.16b
252 add v20.8h, v20.8h, v26.8h // reuse 3-tap sum: 2*p1+p0+q1 + p2+p0+q0
253 add v21.8h, v21.8h, v27.8h
254 uaddw v20.8h, v20.8h, v0.8b
255 uaddw2 v21.8h, v21.8h, v0.16b
256 rshrn v20.8b, v20.8h, #3 // p0'_2
257 rshrn2 v20.16b, v21.8h, #3 // p0'_2
258 uaddw v26.8h, v26.8h, v6.8b
259 uaddw2 v27.8h, v27.8h, v6.16b
260 rshrn v21.8b, v26.8h, #2 // p1'_2
261 rshrn2 v21.16b, v27.8h, #2 // p1'_2
262 uaddl v28.8h, v4.8b, v5.8b // 2*(p3 + p2)
263 uaddl2 v29.8h, v4.16b, v5.16b
264 shl v28.8h, v28.8h, #1
265 shl v29.8h, v29.8h, #1
266 add v28.8h, v28.8h, v26.8h // + p2+p0+q0+p1
267 add v29.8h, v29.8h, v27.8h
268 rshrn v19.8b, v28.8h, #3 // p2'_2
269 rshrn2 v19.16b, v29.8h, #3 // p2'_2
271 //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
272 uaddl v26.8h, v2.8b, v0.8b // mirror of the p-side computation
273 uaddl2 v27.8h, v2.16b, v0.16b
274 uaddw v26.8h, v26.8h, v7.8b
275 uaddw2 v27.8h, v27.8h, v7.16b
276 add v22.8h, v22.8h, v26.8h
277 add v23.8h, v23.8h, v27.8h
278 uaddw v22.8h, v22.8h, v7.8b
279 uaddw2 v23.8h, v23.8h, v7.16b
280 rshrn v22.8b, v22.8h, #3 // q0'_2
281 rshrn2 v22.16b, v23.8h, #3 // q0'_2
282 uaddw v26.8h, v26.8h, v1.8b
283 uaddw2 v27.8h, v27.8h, v1.16b
284 rshrn v23.8b, v26.8h, #2 // q1'_2
285 rshrn2 v23.16b, v27.8h, #2 // q1'_2
286 uaddl v28.8h, v2.8b, v3.8b
287 uaddl2 v29.8h, v2.16b, v3.16b
288 shl v28.8h, v28.8h, #1
289 shl v29.8h, v29.8h, #1
290 add v28.8h, v28.8h, v26.8h
291 add v29.8h, v29.8h, v27.8h
292 rshrn v26.8b, v28.8h, #3 // q2'_2
293 rshrn2 v26.16b, v29.8h, #3 // q2'_2
// Merge candidates into the pixel registers under their condition masks.
295 bit v7.16b, v24.16b, v30.16b // p0'_1
296 bit v0.16b, v25.16b, v31.16b // q0'_1
297 bit v7.16b, v20.16b, v17.16b // p0'_2
298 bit v6.16b, v21.16b, v17.16b // p1'_2
299 bit v5.16b, v19.16b, v17.16b // p2'_2
300 bit v0.16b, v22.16b, v18.16b // q0'_2
301 bit v1.16b, v23.16b, v18.16b // q1'_2
302 bit v2.16b, v26.16b, v18.16b // q2'_2
// void deblock_v_luma_intra_neon( pix (x0), stride (x1), alpha (w2), beta (w3) )
// Strong filter across a horizontal luma edge: loads q0..q3 then rewinds
// 8 rows to load p3..p0, filters, and stores the six changed rows.
// NOTE(review): initial pointer setup (lines 307) and trailing ret/endfunc
// are elided from this view.
305 function x264_deblock_v_luma_intra_neon, export=1
306 h264_loop_filter_start_intra
308 ld1 {v0.16b}, [x0], x1 // q0
309 ld1 {v1.16b}, [x0], x1 // q1
310 ld1 {v2.16b}, [x0], x1 // q2
311 ld1 {v3.16b}, [x0], x1 // q3
312 sub x0, x0, x1, lsl #3 // rewind 8 rows, to the p3 row
313 ld1 {v4.16b}, [x0], x1 // p3
314 ld1 {v5.16b}, [x0], x1 // p2
315 ld1 {v6.16b}, [x0], x1 // p1
316 ld1 {v7.16b}, [x0] // p0
318 h264_loop_filter_luma_intra
320 sub x0, x0, x1, lsl #1 // back to the p2 row
321 st1 {v5.16b}, [x0], x1 // p2
322 st1 {v6.16b}, [x0], x1 // p1
323 st1 {v7.16b}, [x0], x1 // p0
324 st1 {v0.16b}, [x0], x1 // q0
325 st1 {v1.16b}, [x0], x1 // q1
326 st1 {v2.16b}, [x0] // q2
// void deblock_h_luma_intra_neon( pix (x0), stride (x1), alpha (w2), beta (w3) )
// Strong filter across a vertical luma edge: loads 16 rows of 8 pixels,
// transposes (v4..v7 = p3..p0, v0..v3 = q0..q3), filters, transposes back
// and stores all 16 rows.
// NOTE(review): the pre-load pointer adjustment (e.g. x0 -= 4, lines
// 333-334) and trailing ret/endfunc are elided from this view.
331 function x264_deblock_h_luma_intra_neon, export=1
332 h264_loop_filter_start_intra
335 ld1 {v4.8b}, [x0], x1 // rows 0-7, low halves
336 ld1 {v5.8b}, [x0], x1
337 ld1 {v6.8b}, [x0], x1
338 ld1 {v7.8b}, [x0], x1
339 ld1 {v0.8b}, [x0], x1
340 ld1 {v1.8b}, [x0], x1
341 ld1 {v2.8b}, [x0], x1
342 ld1 {v3.8b}, [x0], x1
343 ld1 {v4.d}[1], [x0], x1 // rows 8-15, high halves
344 ld1 {v5.d}[1], [x0], x1
345 ld1 {v6.d}[1], [x0], x1
346 ld1 {v7.d}[1], [x0], x1
347 ld1 {v0.d}[1], [x0], x1
348 ld1 {v1.d}[1], [x0], x1
349 ld1 {v2.d}[1], [x0], x1
350 ld1 {v3.d}[1], [x0], x1
352 transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
354 h264_loop_filter_luma_intra
// Second transpose restores row layout for the stores below.
356 transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
358 sub x0, x0, x1, lsl #4 // rewind the 16 rows read above
359 st1 {v4.8b}, [x0], x1
360 st1 {v5.8b}, [x0], x1
361 st1 {v6.8b}, [x0], x1
362 st1 {v7.8b}, [x0], x1
363 st1 {v0.8b}, [x0], x1
364 st1 {v1.8b}, [x0], x1
365 st1 {v2.8b}, [x0], x1
366 st1 {v3.8b}, [x0], x1
367 st1 {v4.d}[1], [x0], x1
368 st1 {v5.d}[1], [x0], x1
369 st1 {v6.d}[1], [x0], x1
370 st1 {v7.d}[1], [x0], x1
371 st1 {v0.d}[1], [x0], x1
372 st1 {v1.d}[1], [x0], x1
373 st1 {v2.d}[1], [x0], x1
374 st1 {v3.d}[1], [x0], x1
// Normal-strength chroma deblock filter, 16 bytes per invocation.
// Register contract as used below:
//   v18 = p1, v16 = p0 | v0 = q0, v2 = q1; w2 = alpha, w3 = beta,
//   v24 = packed tc0 bytes; v4/v5 = widened delta accumulators
//   (presumably seeded from q0 in elided lines 381/383 -- TODO confirm).
// Results: v16 = filtered p0, v0 = filtered q0 (chroma never touches p1/q1).
// NOTE(review): numbering gaps (381, 383-384, 389-390, 392, 405, 409-410,
// 412, 419) show elided lines, including the apparent -tc source for v25
// and the uxtl setup of v22/v23/v28; body is partial.
379 .macro h264_loop_filter_chroma
380 dup v22.16b, w2 // alpha
382 uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
385 uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
386 usubw v4.8h, v4.8h, v16.8b // delta = ((q0-p0)<<2 + p1 - q1 + 4) >> 3
387 usubw2 v5.8h, v5.8h, v16.16b
388 sli v24.8h, v24.8h, #8 // broadcast tc0 bytes, step 1
391 uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
393 uaddw v4.8h, v4.8h, v18.8b
394 uaddw2 v5.8h, v5.8h, v18.16b
395 cmhi v26.16b, v22.16b, v26.16b // < alpha
396 usubw v4.8h, v4.8h, v2.8b
397 usubw2 v5.8h, v5.8h, v2.16b
398 sli v24.4s, v24.4s, #16 // broadcast tc0 bytes, step 2
399 dup v22.16b, w3 // beta
400 rshrn v4.8b, v4.8h, #3
401 rshrn2 v4.16b, v5.8h, #3 // v4 = raw delta
402 cmhi v28.16b, v22.16b, v28.16b // < beta
403 cmhi v30.16b, v22.16b, v30.16b // < beta
404 smin v4.16b, v4.16b, v24.16b // clamp delta to +tc
406 and v26.16b, v26.16b, v28.16b
407 smax v4.16b, v4.16b, v25.16b // clamp to -tc (v25 presumably -tc, set in elided line 405)
408 and v26.16b, v26.16b, v30.16b // v26 = filter-enable mask
411 and v4.16b, v4.16b, v26.16b // zero delta on disabled lanes
413 uxtl2 v29.8h, v16.16b // widen p0 (low-half uxtl elided)
414 saddw v28.8h, v28.8h, v4.8b // p0 += delta
415 saddw2 v29.8h, v29.8h, v4.16b
416 ssubw v22.8h, v22.8h, v4.8b // q0 -= delta
417 ssubw2 v23.8h, v23.8h, v4.16b
418 sqxtun v16.8b, v28.8h // saturating narrow -> new p0
420 sqxtun2 v16.16b, v29.8h
421 sqxtun2 v0.16b, v23.8h // new q0 (low-half sqxtun elided)
// void deblock_v_chroma_neon( pix (x0), stride (x1), alpha (w2), beta (w3), tc0 )
// Horizontal chroma edge: loads p1/p0/q0 (q1 load at elided line 431),
// filters, stores the changed p0/q0 rows.
// NOTE(review): trailing ret/endfunc elided from this view.
424 function x264_deblock_v_chroma_neon, export=1
425 h264_loop_filter_start
427 sub x0, x0, x1, lsl #1 // back up two rows to p1
428 ld1 {v18.16b}, [x0], x1 // p1
429 ld1 {v16.16b}, [x0], x1 // p0
430 ld1 {v0.16b}, [x0], x1 // q0
433 h264_loop_filter_chroma
435 sub x0, x0, x1, lsl #1 // back to the p0 row
436 st1 {v16.16b}, [x0], x1 // new p0
437 st1 {v0.16b}, [x0], x1 // new q0
// void deblock_h_chroma_neon( pix (x0), stride (x1), alpha (w2), beta (w3), tc0 )
// Vertical chroma edge: loads 8 rows of 8 bytes (4 chroma pairs each),
// transposes at halfword (CbCr pair) granularity, filters, transposes back
// and rewrites the 8 rows.
// NOTE(review): the pre-load pointer adjustment (lines 444-446) and the
// trailing ret/endfunc are elided from this view.
442 function x264_deblock_h_chroma_neon, export=1
443 h264_loop_filter_start
447 ld1 {v18.d}[0], [x0], x1
448 ld1 {v16.d}[0], [x0], x1
449 ld1 {v0.d}[0], [x0], x1
450 ld1 {v2.d}[0], [x0], x1
451 ld1 {v18.d}[1], [x0], x1
452 ld1 {v16.d}[1], [x0], x1
453 ld1 {v0.d}[1], [x0], x1
454 ld1 {v2.d}[1], [x0], x1
// After the transpose: v18=p1 v16=p0 v0=q0 v2=q1 (one CbCr pair per lane).
456 transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
458 h264_loop_filter_chroma
460 transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
462 sub x0, x0, x1, lsl #3 // rewind the 8 rows read above
463 st1 {v18.d}[0], [x0], x1
464 st1 {v16.d}[0], [x0], x1
465 st1 {v0.d}[0], [x0], x1
466 st1 {v2.d}[0], [x0], x1
467 st1 {v18.d}[1], [x0], x1
468 st1 {v16.d}[1], [x0], x1
469 st1 {v0.d}[1], [x0], x1
470 st1 {v2.d}[1], [x0], x1
// void deblock_h_chroma_422_neon( pix, stride, alpha, beta, tc0 )
// 4:2:2 vertical chroma edge -- presumably implemented by invoking the
// 4:2:0 h_chroma routine on each 8-row half of the 16-row edge.
// NOTE(review): only the function label and one bl are visible here; the
// lr save/restore, pointer advance, second call and ret are all elided
// from this view -- confirm against upstream before editing.
475 function x264_deblock_h_chroma_422_neon, export=1
479 bl X(x264_deblock_h_chroma_neon)
// 8-byte (double-width registers) variant of the normal chroma filter,
// used by the MBAFF entry point below.  Register contract:
//   v18 = p1, v16 = p0 | v17 = q0, v19 = q1; v24 = tc0; v4 = widened
//   delta accumulator (presumably seeded from q0 in an elided line).
// Results: v16 = filtered p0, v17 = filtered q0.
// NOTE(review): numbering gaps (489, 491, 495, 505, 509, 511, 516+) show
// elided lines, including the apparent -tc source for v25 and the uxtl
// setup of v22/v28; body is partial.
487 .macro h264_loop_filter_chroma8
488 dup v22.8b, w2 // alpha
490 uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
492 uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
493 usubw v4.8h, v4.8h, v16.8b // delta = ((q0-p0)<<2 + p1 - q1 + 4) >> 3
494 sli v24.8h, v24.8h, #8 // broadcast tc0 bytes
496 uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
497 uaddw v4.8h, v4.8h, v18.8b
498 cmhi v26.8b, v22.8b, v26.8b // < alpha
499 usubw v4.8h, v4.8h, v19.8b
500 dup v22.8b, w3 // beta
501 rshrn v4.8b, v4.8h, #3 // raw delta
502 cmhi v28.8b, v22.8b, v28.8b // < beta
503 cmhi v30.8b, v22.8b, v30.8b // < beta
504 smin v4.8b, v4.8b, v24.8b // clamp delta to +tc
506 and v26.8b, v26.8b, v28.8b
507 smax v4.8b, v4.8b, v25.8b // clamp to -tc (v25 presumably -tc, set in an elided line)
508 and v26.8b, v26.8b, v30.8b // v26 = filter-enable mask
510 and v4.8b, v4.8b, v26.8b // zero delta on disabled lanes
512 saddw v28.8h, v28.8h, v4.8b // p0 += delta (widened)
513 ssubw v22.8h, v22.8h, v4.8b // q0 -= delta
514 sqxtun v16.8b, v28.8h // saturating narrow -> new p0
515 sqxtun v17.8b, v22.8h // new q0
// void deblock_h_chroma_mbaff_neon( pix (x0), stride (x1), alpha (w2),
//                                   beta (w3), tc0 )
// MBAFF vertical chroma edge: only 4 rows are filtered.  x4 is used as the
// load pointer -- presumably set to x0 minus the p-side offset in elided
// lines (519-523), which also hold the ld1 of v19/q1; confirm upstream.
// Stores write the two middle CbCr pairs (new p0/q0) back per row.
// NOTE(review): trailing ret/endfunc elided from this view.
518 function x264_deblock_h_chroma_mbaff_neon, export=1
519 h264_loop_filter_start
524 ld1 {v18.8b}, [x4], x1
525 ld1 {v16.8b}, [x4], x1
526 ld1 {v17.8b}, [x4], x1
// After the transpose: v18=p1 v16=p0 v17=q0 v19=q1 (halfword = CbCr pair).
529 transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31
531 h264_loop_filter_chroma8
533 st2 {v16.h,v17.h}[0], [x0], x1 // interleave new p0/q0 back into each row
534 st2 {v16.h,v17.h}[1], [x0], x1
535 st2 {v16.h,v17.h}[2], [x0], x1
536 st2 {v16.h,v17.h}[3], [x0]
// Intra (bS = 4) chroma filter.  Register contract:
//   v18 = p1, v16 = p0 | v17 = q0, v19 = q1; v30 = alpha, v31 = beta.
// Computes p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2
// and merges them under the alpha/beta condition mask.
// NOTE(review): the \width parameter is declared but no .if \width use is
// visible -- the width==8 specialisation (and the .endm) appears to fall
// in elided lines (550, 553, 558, 563, 566, 569, 572, 575+).
541 .macro h264_loop_filter_chroma_intra, width=16
542 uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
543 uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
544 uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0)
545 cmhi v26.16b, v30.16b, v26.16b // < alpha
546 cmhi v27.16b, v31.16b, v27.16b // < beta
547 cmhi v28.16b, v31.16b, v28.16b // < beta
548 and v26.16b, v26.16b, v27.16b
549 and v26.16b, v26.16b, v28.16b // v26 = filter-enable mask
551 ushll v4.8h, v18.8b, #1 // 2*p1
552 ushll v6.8h, v19.8b, #1 // 2*q1
554 ushll2 v5.8h, v18.16b, #1
555 ushll2 v7.8h, v19.16b, #1
556 uaddl2 v21.8h, v16.16b, v19.16b // p0 + q1
557 uaddl2 v23.8h, v17.16b, v18.16b // q0 + p1
559 uaddl v20.8h, v16.8b, v19.8b
560 uaddl v22.8h, v17.8b, v18.8b
561 add v20.8h, v20.8h, v4.8h // mlal?
562 add v22.8h, v22.8h, v6.8h
564 add v21.8h, v21.8h, v5.8h
565 add v23.8h, v23.8h, v7.8h
567 uqrshrn v24.8b, v20.8h, #2 // (2*p1 + p0 + q1 + 2) >> 2 = p0'
568 uqrshrn v25.8b, v22.8h, #2 // q0'
570 uqrshrn2 v24.16b, v21.8h, #2
571 uqrshrn2 v25.16b, v23.8h, #2
573 bit v16.16b, v24.16b, v26.16b // select p0' where mask set
574 bit v17.16b, v25.16b, v26.16b // select q0'
// void deblock_v_chroma_intra_neon( pix (x0), stride (x1), alpha (w2), beta (w3) )
// Horizontal chroma edge, intra strength: loads p1/p0/q0 (q1 load at the
// elided line 584), filters, stores the changed p0/q0 rows.
// NOTE(review): trailing ret/endfunc elided from this view.
577 function x264_deblock_v_chroma_intra_neon, export=1
578 h264_loop_filter_start_intra
580 sub x0, x0, x1, lsl #1 // back up two rows to p1
581 ld1 {v18.16b}, [x0], x1 // p1
582 ld1 {v16.16b}, [x0], x1 // p0
583 ld1 {v17.16b}, [x0], x1 // q0
586 h264_loop_filter_chroma_intra
588 sub x0, x0, x1, lsl #1 // back to the p0 row
589 st1 {v16.16b}, [x0], x1 // new p0
590 st1 {v17.16b}, [x0], x1 // new q0
// void deblock_h_chroma_intra_mbaff_neon( pix (x0), stride (x1),
//                                         alpha (w2), beta (w3) )
// MBAFF vertical chroma edge, intra strength, 4 rows.  x4 serves as the
// load pointer -- presumably x0 minus the p-side offset, computed in
// elided lines (597-599); confirm upstream.
// NOTE(review): trailing ret/endfunc elided from this view.
595 function x264_deblock_h_chroma_intra_mbaff_neon, export=1
596 h264_loop_filter_start_intra
600 ld1 {v18.8b}, [x4], x1
601 ld1 {v16.8b}, [x4], x1
602 ld1 {v17.8b}, [x4], x1
603 ld1 {v19.8b}, [x4], x1
// After the transpose: v18=p1 v16=p0 v17=q0 v19=q1 (halfword = CbCr pair).
605 transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29
607 h264_loop_filter_chroma_intra, width=8
609 st2 {v16.h,v17.h}[0], [x0], x1 // interleave new p0/q0 back into each row
610 st2 {v16.h,v17.h}[1], [x0], x1
611 st2 {v16.h,v17.h}[2], [x0], x1
612 st2 {v16.h,v17.h}[3], [x0], x1
// void deblock_h_chroma_intra_neon( pix (x0), stride (x1), alpha (w2), beta (w3) )
// Vertical chroma edge, intra strength: loads 8 rows of 8 bytes via x4
// (presumably x0 minus the p-side offset, computed in elided lines
// 619-621; confirm upstream), transposes, filters, and writes the two
// middle CbCr pairs (new p0/q0) back to each row through x0.
// NOTE(review): trailing ret/endfunc elided from this view.
617 function x264_deblock_h_chroma_intra_neon, export=1
618 h264_loop_filter_start_intra
622 ld1 {v18.d}[0], [x4], x1
623 ld1 {v16.d}[0], [x4], x1
624 ld1 {v17.d}[0], [x4], x1
625 ld1 {v19.d}[0], [x4], x1
626 ld1 {v18.d}[1], [x4], x1
627 ld1 {v16.d}[1], [x4], x1
628 ld1 {v17.d}[1], [x4], x1
629 ld1 {v19.d}[1], [x4], x1
// After the transpose: v18=p1 v16=p0 v17=q0 v19=q1 (halfword = CbCr pair).
631 transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
633 h264_loop_filter_chroma_intra
635 st2 {v16.h,v17.h}[0], [x0], x1
636 st2 {v16.h,v17.h}[1], [x0], x1
637 st2 {v16.h,v17.h}[2], [x0], x1
638 st2 {v16.h,v17.h}[3], [x0], x1
639 st2 {v16.h,v17.h}[4], [x0], x1
640 st2 {v16.h,v17.h}[5], [x0], x1
641 st2 {v16.h,v17.h}[6], [x0], x1
642 st2 {v16.h,v17.h}[7], [x0], x1
// void deblock_h_chroma_422_intra_neon( pix (x0), stride (x1),
//                                       alpha (w2), beta (w3) )
// 4:2:2 vertical chroma edge, intra strength: same as the 4:2:0 version
// but processes two 8-row halves back to back (16 rows total).  x4 is the
// load pointer, presumably x0 minus the p-side offset set up in elided
// lines (649-651); confirm upstream.
// NOTE(review): trailing ret/endfunc elided from this view.
647 function x264_deblock_h_chroma_422_intra_neon, export=1
648 h264_loop_filter_start_intra
// --- first 8 rows ---
652 ld1 {v18.d}[0], [x4], x1
653 ld1 {v16.d}[0], [x4], x1
654 ld1 {v17.d}[0], [x4], x1
655 ld1 {v19.d}[0], [x4], x1
656 ld1 {v18.d}[1], [x4], x1
657 ld1 {v16.d}[1], [x4], x1
658 ld1 {v17.d}[1], [x4], x1
659 ld1 {v19.d}[1], [x4], x1
661 transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
663 h264_loop_filter_chroma_intra
665 st2 {v16.h,v17.h}[0], [x0], x1
666 st2 {v16.h,v17.h}[1], [x0], x1
667 st2 {v16.h,v17.h}[2], [x0], x1
668 st2 {v16.h,v17.h}[3], [x0], x1
669 st2 {v16.h,v17.h}[4], [x0], x1
670 st2 {v16.h,v17.h}[5], [x0], x1
671 st2 {v16.h,v17.h}[6], [x0], x1
672 st2 {v16.h,v17.h}[7], [x0], x1
// --- second 8 rows (x4/x0 have advanced past the first half) ---
674 ld1 {v18.d}[0], [x4], x1
675 ld1 {v16.d}[0], [x4], x1
676 ld1 {v17.d}[0], [x4], x1
677 ld1 {v19.d}[0], [x4], x1
678 ld1 {v18.d}[1], [x4], x1
679 ld1 {v16.d}[1], [x4], x1
680 ld1 {v17.d}[1], [x4], x1
681 ld1 {v19.d}[1], [x4], x1
683 transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
685 h264_loop_filter_chroma_intra
687 st2 {v16.h,v17.h}[0], [x0], x1
688 st2 {v16.h,v17.h}[1], [x0], x1
689 st2 {v16.h,v17.h}[2], [x0], x1
690 st2 {v16.h,v17.h}[3], [x0], x1
691 st2 {v16.h,v17.h}[4], [x0], x1
692 st2 {v16.h,v17.h}[5], [x0], x1
693 st2 {v16.h,v17.h}[6], [x0], x1
694 st2 {v16.h,v17.h}[7], [x0], x1
699 //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
700 // int8_t ref[2][X264_SCAN8_LUMA_SIZE],
701 // int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
702 // uint8_t bs[2][8][4], int mvy_limit,
// Computes the per-edge boundary-strength bytes bs[2][8][4] from the
// nnz (x0), ref (x1) and mv (x2) arrays; output pointer in x3,
// mvy_limit in w4 (see the C prototype in the comment above the function).
// General shape of each pass below:
//   - build "ref differs" / "mv component exceeds limit" masks into v4/v5
//     (one register per edge direction),
//   - then merge with the nnz-based strength (nnz ? 2 : mv/ref ? 1 : 0)
//     and store bs[1] and a transposed bs[0].
// NOTE(review): this body is heavily elided -- the loop labels/branches
// over the two reference lists, several mask reductions (lines 705-715,
// 750-752, 755-757, 760, 764-768, 771-778, 781, 788, 790, 804+) and the
// x6/x7/v6/v7 setup are not visible; `unzip` is presumably a local macro
// (uzp1/uzp2 pair) defined outside this chunk.  Confirm against upstream
// before editing anything here.
704 function x264_deblock_strength_neon, export=1
708 sub w4, w4, #(1<<8)-3 // bias mvy_limit for the uqsub/byte trick below
// --- ref[] pass: flag edges whose reference indices differ ---
716 ld1 {v31.d}[1], [x1], #8
717 ld1 {v1.16b}, [x1], #16
719 ld1 {v2.16b}, [x1], #16
720 ext v3.16b, v0.16b, v1.16b, #15 // shift-by-one neighbour for left edges
721 ext v0.16b, v0.16b, v2.16b, #15
722 unzip v21.4s, v22.4s, v1.4s, v2.4s
723 unzip v23.4s, v20.4s, v3.4s, v0.4s
724 ext v21.16b, v31.16b, v22.16b, #12 // top neighbours for horizontal edges
726 eor v0.16b, v20.16b, v22.16b // nonzero where refs differ
727 eor v1.16b, v21.16b, v22.16b
728 orr v4.16b, v4.16b, v0.16b // accumulate into per-direction flags
729 orr v5.16b, v5.16b, v1.16b
// --- mv[] pass: flag edges whose motion vectors differ too much ---
731 ld1 {v21.8h}, [x2], #16 // mv + 0x10
732 ld1 {v19.8h}, [x2], #16 // mv + 0x20
733 ld1 {v22.8h}, [x2], #16 // mv + 0x30
734 ld1 {v18.8h}, [x2], #16 // mv + 0x40
735 ld1 {v23.8h}, [x2], #16 // mv + 0x50
736 ext v19.16b, v19.16b, v22.16b, #12 // left-neighbour mvs (shift by one mv = 4 bytes)
737 ext v18.16b, v18.16b, v23.16b, #12
738 sabd v0.8h, v22.8h, v19.8h // |mv - mv_left|
739 ld1 {v19.8h}, [x2], #16 // mv + 0x60
740 sabd v1.8h, v23.8h, v18.8h
741 ld1 {v24.8h}, [x2], #16 // mv + 0x70
743 ld1 {v18.8h}, [x2], #16 // mv + 0x80
744 ld1 {v25.8h}, [x2], #16 // mv + 0x90
746 ext v19.16b, v19.16b, v24.16b, #12
747 ext v18.16b, v18.16b, v25.16b, #12
748 sabd v1.8h, v24.8h, v19.8h
749 sabd v2.8h, v25.8h, v18.8h
753 uqsub v0.16b, v0.16b, v6.16b // nonzero iff |diff| exceeds limit (v6 set in elided lines)
754 uqsub v1.16b, v1.16b, v6.16b
// vertical (top-neighbour) mv differences
758 sabd v1.8h, v22.8h, v23.8h
759 orr v4.16b, v4.16b, v0.16b
761 sabd v0.8h, v21.8h, v22.8h
762 sabd v2.8h, v23.8h, v24.8h
763 sabd v3.8h, v24.8h, v25.8h
769 uqsub v0.16b, v0.16b, v6.16b
770 uqsub v1.16b, v1.16b, v6.16b
774 orr v5.16b, v5.16b, v0.16b
// --- nnz pass and final bs computation ---
779 ld1 {v31.d}[1], [x0], #8
780 ld1 {v1.16b}, [x0], #16
782 ld1 {v2.16b}, [x0], #16
783 ext v3.16b, v0.16b, v1.16b, #15
784 ext v0.16b, v0.16b, v2.16b, #15
785 unzip v21.4s, v22.4s, v1.4s, v2.4s
786 unzip v23.4s, v20.4s, v3.4s, v0.4s
787 ext v21.16b, v31.16b, v22.16b, #12
789 movrel x7, transpose_table // shuffle indices for the bs[0] store below
791 orr v0.16b, v20.16b, v22.16b // nonzero where either side has coeffs
792 orr v1.16b, v21.16b, v22.16b
793 umin v0.16b, v0.16b, v6.16b
794 umin v1.16b, v1.16b, v6.16b
795 umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0
796 umin v5.16b, v5.16b, v6.16b
797 add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0
798 add v1.16b, v1.16b, v1.16b
799 umax v4.16b, v4.16b, v0.16b // bs = max(nnz strength, mv/ref strength)
800 umax v5.16b, v5.16b, v1.16b
801 tbl v6.16b, {v4.16b}, v7.16b // reorder bs[0] via transpose_table (v7 load elided)
802 st1 {v5.16b}, [x3], x6 // bs[1]
803 st1 {v6.16b}, [x3] // bs[0]
807 const transpose_table