1 /****************************************************************************
2 * dct-a.S: aarch64 transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Byte-index table for the tbl shuffles in x264_zigzag_scan_4x4_frame_neon.
// Entries come in (2k, 2k+1) pairs, so each pair selects one 16-bit element
// out of the 32 source bytes.
29 const scan4x4_frame, align=4
30 .byte 0,1, 8,9, 2,3, 4,5
31 .byte 10,11, 16,17, 24,25, 18,19
32 .byte 12,13, 6,7, 14,15, 20,21
33 .byte 26,27, 28,29, 22,23, 30,31
// Byte-index table referenced by x264_zigzag_scan_4x4_field_neon.
// The 16 byte indices reorder eight 16-bit elements as 0,1,4,2,3,5,6,7.
36 const scan4x4_field, align=4
37 .byte 0,1, 2,3, 8,9, 4,5
38 .byte 6,7, 10,11, 12,13, 14,15
41 const sub4x4_frame, align=4
48 const sub4x4_field, align=4
// SUMSUB_SHR shift, sum, sub, a, b, t0, t1
55 // sum = a + (b >> shift);  sub = (a >> shift) - b
// t0, t1: two extra register operands supplied by the caller.
56 .macro SUMSUB_SHR shift sum sub a b t0 t1
// SUMSUB_SHR2 shift, sum, sub, a, b, t0, t1
63 // sum = (a >> shift) + b;  sub = a - (b >> shift)
// t0, t1: two extra register operands supplied by the caller.
64 .macro SUMSUB_SHR2 shift sum sub a b t0 t1
// SUMSUB_15 a, b, ma, mb, t0, t1
71 // a += 1.5*ma;  b -= 1.5*mb
// t0, t1: two extra register operands supplied by the caller.
72 .macro SUMSUB_15 a b ma mb t0 t1
// x264_dct4x4dc_neon
// In-place transform of sixteen 16-bit values.
//   x0: pointer to the 16 halfwords; read at entry, overwritten with the result.
// Uses v0-v7, v16, v17 and reads v31.
// NOTE(review): v31 is read below but never written in the lines shown here;
// it presumably holds a rounding constant -- confirm against the full source.
// SUMSUB_AB and transpose are macros defined outside this file.
82 function x264_dct4x4dc_neon, export=1
83 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// First pass: two levels of SUMSUB_AB butterflies across the four vectors.
85 SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
86 SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
87 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
88 SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
// Regroup 16-bit lanes, run one more butterfly level, then regroup 32-bit lanes.
89 transpose v4.4h, v6.4h, v0.4h, v2.4h
90 transpose v5.4h, v7.4h, v1.4h, v3.4h
91 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
92 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
93 transpose v4.2s, v5.2s, v0.2s, v1.2s
94 transpose v6.2s, v7.2s, v2.2s, v3.2s
// Final level with a built-in halving: srhadd is a rounding halving add,
// shsub a truncating halving subtract, so v31 is added to the minuend first.
95 add v16.4h, v4.4h, v31.4h
96 add v17.4h, v6.4h, v31.4h
97 srhadd v0.4h, v4.4h, v5.4h
98 shsub v1.4h, v16.4h, v5.4h
99 shsub v2.4h, v17.4h, v7.4h
100 srhadd v3.4h, v6.4h, v7.4h
101 st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// x264_idct4x4dc_neon
// In-place transform of sixteen 16-bit values; same butterfly/transpose
// structure as x264_dct4x4dc_neon but the last level is a plain SUMSUB_AB
// with no halving.
//   x0: pointer to the 16 halfwords; read at entry, overwritten with the result.
// Uses v0-v7. SUMSUB_AB and transpose are macros defined outside this file.
105 function x264_idct4x4dc_neon, export=1
106 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// First pass: two levels of butterflies.
107 SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
108 SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
109 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
110 SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
// Regroup 16-bit lanes, one butterfly level, then regroup 32-bit lanes.
111 transpose v4.4h, v6.4h, v0.4h, v2.4h
112 transpose v5.4h, v7.4h, v1.4h, v3.4h
113 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
114 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
115 transpose v4.2s, v5.2s, v0.2s, v1.2s
116 transpose v6.2s, v7.2s, v2.2s, v3.2s
// Last butterfly level writes the result vectors v0-v3.
117 SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
118 SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
119 st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// DCT_1D v0, v1, v2, v3, v4, v5, v6, v7
// One-dimensional forward transform step. At the call sites the first four
// operands receive the results and the last four carry the inputs.
// Note that \v6 and \v7 are also used as destinations below, so the input
// registers passed in those positions are overwritten.
123 .macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
124 SUMSUB_AB \v1, \v6, \v5, \v6
125 SUMSUB_AB \v3, \v7, \v4, \v7
// x264_sub4x4_dct_neon
// Computes the byte-wise difference of two 4x4 blocks and transforms it.
//   x0: destination, receives sixteen 16-bit coefficients
//   x1: first source block, four rows of 4 bytes, row step x3
//   x2: second source block, four rows of 4 bytes, row step x4
// NOTE(review): x3/x4 are used as row strides but are not set in the lines
// shown here -- confirm where they are initialised.
134 function x264_sub4x4_dct_neon, export=1
// Load rows alternately from both sources and widen the differences
// (usubl: unsigned bytes -> 16-bit lanes) as soon as a pair is available.
137 ld1 {v0.s}[0], [x1], x3
138 ld1 {v1.s}[0], [x2], x4
139 ld1 {v2.s}[0], [x1], x3
140 usubl v16.8h, v0.8b, v1.8b
141 ld1 {v3.s}[0], [x2], x4
142 ld1 {v4.s}[0], [x1], x3
143 usubl v17.8h, v2.8b, v3.8b
144 ld1 {v5.s}[0], [x2], x4
145 ld1 {v6.s}[0], [x1], x3
146 usubl v18.8h, v4.8b, v5.8b
147 ld1 {v7.s}[0], [x2], x4
148 usubl v19.8h, v6.8b, v7.8b
// Two 1-D passes with a 4x4 transpose in between (v4-v7 are the transpose
// scratch registers and then the outputs of the second pass).
150 DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
151 transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
152 DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
153 st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
// x264_sub8x4_dct_neon (file-local helper: declared without export=1)
// Computes the difference of two 8-wide, 4-row byte blocks and transforms it.
//   x0: destination; 64 bytes are written and x0 is advanced by 64
//   x1: first source, row step x3 (advanced by four rows)
//   x2: second source, row step x4 (advanced by four rows)
// The pointer updates are relied on by callers that invoke this helper
// several times in a row.
157 function x264_sub8x4_dct_neon
158 ld1 {v0.8b}, [x1], x3
159 ld1 {v1.8b}, [x2], x4
160 usubl v16.8h, v0.8b, v1.8b
161 ld1 {v2.8b}, [x1], x3
162 ld1 {v3.8b}, [x2], x4
163 usubl v17.8h, v2.8b, v3.8b
164 ld1 {v4.8b}, [x1], x3
165 ld1 {v5.8b}, [x2], x4
166 usubl v18.8h, v4.8b, v5.8b
167 ld1 {v6.8b}, [x1], x3
168 ld1 {v7.8b}, [x2], x4
169 usubl v19.8h, v6.8b, v7.8b
// First 1-D pass on all eight columns, then transpose.
171 DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
172 transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
// Second 1-D pass written out by hand:
//   v0 = s0 + s1, v1 = s0 - s1, v2 = 2*d0 + d1, v3 = d0 - 2*d1
// where s0/d0 come from (v0, v3) and s1/d1 from (v1, v2).
174 SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
175 SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
176 add v22.8h, v19.8h, v19.8h
177 add v21.8h, v18.8h, v18.8h
178 add v0.8h, v16.8h, v17.8h
179 sub v1.8h, v16.8h, v17.8h
181 add v2.8h, v22.8h, v18.8h
182 sub v3.8h, v19.8h, v21.8h
// Regroup 64-bit halves: the low halves of v0-v3 are stored first (v4, v5),
// then the high halves (v6, v7).
184 zip1 v4.2d, v0.2d, v2.2d
185 zip2 v6.2d, v0.2d, v2.2d
186 zip1 v5.2d, v1.2d, v3.2d
187 zip2 v7.2d, v1.2d, v3.2d
189 st1 {v4.8h}, [x0], #16
190 st1 {v5.8h}, [x0], #16
191 st1 {v6.8h}, [x0], #16
192 st1 {v7.8h}, [x0], #16
// x264_sub8x8_dct_neon
// Processes an 8x8 block as two consecutive 8x4 halves by running
// x264_sub8x4_dct_neon twice; the helper advances x0/x1/x2 itself.
// The second invocation is a tail branch, so the helper returns directly to
// this function's caller.
// NOTE(review): bl overwrites x30; the save/restore of the return address is
// not in the lines shown here -- confirm it exists in the full source.
196 function x264_sub8x8_dct_neon, export=1
200 bl x264_sub8x4_dct_neon
202 b x264_sub8x4_dct_neon
// x264_sub16x16_dct_neon
// Processes a 16x16 block with eight invocations of x264_sub8x4_dct_neon
// (two per 8x8 quadrant). The helper advances x0/x1/x2 by itself; between
// quadrants the source pointers are repositioned here.
// The final invocation is a tail branch.
// NOTE(review): bl overwrites x30; the save/restore of the return address and
// the stride setup are not in the lines shown here -- confirm in full source.
205 function x264_sub16x16_dct_neon, export=1
209 bl x264_sub8x4_dct_neon
210 bl x264_sub8x4_dct_neon
// Back up 8*STRIDE bytes and move 8 bytes to the right: next quadrant.
211 sub x1, x1, #8*FENC_STRIDE-8
212 sub x2, x2, #8*FDEC_STRIDE-8
213 bl x264_sub8x4_dct_neon
214 bl x264_sub8x4_dct_neon
217 bl x264_sub8x4_dct_neon
218 bl x264_sub8x4_dct_neon
// Same repositioning for the last quadrant.
219 sub x1, x1, #8*FENC_STRIDE-8
220 sub x2, x2, #8*FDEC_STRIDE-8
221 bl x264_sub8x4_dct_neon
223 b x264_sub8x4_dct_neon
// 8-point 1-D forward transform stage on 16-bit lanes.
// Inputs: v0-v7. Outputs: v0-v7. Temporaries: v16-v31.
// The sNM/dNM annotations name the sum/difference of inputs N and M.
228 SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34
229 SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25
230 SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16
231 SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07
// Even part: combine the sums.
233 SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2
234 SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3
// Odd part: a6 = (d07 + d34) - 1.5*d16, a5 = (d07 - d34) - 1.5*d25
// (1.5*x is formed as (x >> 1) + x).
236 SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5
237 sshr v23.8h, v21.8h, #1
238 sshr v18.8h, v16.8h, #1
239 add v23.8h, v23.8h, v21.8h
240 add v18.8h, v18.8h, v16.8h
241 sub v30.8h, v30.8h, v23.8h
242 sub v29.8h, v29.8h, v18.8h
// a4 = (d16 + d25) + 1.5*d07 (kept in v22), a7 = (d16 - d25) + 1.5*d34 (v31).
244 SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7
245 sshr v22.8h, v20.8h, #1
246 sshr v19.8h, v17.8h, #1
247 add v22.8h, v22.8h, v20.8h
248 add v19.8h, v19.8h, v17.8h
249 add v22.8h, v28.8h, v22.8h
250 add v31.8h, v31.8h, v19.8h
// Final combination into the eight outputs.
252 SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h
253 SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h
254 SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h
255 SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h
// x264_sub8x8_dct8_neon
// Loads two 8x8 byte blocks, forms their 16-bit differences in v0-v7,
// transposes them and stores 128 bytes of 16-bit values.
//   x0: destination; advanced by 128
//   x1: first source, row step x3 (advanced by eight rows)
//   x2: second source, row step x4 (advanced by eight rows)
// NOTE(review): the transform steps around the transpose and the setup of
// x3/x4 are not in the lines shown here -- confirm against the full source.
258 function x264_sub8x8_dct8_neon, export=1
// Loads are issued two rows ahead of the usubl that consumes them.
261 ld1 {v16.8b}, [x1], x3
262 ld1 {v17.8b}, [x2], x4
263 ld1 {v18.8b}, [x1], x3
264 ld1 {v19.8b}, [x2], x4
265 usubl v0.8h, v16.8b, v17.8b
266 ld1 {v20.8b}, [x1], x3
267 ld1 {v21.8b}, [x2], x4
268 usubl v1.8h, v18.8b, v19.8b
269 ld1 {v22.8b}, [x1], x3
270 ld1 {v23.8b}, [x2], x4
271 usubl v2.8h, v20.8b, v21.8b
272 ld1 {v24.8b}, [x1], x3
273 ld1 {v25.8b}, [x2], x4
274 usubl v3.8h, v22.8b, v23.8b
275 ld1 {v26.8b}, [x1], x3
276 ld1 {v27.8b}, [x2], x4
277 usubl v4.8h, v24.8b, v25.8b
278 ld1 {v28.8b}, [x1], x3
279 ld1 {v29.8b}, [x2], x4
280 usubl v5.8h, v26.8b, v27.8b
281 ld1 {v30.8b}, [x1], x3
282 ld1 {v31.8b}, [x2], x4
283 usubl v6.8h, v28.8b, v29.8b
284 usubl v7.8h, v30.8b, v31.8b
// v30/v31 are passed as the two trailing operands of the transpose macro.
287 transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
290 st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
291 st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
// x264_sub16x16_dct8_neon
// Processes a 16x16 block as four 8x8 quadrants by invoking
// x264_sub8x8_dct8_neon four times; that routine advances x0/x1/x2 itself,
// and the source pointers are repositioned here between quadrants.
// The final invocation is a tail branch.
// NOTE(review): bl overwrites x30; the save/restore of the return address is
// not in the lines shown here -- confirm it exists in the full source.
295 function x264_sub16x16_dct8_neon, export=1
297 bl X(x264_sub8x8_dct8_neon)
// Back up 8*STRIDE bytes and move 8 bytes to the right.
298 sub x1, x1, #FENC_STRIDE*8 - 8
299 sub x2, x2, #FDEC_STRIDE*8 - 8
300 bl X(x264_sub8x8_dct8_neon)
303 bl X(x264_sub8x8_dct8_neon)
305 sub x1, x1, #FENC_STRIDE*8 - 8
306 sub x2, x2, #FDEC_STRIDE*8 - 8
307 b X(x264_sub8x8_dct8_neon)
// IDCT_1D d4, d5, d6, d7, d0, d1, d2, d3
// At the call sites the first four operands receive the results and the
// last four carry the inputs; each call is followed by two SUMSUB_AB
// invocations that finish the pass.
311 // First part of IDCT (minus final SUMSUB_AB)
312 .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
313 SUMSUB_AB \d4, \d5, \d0, \d2
// x264_add4x4_idct_neon
// Inverse-transforms sixteen 16-bit coefficients and adds the result to a
// 4x4 block of bytes.
//   x0: pixel block, four rows of 4 bytes with row step x2 (read and written;
//       x0 is left advanced by four rows on exit)
//   x1: sixteen 16-bit coefficients
// NOTE(review): x2 is used as the row stride but is not set in the lines
// shown here. Also, no narrowing instruction is visible between the uaddw
// results (16-bit lanes) and the 32-bit lane stores -- confirm both against
// the full source.
320 function x264_add4x4_idct_neon, export=1
322 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
// First pass. The pixel-row loads are interleaved with the arithmetic.
324 IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
325 ld1 {v28.s}[0], [x0], x2
326 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
327 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// From here on v2 and v3 are in swapped positions (note the operand order
// of the transpose, the second IDCT_1D and the final stores).
329 transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
// Second pass.
331 IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
332 ld1 {v29.s}[0], [x0], x2
333 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
334 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// Rounding right shift by 6.
336 srshr v0.4h, v0.4h, #6
337 srshr v1.4h, v1.4h, #6
338 ld1 {v31.s}[0], [x0], x2
339 srshr v2.4h, v2.4h, #6
340 srshr v3.4h, v3.4h, #6
341 ld1 {v30.s}[0], [x0], x2
// Rewind x0 by four rows, then add the pixels (widened from bytes).
// v31 holds the third row and v30 the fourth, matching the v3/v2 swap.
343 sub x0, x0, x2, lsl #2
344 uaddw v0.8h, v0.8h, v28.8b
345 uaddw v1.8h, v1.8h, v29.8b
346 uaddw v2.8h, v2.8h, v30.8b
347 uaddw v3.8h, v3.8h, v31.8b
// Store rows in order: v0, v1, v3, v2.
353 st1 {v0.s}[0], [x0], x2
354 st1 {v1.s}[0], [x0], x2
355 st1 {v3.s}[0], [x0], x2
356 st1 {v2.s}[0], [x0], x2
// x264_add8x4_idct_neon
// Inverse-transforms 32 16-bit coefficients and adds the result to an
// 8-wide, 4-row block of bytes.
//   x0: pixel block, row step x2 (read and written; left advanced by 4 rows)
//   x1: coefficients; 64 bytes are consumed and x1 is advanced by 64
// The pointer updates are relied on by callers that invoke this routine
// several times in a row.
// NOTE(review): no narrowing instruction is visible between the uaddw
// results (16-bit lanes) and the .8b stores, and x2 is not set in the lines
// shown here -- confirm both against the full source.
360 function x264_add8x4_idct_neon, export=1
361 ld1 {v0.8h,v1.8h}, [x1], #32
362 ld1 {v2.8h,v3.8h}, [x1], #32
// Regroup the 64-bit halves of the loaded vectors before the first pass.
363 transpose v20.2d, v21.2d, v0.2d, v2.2d
364 transpose v22.2d, v23.2d, v1.2d, v3.2d
// First pass.
365 IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
366 SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
367 SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
369 transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
// Second pass.
371 IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
372 SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
373 SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
// Rounding right shift by 6, interleaved with the pixel-row loads.
375 srshr v0.8h, v0.8h, #6
376 ld1 {v28.8b}, [x0], x2
377 srshr v1.8h, v1.8h, #6
378 ld1 {v29.8b}, [x0], x2
379 srshr v2.8h, v2.8h, #6
380 ld1 {v30.8b}, [x0], x2
381 srshr v3.8h, v3.8h, #6
382 ld1 {v31.8b}, [x0], x2
// Rewind x0 by four rows, add the pixels (widened from bytes) and store.
384 sub x0, x0, x2, lsl #2
385 uaddw v0.8h, v0.8h, v28.8b
386 uaddw v1.8h, v1.8h, v29.8b
387 uaddw v2.8h, v2.8h, v30.8b
388 uaddw v3.8h, v3.8h, v31.8b
392 st1 {v0.8b}, [x0], x2
394 st1 {v1.8b}, [x0], x2
396 st1 {v2.8b}, [x0], x2
397 st1 {v3.8b}, [x0], x2
// x264_add8x8_idct_neon
// Processes an 8x8 block as two consecutive 8x4 halves by running
// x264_add8x4_idct_neon twice; that routine advances x0 and x1 itself.
// The second invocation is a tail branch.
// NOTE(review): bl overwrites x30; the save/restore of the return address is
// not in the lines shown here -- confirm it exists in the full source.
401 function x264_add8x8_idct_neon, export=1
404 bl X(x264_add8x4_idct_neon)
406 b X(x264_add8x4_idct_neon)
// x264_add16x16_idct_neon
// Processes a 16x16 block with eight invocations of x264_add8x4_idct_neon
// (two per 8x8 quadrant). That routine advances x0 and x1 itself; the pixel
// pointer is repositioned here between quadrants.
// The final invocation is a tail branch.
// NOTE(review): bl overwrites x30; the save/restore of the return address is
// not in the lines shown here -- confirm it exists in the full source.
409 function x264_add16x16_idct_neon, export=1
412 bl X(x264_add8x4_idct_neon)
413 bl X(x264_add8x4_idct_neon)
// Back up 8*FDEC_STRIDE bytes and move 8 bytes to the right.
414 sub x0, x0, #8*FDEC_STRIDE-8
415 bl X(x264_add8x4_idct_neon)
416 bl X(x264_add8x4_idct_neon)
418 bl X(x264_add8x4_idct_neon)
419 bl X(x264_add8x4_idct_neon)
420 sub x0, x0, #8*FDEC_STRIDE-8
421 bl X(x264_add8x4_idct_neon)
423 b X(x264_add8x4_idct_neon)
// 8-point 1-D inverse transform stage on 16-bit lanes.
// Works on v16-v23 and leaves its results in v16-v23; v0-v3 hold
// intermediate values. The aN/bN annotations name those intermediates.
427 SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2
// Fetches v22/v23 from the coefficient pointer (x1 is advanced by 32).
429 ld1 {v22.8h,v23.8h}, [x1], #32
431 SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4
432 SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h
433 SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1
434 SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h
435 SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3
437 SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5
438 SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7
440 SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6
441 SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4
// Final butterflies: each line produces one output pair
// (v16/v23, v17/v22, v18/v21, v19/v20).
443 SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h
444 SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h
445 SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h
446 SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h
// x264_add8x8_idct8_neon
// Inverse-transforms an 8x8 block of 16-bit coefficients and adds the result
// to an 8x8 block of bytes.
//   x0: pixel block, eight rows of 8 bytes with row step x2 (read and
//       written; left advanced by eight rows on exit)
//   x1: coefficients, consumed with post-increment
// NOTE(review): the 1-D transform steps, the load of v22/v23, the narrowing
// of the 16-bit sums before the .8b stores and the setup of x2 are not in the
// lines shown here -- confirm against the full source.
449 function x264_add8x8_idct8_neon, export=1
451 ld1 {v16.8h,v17.8h}, [x1], #32
452 ld1 {v18.8h,v19.8h}, [x1], #32
453 ld1 {v20.8h,v21.8h}, [x1], #32
// v30/v31 are passed as the two trailing operands of the transpose macro.
457 transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
// Rounding right shift by 6, interleaved with the pixel-row loads.
461 ld1 {v0.8b}, [x0], x2
462 srshr v16.8h, v16.8h, #6
463 ld1 {v1.8b}, [x0], x2
464 srshr v17.8h, v17.8h, #6
465 ld1 {v2.8b}, [x0], x2
466 srshr v18.8h, v18.8h, #6
467 ld1 {v3.8b}, [x0], x2
468 srshr v19.8h, v19.8h, #6
469 ld1 {v4.8b}, [x0], x2
470 srshr v20.8h, v20.8h, #6
471 ld1 {v5.8b}, [x0], x2
472 srshr v21.8h, v21.8h, #6
473 ld1 {v6.8b}, [x0], x2
474 srshr v22.8h, v22.8h, #6
475 ld1 {v7.8b}, [x0], x2
476 srshr v23.8h, v23.8h, #6
// Rewind x0 by eight rows before writing back.
477 sub x0, x0, x2, lsl #3
// Add the pixels (widened from bytes) to the residual.
479 uaddw v16.8h, v16.8h, v0.8b
480 uaddw v17.8h, v17.8h, v1.8b
481 uaddw v18.8h, v18.8h, v2.8b
485 uaddw v19.8h, v19.8h, v3.8b
486 st1 {v0.8b}, [x0], x2
487 uaddw v20.8h, v20.8h, v4.8b
488 st1 {v1.8b}, [x0], x2
489 uaddw v21.8h, v21.8h, v5.8b
490 st1 {v2.8b}, [x0], x2
493 uaddw v22.8h, v22.8h, v6.8b
494 uaddw v23.8h, v23.8h, v7.8b
495 st1 {v3.8b}, [x0], x2
497 st1 {v4.8b}, [x0], x2
500 st1 {v5.8b}, [x0], x2
501 st1 {v6.8b}, [x0], x2
502 st1 {v7.8b}, [x0], x2
// x264_add16x16_idct8_neon
// Processes a 16x16 block as four 8x8 quadrants by invoking
// x264_add8x8_idct8_neon four times; that routine advances x0 and x1 itself,
// and the pixel pointer is repositioned here between quadrants.
// The final invocation is a tail branch.
// NOTE(review): bl overwrites x30; the save/restore of the return address is
// not in the lines shown here -- confirm it exists in the full source.
506 function x264_add16x16_idct8_neon, export=1
508 bl X(x264_add8x8_idct8_neon)
// Back up 8*FDEC_STRIDE bytes and move 8 bytes to the right.
509 sub x0, x0, #8*FDEC_STRIDE-8
510 bl X(x264_add8x8_idct8_neon)
512 bl X(x264_add8x8_idct8_neon)
513 sub x0, x0, #8*FDEC_STRIDE-8
515 b X(x264_add8x8_idct8_neon)
// x264_add8x8_idct_dc_neon
// Applies per-block offsets to an 8x8 block of bytes using unsigned
// saturating arithmetic: rows 0-3 get "+v20, -v22", rows 4-7 get "+v21, -v23".
//   x0: pixel block, eight rows of 8 bytes with row step x2 (read and
//       written; left advanced by eight rows on exit)
// The rewind below subtracts 8*FDEC_STRIDE, so x2 is expected to equal
// FDEC_STRIDE.
// NOTE(review): the load of v16, the derivation of v20-v23 from it and the
// setup of x2 are not in the lines shown here -- confirm in the full source.
518 function x264_add8x8_idct_dc_neon, export=1
521 ld1 {v0.8b}, [x0], x2
// Rounding right shift by 6 of the 16-bit values in v16.
522 srshr v16.4h, v16.4h, #6
523 ld1 {v1.8b}, [x0], x2
526 ld1 {v2.8b}, [x0], x2
529 ld1 {v3.8b}, [x0], x2
// Pack the low 64-bit halves of two registers into one.
530 trn1 v20.2d, v20.2d, v21.2d
531 ld1 {v4.8b}, [x0], x2
532 trn1 v21.2d, v22.2d, v23.2d
533 ld1 {v5.8b}, [x0], x2
535 ld1 {v6.8b}, [x0], x2
537 ld1 {v7.8b}, [x0], x2
// Rewind x0 to the first row.
539 sub x0, x0, #8*FDEC_STRIDE
// Saturating narrow to unsigned bytes: negative 16-bit values become 0.
541 sqxtun v20.8b, v20.8h
542 sqxtun v21.8b, v21.8h
543 sqxtun v22.8b, v22.8h
544 sqxtun v23.8b, v23.8h
// Saturating add of v20/v21 ...
546 uqadd v0.8b, v0.8b, v20.8b
547 uqadd v1.8b, v1.8b, v20.8b
548 uqadd v2.8b, v2.8b, v20.8b
549 uqadd v3.8b, v3.8b, v20.8b
550 uqadd v4.8b, v4.8b, v21.8b
551 uqadd v5.8b, v5.8b, v21.8b
552 uqadd v6.8b, v6.8b, v21.8b
553 uqadd v7.8b, v7.8b, v21.8b
// ... followed by a saturating subtract of v22/v23.
554 uqsub v0.8b, v0.8b, v22.8b
555 uqsub v1.8b, v1.8b, v22.8b
556 uqsub v2.8b, v2.8b, v22.8b
557 uqsub v3.8b, v3.8b, v22.8b
558 uqsub v4.8b, v4.8b, v23.8b
559 uqsub v5.8b, v5.8b, v23.8b
560 uqsub v6.8b, v6.8b, v23.8b
561 uqsub v7.8b, v7.8b, v23.8b
563 st1 {v0.8b}, [x0], x2
564 st1 {v1.8b}, [x0], x2
565 st1 {v2.8b}, [x0], x2
566 st1 {v3.8b}, [x0], x2
567 st1 {v4.8b}, [x0], x2
568 st1 {v5.8b}, [x0], x2
569 st1 {v6.8b}, [x0], x2
570 st1 {v7.8b}, [x0], x2
// ADD16x4_IDCT_DC dc
// Applies per-block offsets to four rows of 16 bytes using unsigned
// saturating arithmetic (uqadd of v20, then uqsub of v21).
//   x0: read pointer, row step x3 (advanced by four rows)
//   x2: write pointer, row step x3 (advanced by four rows)
// NOTE(review): the use of the \dc argument and the derivation of v24-v27
// are not in the lines shown here -- confirm against the full source.
574 .macro ADD16x4_IDCT_DC dc
575 ld1 {v4.16b}, [x0], x3
578 ld1 {v5.16b}, [x0], x3
581 ld1 {v6.16b}, [x0], x3
// Pack the low 64-bit halves of two registers into one.
582 trn1 v24.2d, v24.2d, v25.2d
583 ld1 {v7.16b}, [x0], x3
584 trn1 v25.2d, v26.2d, v27.2d
// Saturating narrow to unsigned bytes (negative values become 0);
// sqxtun2 fills the upper eight bytes of the destination.
588 sqxtun v20.8b, v24.8h
589 sqxtun v21.8b, v26.8h
590 sqxtun2 v20.16b, v25.8h
591 sqxtun2 v21.16b, v27.8h
593 uqadd v4.16b, v4.16b, v20.16b
594 uqadd v5.16b, v5.16b, v20.16b
595 uqadd v6.16b, v6.16b, v20.16b
596 uqadd v7.16b, v7.16b, v20.16b
598 uqsub v4.16b, v4.16b, v21.16b
599 uqsub v5.16b, v5.16b, v21.16b
600 uqsub v6.16b, v6.16b, v21.16b
601 st1 {v4.16b}, [x2], x3
602 uqsub v7.16b, v7.16b, v21.16b
603 st1 {v5.16b}, [x2], x3
604 st1 {v6.16b}, [x2], x3
605 st1 {v7.16b}, [x2], x3
// x264_add16x16_idct_dc_neon
//   x1: sixteen 16-bit values, loaded into v0-v3
// Each value gets a rounding right shift by 6.
// NOTE(review): the remainder of this routine (how the shifted values are
// applied to the pixels) is not in the lines shown here.
608 function x264_add16x16_idct_dc_neon, export=1
612 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
613 srshr v0.4h, v0.4h, #6
614 srshr v1.4h, v1.4h, #6
617 srshr v2.4h, v2.4h, #6
619 srshr v3.4h, v3.4h, #6
// x264_sub8x8_dct_dc_neon
// Forms the differences of two 8x8 byte blocks, accumulates them per group
// of four rows and combines the sums with add/sub butterflies.
//   x1: first source, row step x3 (advanced by eight rows)
//   x2: second source, row step x4 (advanced by eight rows)
// NOTE(review): the setup of x3/x4 and the final store of v0 are not in the
// lines shown here -- confirm against the full source.
625 function x264_sub8x8_dct_dc_neon, export=1
// Rows 0-3: widened differences in v16-v19, running column sums in v0.
628 ld1 {v16.8b}, [x1], x3
629 ld1 {v17.8b}, [x2], x4
630 usubl v16.8h, v16.8b, v17.8b
631 ld1 {v18.8b}, [x1], x3
632 ld1 {v19.8b}, [x2], x4
633 usubl v17.8h, v18.8b, v19.8b
634 ld1 {v20.8b}, [x1], x3
635 ld1 {v21.8b}, [x2], x4
636 usubl v18.8h, v20.8b, v21.8b
637 ld1 {v22.8b}, [x1], x3
638 add v0.8h, v16.8h, v17.8h
639 ld1 {v23.8b}, [x2], x4
640 usubl v19.8h, v22.8b, v23.8b
641 ld1 {v24.8b}, [x1], x3
642 add v0.8h, v0.8h, v18.8h
643 ld1 {v25.8b}, [x2], x4
// Rows 4-7: widened differences in v20-v23, running column sums in v1.
644 usubl v20.8h, v24.8b, v25.8b
645 ld1 {v26.8b}, [x1], x3
646 add v0.8h, v0.8h, v19.8h
647 ld1 {v27.8b}, [x2], x4
648 usubl v21.8h, v26.8b, v27.8b
649 ld1 {v28.8b}, [x1], x3
650 ld1 {v29.8b}, [x2], x4
651 usubl v22.8h, v28.8b, v29.8b
652 ld1 {v30.8b}, [x1], x3
653 add v1.8h, v20.8h, v21.8h
654 ld1 {v31.8b}, [x2], x4
655 usubl v23.8h, v30.8b, v31.8b
656 add v1.8h, v1.8h, v22.8h
657 add v1.8h, v1.8h, v23.8h
// Two rounds of "regroup 64-bit halves, then add/sub" ...
659 transpose v2.2d, v3.2d, v0.2d, v1.2d
661 add v0.8h, v2.8h, v3.8h
662 sub v1.8h, v2.8h, v3.8h
664 transpose v2.2d, v3.2d, v0.2d, v1.2d
666 add v0.8h, v2.8h, v3.8h
667 sub v1.8h, v2.8h, v3.8h
// ... then pairwise adds reduce adjacent lanes.
669 transpose v2.2d, v3.2d, v0.2d, v1.2d
671 addp v0.8h, v2.8h, v3.8h
672 addp v0.8h, v0.8h, v0.8h
// x264_zigzag_interleave_8x8_cavlc_neon
// De-interleaves 64 16-bit coefficients by four (element i goes to group
// i mod 4) and writes one flag byte per group.
//   x0: destination, four groups of 16 coefficients (advanced by 128)
//   x1: source, 64 coefficients (advanced by 128)
//   x2: flag output; bytes are written at x2, x2+1, x2+1+x3 and x2+2+x3
// NOTE(review): x3 and v31 are read but not written in the lines shown here;
// v31 presumably holds a small constant used for both the compare and the
// mask -- confirm against the full source.
678 function x264_zigzag_interleave_8x8_cavlc_neon, export=1
// ld4 splits by element index mod 4: v0/v4 = group 0, v1/v5 = group 1, ...
681 ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
682 ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
// Per-group unsigned maximum, computed alongside the stores.
683 umax v16.8h, v0.8h, v4.8h
684 umax v17.8h, v1.8h, v5.8h
685 umax v18.8h, v2.8h, v6.8h
686 umax v19.8h, v3.8h, v7.8h
687 st1 {v0.8h}, [x0], #16
688 st1 {v4.8h}, [x0], #16
689 umaxp v16.8h, v16.8h, v17.8h
690 umaxp v18.8h, v18.8h, v19.8h
691 st1 {v1.8h}, [x0], #16
692 st1 {v5.8h}, [x0], #16
// After this umaxp each 32-bit lane of v16 covers exactly one group.
693 umaxp v16.8h, v16.8h, v18.8h
694 st1 {v2.8h}, [x0], #16
695 st1 {v6.8h}, [x0], #16
// Compare each group's lane against v31, then mask the result with v31.
696 cmhi v16.4s, v16.4s, v31.4s
697 st1 {v3.8h}, [x0], #16
698 and v16.16b, v16.16b, v31.16b
699 st1 {v7.8h}, [x0], #16
// Emit the low byte of each 32-bit lane.
700 st1 {v16.b}[0], [x2], #1
701 st1 {v16.b}[4], [x2], x3
702 st1 {v16.b}[8], [x2], #1
703 st1 {v16.b}[12], [x2]
// x264_zigzag_scan_4x4_frame_neon
// Reorders sixteen 16-bit coefficients according to the scan4x4_frame table.
//   x0: destination (32 bytes written)
//   x1: source (32 bytes read)
// Clobbers x2 (table address), v0-v3, v16, v17.
707 function x264_zigzag_scan_4x4_frame_neon, export=1
708 movrel x2, scan4x4_frame
709 ld1 {v0.16b,v1.16b}, [x1]
710 ld1 {v16.16b,v17.16b}, [x2]
// Each tbl produces 16 output bytes by byte-indexing into the 32 source bytes.
711 tbl v2.16b, {v0.16b,v1.16b}, v16.16b
712 tbl v3.16b, {v0.16b,v1.16b}, v17.16b
713 st1 {v2.16b,v3.16b}, [x0]
// zigzag_sub_4x4 f, ac
// Emits the function x264_zigzag_sub_4x4<ac>_<f>_neon.
// Visible behaviour of the generated function:
//   - loads a 4x4 byte block from x1 (row step x9) into v0 and another from
//     x2 (row step x4) into v1;
//   - permutes both with the byte-index vector v16 and stores the widened
//     differences as sixteen 16-bit values at x0;
//   - copies the unpermuted rows of v0 to x6 (row step x4).
// NOTE(review): the setup of x4, x6, x9 and v16, the handling of the "ac"
// variant and the use of the v6 maximum are not in the lines shown here --
// confirm against the full source.
717 .macro zigzag_sub_4x4 f ac
718 function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
723 ld1 {v0.s}[0], [x1], x9
724 ld1 {v0.s}[1], [x1], x9
725 ld1 {v0.s}[2], [x1], x9
726 ld1 {v0.s}[3], [x1], x9
728 ld1 {v1.s}[0], [x2], x4
729 ld1 {v1.s}[1], [x2], x4
730 ld1 {v1.s}[2], [x2], x4
731 ld1 {v1.s}[3], [x2], x4
// Apply the same byte permutation to both blocks.
732 tbl v2.16b, {v0.16b}, v16.16b
733 tbl v3.16b, {v1.16b}, v16.16b
734 st1 {v0.s}[0], [x6], x4
// Widened differences: low eight bytes, then high eight bytes.
735 usubl v4.8h, v2.8b, v3.8b
742 usubl2 v5.8h, v2.16b, v3.16b
743 st1 {v0.s}[1], [x6], x4
// Lane-wise unsigned maximum of the two difference vectors.
744 umax v6.8h, v4.8h, v5.8h
746 st1 {v0.s}[2], [x6], x4
748 st1 {v0.s}[3], [x6], x4
750 st1 {v4.8h,v5.8h}, [x0]
// Instantiate the "ac" variants: x264_zigzag_sub_4x4ac_field_neon and
// x264_zigzag_sub_4x4ac_frame_neon.
757 zigzag_sub_4x4 field, ac
759 zigzag_sub_4x4 frame, ac
// x264_zigzag_scan_4x4_field_neon
// Copies sixteen 16-bit coefficients from x1 to x0, permuting only the first
// eight (v0); the second eight (v1) are stored unchanged.
//   x0: destination (32 bytes written)
//   x1: source (32 bytes read)
// NOTE(review): x2 receives the scan4x4_field address, but the load of the
// index vector v16 is not in the lines shown here -- confirm in full source.
761 function x264_zigzag_scan_4x4_field_neon, export=1
762 movrel x2, scan4x4_field
763 ld1 {v0.8h,v1.8h}, [x1]
765 tbl v0.16b, {v0.16b}, v16.16b
766 st1 {v0.8h,v1.8h}, [x0]
// x264_zigzag_scan_8x8_frame_neon
// Reorders 64 16-bit coefficients according to the scan8x8_frame table.
//   x0: destination (128 bytes written; x0 is advanced by 96)
//   x1: source (128 bytes read; x1 is advanced by 96)
// Clobbers x2 (table pointer), v0-v7, v16-v31.
// Each tbl can address at most four source registers (64 bytes), so every
// output vector is built from one four-register window. tbl yields 0 for an
// out-of-range index; the elements that live outside the chosen window are
// patched in afterwards with lane moves.
770 function x264_zigzag_scan_8x8_frame_neon, export=1
771 movrel x2, scan8x8_frame
// v0-v7: the eight source vectors of eight coefficients each.
772 ld1 {v0.8h,v1.8h}, [x1], #32
773 ld1 {v2.8h,v3.8h}, [x1], #32
774 ld1 {v4.8h,v5.8h}, [x1], #32
775 ld1 {v6.8h,v7.8h}, [x1]
// v16-v23: the eight 16-byte index vectors of the table.
776 ld1 {v16.16b,v17.16b}, [x2], #32
777 ld1 {v18.16b,v19.16b}, [x2], #32
778 ld1 {v20.16b,v21.16b}, [x2], #32
779 ld1 {v22.16b,v23.16b}, [x2], #32
780 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
781 tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
782 tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
783 tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
784 tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
785 tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
786 tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
787 tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
// Patch the lanes whose source element was outside the tbl window.
788 mov v25.h[6], v4.h[0]
789 mov v25.h[7], v5.h[0]
790 mov v26.h[0], v4.h[1]
791 mov v27.h[4], v7.h[0]
792 mov v28.h[7], v4.h[4]
793 mov v29.h[7], v3.h[6]
794 mov v30.h[0], v2.h[7]
795 mov v30.h[1], v3.h[7]
796 st1 {v24.8h,v25.8h}, [x0], #32
797 st1 {v26.8h,v27.8h}, [x0], #32
798 st1 {v28.8h,v29.8h}, [x0], #32
799 st1 {v30.8h,v31.8h}, [x0]
// Byte-index table for the tbl shuffles in x264_zigzag_scan_8x8_frame_neon.
// Z(z) expands to the two byte offsets of 16-bit element z.
// T(x,y) names the element in source vector x, lane y. T is redefined along
// the way so that x is taken relative to the first register of the tbl
// window used for that part of the table (v0, v3, v0 and v4 respectively).
803 #define Z(z) 2*(z), 2*(z)+1
804 #define T(x,y) Z(x*8+y)
805 const scan8x8_frame, align=5
806 .byte T(0,0), T(1,0), T(0,1), T(0,2)
807 .byte T(1,1), T(2,0), T(3,0), T(2,1)
808 .byte T(1,2), T(0,3), T(0,4), T(1,3)
809 .byte T(2,2), T(3,1), T(4,0), T(5,0)
810 .byte T(4,1), T(3,2), T(2,3), T(1,4)
811 .byte T(0,5), T(0,6), T(1,5), T(2,4)
// Window starting at source vector 3.
813 #define T(x,y) Z((x-3)*8+y)
814 .byte T(3,3), T(4,2), T(5,1), T(6,0)
815 .byte T(7,0), T(6,1), T(5,2), T(4,3)
// Window starting at source vector 0.
817 #define T(x,y) Z((x-0)*8+y)
818 .byte T(3,4), T(2,5), T(1,6), T(0,7)
819 .byte T(1,7), T(2,6), T(3,5), T(4,4)
// Window starting at source vector 4.
821 #define T(x,y) Z((x-4)*8+y)
822 .byte T(5,3), T(6,2), T(7,1), T(7,2)
823 .byte T(6,3), T(5,4), T(4,5), T(3,6)
824 .byte T(2,7), T(3,7), T(4,6), T(5,5)
825 .byte T(6,4), T(7,3), T(7,4), T(6,5)
826 .byte T(5,6), T(4,7), T(5,7), T(6,6)
827 .byte T(7,5), T(7,6), T(6,7), T(7,7)
// x264_zigzag_scan_8x8_field_neon
// Reorders 64 16-bit coefficients according to the scan8x8_field table.
//   x0: destination (128 bytes written; x0 is advanced by 96)
//   x1: source (128 bytes read; x1 is advanced by 96)
// Clobbers x2 (table pointer), v0-v7, v16-v31.
// Each output vector is built with tbl from a sliding window of source
// registers; the last output vector is assembled with ext instead.
// NOTE(review): v22 is used as an index vector below, but its load is not in
// the lines shown here -- confirm against the full source.
830 function x264_zigzag_scan_8x8_field_neon, export=1
831 movrel x2, scan8x8_field
832 ld1 {v0.8h,v1.8h}, [x1], #32
833 ld1 {v2.8h,v3.8h}, [x1], #32
834 ld1 {v4.8h,v5.8h}, [x1], #32
835 ld1 {v6.8h,v7.8h}, [x1]
836 ld1 {v16.16b,v17.16b}, [x2], #32
837 ld1 {v18.16b,v19.16b}, [x2], #32
838 ld1 {v20.16b,v21.16b}, [x2], #32
// Rotate v7 by 4 bytes so that its lanes 2-7 come first.
840 ext v31.16b, v7.16b, v7.16b, #4
841 tbl v24.16b, {v0.16b,v1.16b}, v16.16b
842 tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
843 tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
844 tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
845 tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
846 tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
847 tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
// v31 = last two lanes of v6 followed by lanes 2-7 of v7.
848 ext v31.16b, v6.16b, v31.16b, #12
849 st1 {v24.8h,v25.8h}, [x0], #32
850 st1 {v26.8h,v27.8h}, [x0], #32
851 st1 {v28.8h,v29.8h}, [x0], #32
852 st1 {v30.8h,v31.8h}, [x0]
// zigzag_sub8x8 f
// Emits the function x264_zigzag_sub_8x8_<f>_neon.
// Visible behaviour of the generated function:
//   - loads 8-byte rows from x1 (row step x5) into v0-v3 and from x2
//     (row step x6) into v4-v7;
//   - permutes both blocks with the same four index vectors read from x4;
//   - stores the widened differences as 64 16-bit values at x0;
//   - copies the unpermuted rows of v0-v3 to x7 (row step x6).
// NOTE(review): the setup of x4-x7, the eighth row load/store of each block
// and the use of the v20 maximum are not in the lines shown here -- confirm
// against the full source.
856 .macro zigzag_sub8x8 f
857 function x264_zigzag_sub_8x8_\f\()_neon, export=1
862 ld1 {v0.d}[0], [x1], x5
863 ld1 {v0.d}[1], [x1], x5
864 ld1 {v1.d}[0], [x1], x5
865 ld1 {v1.d}[1], [x1], x5
866 ld1 {v2.d}[0], [x1], x5
867 ld1 {v2.d}[1], [x1], x5
868 ld1 {v3.d}[0], [x1], x5
870 ld1 {v4.d}[0], [x2], x6
871 ld1 {v4.d}[1], [x2], x6
872 ld1 {v5.d}[0], [x2], x6
873 ld1 {v5.d}[1], [x2], x6
874 ld1 {v6.d}[0], [x2], x6
875 ld1 {v6.d}[1], [x2], x6
876 ld1 {v7.d}[0], [x2], x6
// Four 16-byte index vectors (64 byte indices in total).
878 ld1 {v16.16b,v17.16b}, [x4], #32
879 ld1 {v18.16b,v19.16b}, [x4], #32
// Permute the first block into v24-v27 and the second into v28-v31.
880 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
881 tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
882 tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
883 tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
884 tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
885 tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
886 tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
887 tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
// Widened differences of the permuted bytes (low half, then high half).
888 usubl v4.8h, v24.8b, v28.8b
889 usubl2 v5.8h, v24.16b, v28.16b
890 usubl v6.8h, v25.8b, v29.8b
891 usubl2 v7.8h, v25.16b, v29.16b
892 usubl v16.8h, v26.8b, v30.8b
893 usubl2 v17.8h, v26.16b, v30.16b
894 usubl v18.8h, v27.8b, v31.8b
895 usubl2 v19.8h, v27.16b, v31.16b
// Lane-wise unsigned maximum over all eight difference vectors, into v20.
896 umax v20.8h, v4.8h, v5.8h
897 umax v21.8h, v6.8h, v7.8h
898 umax v22.8h, v16.8h, v17.8h
899 umax v23.8h, v18.8h, v19.8h
900 umax v20.8h, v20.8h, v21.8h
901 umax v21.8h, v22.8h, v23.8h
902 umax v20.8h, v20.8h, v21.8h
// Copy the rows of the first block to x7.
904 st1 {v0.d}[0], [x7], x6
905 st1 {v0.d}[1], [x7], x6
906 st1 {v1.d}[0], [x7], x6
907 st1 {v1.d}[1], [x7], x6
908 st1 {v2.d}[0], [x7], x6
909 st1 {v2.d}[1], [x7], x6
910 st1 {v3.d}[0], [x7], x6
912 st1 {v4.8h,v5.8h}, [x0], #32
913 st1 {v6.8h,v7.8h}, [x0], #32
914 st1 {v16.8h,v17.8h}, [x0], #32
915 st1 {v18.8h,v19.8h}, [x0]
// Byte-index table for the tbl shuffles in x264_zigzag_scan_8x8_field_neon.
// T(x,y) names the element in source vector x, lane y, and expands (via Z)
// to that element's two byte offsets. T is redefined for each pair of rows so
// that x is taken relative to the first register of the tbl window used for
// the corresponding output vector (v0, v1, v2, v3, v4, v5).
927 #define T(x,y) Z(x*8+y)
928 const scan8x8_field, align=5
929 .byte T(0,0), T(0,1), T(0,2), T(1,0)
930 .byte T(1,1), T(0,3), T(0,4), T(1,2)
931 .byte T(2,0), T(1,3), T(0,5), T(0,6)
932 .byte T(0,7), T(1,4), T(2,1), T(3,0)
934 #define T(x,y) Z((x-1)*8+y)
935 .byte T(2,2), T(1,5), T(1,6), T(1,7)
936 .byte T(2,3), T(3,1), T(4,0), T(3,2)
938 #define T(x,y) Z((x-2)*8+y)
939 .byte T(2,4), T(2,5), T(2,6), T(2,7)
940 .byte T(3,3), T(4,1), T(5,0), T(4,2)
942 #define T(x,y) Z((x-3)*8+y)
943 .byte T(3,4), T(3,5), T(3,6), T(3,7)
944 .byte T(4,3), T(5,1), T(6,0), T(5,2)
946 #define T(x,y) Z((x-4)*8+y)
947 .byte T(4,4), T(4,5), T(4,6), T(4,7)
948 .byte T(5,3), T(6,1), T(6,2), T(5,4)
950 #define T(x,y) Z((x-5)*8+y)
951 .byte T(5,5), T(5,6), T(5,7), T(6,3)
952 .byte T(7,0), T(7,1), T(6,4), T(6,5)
// 64-entry scan-order table (frame variant); the entries are listed in the
// same T(x,y) order as scan8x8_frame.
// NOTE(review): presumably the index table loaded by the "frame"
// instantiation of zigzag_sub8x8; the T() definition in effect here is not
// in the lines shown -- confirm against the full source.
958 const sub8x8_frame, align=5
959 .byte T(0,0), T(1,0), T(0,1), T(0,2)
960 .byte T(1,1), T(2,0), T(3,0), T(2,1)
961 .byte T(1,2), T(0,3), T(0,4), T(1,3)
962 .byte T(2,2), T(3,1), T(4,0), T(5,0)
963 .byte T(4,1), T(3,2), T(2,3), T(1,4)
964 .byte T(0,5), T(0,6), T(1,5), T(2,4)
965 .byte T(3,3), T(4,2), T(5,1), T(6,0)
966 .byte T(7,0), T(6,1), T(5,2), T(4,3)
967 .byte T(3,4), T(2,5), T(1,6), T(0,7)
968 .byte T(1,7), T(2,6), T(3,5), T(4,4)
969 .byte T(5,3), T(6,2), T(7,1), T(7,2)
970 .byte T(6,3), T(5,4), T(4,5), T(3,6)
971 .byte T(2,7), T(3,7), T(4,6), T(5,5)
972 .byte T(6,4), T(7,3), T(7,4), T(6,5)
973 .byte T(5,6), T(4,7), T(5,7), T(6,6)
974 .byte T(7,5), T(7,6), T(6,7), T(7,7)
// 64-entry scan-order table (field variant); the first 56 entries are listed
// in the same T(x,y) order as scan8x8_field.
// NOTE(review): presumably the index table loaded by the "field"
// instantiation of zigzag_sub8x8; the T() definition in effect here is not
// in the lines shown -- confirm against the full source.
977 const sub8x8_field, align=5
978 .byte T(0,0), T(0,1), T(0,2), T(1,0)
979 .byte T(1,1), T(0,3), T(0,4), T(1,2)
980 .byte T(2,0), T(1,3), T(0,5), T(0,6)
981 .byte T(0,7), T(1,4), T(2,1), T(3,0)
982 .byte T(2,2), T(1,5), T(1,6), T(1,7)
983 .byte T(2,3), T(3,1), T(4,0), T(3,2)
984 .byte T(2,4), T(2,5), T(2,6), T(2,7)
985 .byte T(3,3), T(4,1), T(5,0), T(4,2)
986 .byte T(3,4), T(3,5), T(3,6), T(3,7)
987 .byte T(4,3), T(5,1), T(6,0), T(5,2)
988 .byte T(4,4), T(4,5), T(4,6), T(4,7)
989 .byte T(5,3), T(6,1), T(6,2), T(5,4)
990 .byte T(5,5), T(5,6), T(5,7), T(6,3)
991 .byte T(7,0), T(7,1), T(6,4), T(6,5)
992 .byte T(6,6), T(6,7), T(7,2), T(7,3)
993 .byte T(7,4), T(7,5), T(7,6), T(7,7)