1 /****************************************************************************
2 * dct-a.S: aarch64 transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2009-2016 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// tbl byte-index tables for the 4x4 zigzag routines. Coefficients are
// 16 bit, so every coefficient k is selected by the byte pair (2k, 2k+1).
// scan4x4_frame: 32 byte indices into a 32-byte (two register) source.
29 const scan4x4_frame, align=4
30 .byte 0,1, 8,9, 2,3, 4,5
31 .byte 10,11, 16,17, 24,25, 18,19
32 .byte 12,13, 6,7, 14,15, 20,21
33 .byte 26,27, 28,29, 22,23, 30,31
// scan4x4_field: 16 byte indices; the field scan routine in this file only
// permutes the first register (coefficients 0-7) with it.
36 const scan4x4_field, align=4
37 .byte 0,1, 2,3, 8,9, 4,5
38 .byte 6,7, 10,11, 12,13, 14,15
// NOTE(review): the data rows of the two sub4x4 tables are not visible in
// this excerpt; presumably they are the tables selected by the
// zigzag_sub_4x4 macro - confirm against the full file.
41 const sub4x4_frame, align=4
48 const sub4x4_field, align=4
// Butterfly helper macros. Only the .macro header lines are visible in this
// excerpt, so the formulas in the existing comments are the only statement
// of their contract here. t0/t1 are presumably scratch registers that get
// clobbered - TODO confirm against the macro bodies.
55 // sum = a + (b>>shift) sub = (a>>shift) - b
56 .macro SUMSUB_SHR shift sum sub a b t0 t1
63 // sum = (a>>shift) + b sub = a - (b>>shift)
64 .macro SUMSUB_SHR2 shift sum sub a b t0 t1
71 // a += 1.5*ma b -= 1.5*mb
72 .macro SUMSUB_15 a b ma mb t0 t1
// In-place 4x4 transform of the sixteen 16-bit values at [x0]: the block is
// loaded as four 4h rows, run through two butterfly passes with transposes
// in between, and written back to the same address.
// SUMSUB_AB and transpose are macros defined outside this file view
// (presumably sum/difference and a 2-register interleaving transpose).
82 function x264_dct4x4dc_neon, export=1
83 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// First pass: two butterfly levels over the four rows.
85 SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
86 SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
87 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
88 SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
89 transpose v4.4h, v6.4h, v0.4h, v2.4h
90 transpose v5.4h, v7.4h, v1.4h, v3.4h
// Second pass: first butterfly level on the 16-bit-transposed data.
91 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
92 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
93 transpose v4.2s, v5.2s, v0.2s, v1.2s
94 transpose v6.2s, v7.2s, v2.2s, v3.2s
// Last butterfly level is fused with a halving: srhadd is a rounding
// halving add, shsub a truncating halving subtract. v31 is added to the
// minuend first; v31 is set outside the visible lines (presumably a vector
// of 1s so the subtract rounds like the add - TODO confirm).
95 add v16.4h, v4.4h, v31.4h
96 add v17.4h, v6.4h, v31.4h
97 srhadd v0.4h, v4.4h, v5.4h
98 shsub v1.4h, v16.4h, v5.4h
99 shsub v2.4h, v17.4h, v7.4h
100 srhadd v3.4h, v6.4h, v7.4h
101 st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// In-place 4x4 transform of the sixteen 16-bit values at [x0]. Same
// butterfly/transpose structure as x264_dct4x4dc_neon, but the final level
// uses plain SUMSUB_AB, i.e. no halving or rounding is applied.
105 function x264_idct4x4dc_neon, export=1
106 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// First pass: two butterfly levels over the four rows.
107 SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
108 SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
109 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
110 SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
111 transpose v4.4h, v6.4h, v0.4h, v2.4h
112 transpose v5.4h, v7.4h, v1.4h, v3.4h
// Second pass on the transposed data.
113 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
114 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
115 transpose v4.2s, v5.2s, v0.2s, v1.2s
116 transpose v6.2s, v7.2s, v2.2s, v3.2s
// Note the destination order v0,v1 / v3,v2: the results land so that
// v0..v3 can be stored as consecutive rows.
117 SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
118 SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
119 st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// One 1-D pass of the 4-point forward DCT. Callers in this file pass the
// four outputs as \v0..\v3 and the four inputs as \v4..\v7.
// Only the first two butterflies are visible in this excerpt; the rest of
// the body and the closing .endm are outside the view.
123 .macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
// \v6 and \v7 are both an input and the difference output, so the input
// registers are clobbered by the macro.
124 SUMSUB_AB \v1, \v6, \v5, \v6
125 SUMSUB_AB \v3, \v7, \v4, \v7
// 4x4 residual DCT: subtracts the 4x4 byte block at x2 from the one at x1
// and stores the transformed 16-bit result (4 rows of 4h) at x0.
// x3/x4 are the row strides for x1/x2; they are set in lines not visible
// here (presumably FENC_STRIDE / FDEC_STRIDE - TODO confirm).
134 function x264_sub4x4_dct_neon, export=1
// Each row is a 4-byte load into lane s[0]; usubl widens the unsigned byte
// difference to 16 bit. Loads and subtracts are interleaved.
137 ld1 {v0.s}[0], [x1], x3
138 ld1 {v1.s}[0], [x2], x4
139 ld1 {v2.s}[0], [x1], x3
140 usubl v16.8h, v0.8b, v1.8b
141 ld1 {v3.s}[0], [x2], x4
142 ld1 {v4.s}[0], [x1], x3
143 usubl v17.8h, v2.8b, v3.8b
144 ld1 {v5.s}[0], [x2], x4
145 ld1 {v6.s}[0], [x1], x3
146 usubl v18.8h, v4.8b, v5.8b
147 ld1 {v7.s}[0], [x2], x4
148 usubl v19.8h, v6.8b, v7.8b
// Only the low four 16-bit lanes of v16-v19 are used from here on.
150 DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
151 transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
152 DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
153 st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
// File-local helper (not exported): residual DCT of an 8-wide, 4-high
// region, i.e. two side-by-side 4x4 blocks processed in one set of 8h
// registers.
// In:  x0 = output coefficients, x1/x2 = the two pixel sources,
//      x3/x4 = their row strides (set by the caller).
// Out: 64 bytes written at x0; x0 is advanced by 64 and x1/x2 by four rows
//      (post-indexed accesses), which the callers rely on when they invoke
//      this helper back to back.
157 function x264_sub8x4_dct_neon
158 ld1 {v0.8b}, [x1], x3
159 ld1 {v1.8b}, [x2], x4
160 usubl v16.8h, v0.8b, v1.8b
161 ld1 {v2.8b}, [x1], x3
162 ld1 {v3.8b}, [x2], x4
163 usubl v17.8h, v2.8b, v3.8b
164 ld1 {v4.8b}, [x1], x3
165 ld1 {v5.8b}, [x2], x4
166 usubl v18.8h, v4.8b, v5.8b
167 ld1 {v6.8b}, [x1], x3
168 ld1 {v7.8b}, [x2], x4
169 usubl v19.8h, v6.8b, v7.8b
// First 1-D pass via the macro, then transpose both 4x4 halves.
171 DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
172 transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
// Second 1-D pass written out inline instead of using DCT_1D.
174 SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
175 SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
176 add v22.8h, v19.8h, v19.8h
177 add v21.8h, v18.8h, v18.8h
178 add v0.8h, v16.8h, v17.8h
179 sub v1.8h, v16.8h, v17.8h
// v2 = 2*v19 + v18, v3 = v19 - 2*v18
181 add v2.8h, v22.8h, v18.8h
182 sub v3.8h, v19.8h, v21.8h
// Regroup 64-bit halves: v4/v5 collect the low halves (first 4x4 block),
// v6/v7 the high halves (second 4x4 block).
184 zip1 v4.2d, v0.2d, v2.2d
185 zip2 v6.2d, v0.2d, v2.2d
186 zip1 v5.2d, v1.2d, v3.2d
187 zip2 v7.2d, v1.2d, v3.2d
189 st1 {v4.8h}, [x0], #16
190 st1 {v5.8h}, [x0], #16
191 st1 {v6.8h}, [x0], #16
192 st1 {v7.8h}, [x0], #16
// 8x8 residual DCT built from two invocations of the 8x4 helper, which
// advances x0/x1/x2 itself.
// NOTE(review): bl overwrites x30; the save/restore of the return address
// and the stride setup (x3/x4) are expected in lines not visible here.
196 function x264_sub8x8_dct_neon, export=1
200 bl x264_sub8x4_dct_neon
// Tail call: the helper returns straight to our caller.
202 b x264_sub8x4_dct_neon
// 16x16 residual DCT as eight calls of the 8x4 helper (two per 8x8
// quadrant). The helper advances x0 past its output and x1/x2 by four rows.
// NOTE(review): return-address handling, stride setup and the pointer
// adjustment between the second and third quadrant are in lines not
// visible in this excerpt.
205 function x264_sub16x16_dct_neon, export=1
209 bl x264_sub8x4_dct_neon
210 bl x264_sub8x4_dct_neon
// After two helper calls the sources are 8 rows down; step back up 8 rows
// and 8 pixels to the right (assuming x3/x4 equal FENC_/FDEC_STRIDE).
211 sub x1, x1, #8*FENC_STRIDE-8
212 sub x2, x2, #8*FDEC_STRIDE-8
213 bl x264_sub8x4_dct_neon
214 bl x264_sub8x4_dct_neon
217 bl x264_sub8x4_dct_neon
218 bl x264_sub8x4_dct_neon
219 sub x1, x1, #8*FENC_STRIDE-8
220 sub x2, x2, #8*FDEC_STRIDE-8
221 bl x264_sub8x4_dct_neon
// Tail call for the last 8x4 strip.
223 b x264_sub8x4_dct_neon
// One 1-D pass of an 8-point forward DCT on v0-v7 (8h), results back in
// v0-v7. Clobbers v16-v31.
// NOTE(review): the .macro header and .endm enclosing these lines are not
// visible in this excerpt; presumably this is the body of the 8-point
// forward DCT macro used by x264_sub8x8_dct8_neon.
// sNM / dNM in the comments are the sum / difference of inputs N and M.
228 SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34
229 SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25
230 SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16
231 SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07
233 SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2
234 SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3
// a6 = (d07 + d34) - 1.5*d16, a5 = (d07 - d34) - 1.5*d25
// (1.5*x is formed as x + (x >> 1)).
236 SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5
237 sshr v23.8h, v21.8h, #1
238 sshr v18.8h, v16.8h, #1
239 add v23.8h, v23.8h, v21.8h
240 add v18.8h, v18.8h, v16.8h
241 sub v30.8h, v30.8h, v23.8h
242 sub v29.8h, v29.8h, v18.8h
// a4 = (d16 + d25) + 1.5*d07 (ends up in v22),
// a7 = (d16 - d25) + 1.5*d34 (ends up in v31).
244 SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7
245 sshr v22.8h, v20.8h, #1
246 sshr v19.8h, v17.8h, #1
247 add v22.8h, v22.8h, v20.8h
248 add v19.8h, v19.8h, v17.8h
249 add v22.8h, v28.8h, v22.8h
250 add v31.8h, v31.8h, v19.8h
// Final combination into the eight outputs v0-v7.
252 SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h
253 SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h
254 SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h
255 SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h
// 8x8 residual with 8x8 transform: loads eight 8-byte rows from x1 and x2
// (row strides x3/x4, set in lines not visible here), forms the widened
// 16-bit differences in v0-v7 and stores 128 bytes of coefficients at x0.
// x0 is advanced by 128 and x1/x2 by eight rows.
// NOTE(review): only the transpose between the two transform passes is
// visible in this excerpt; the 1-D pass invocations themselves are
// presumably on the missing lines around it.
258 function x264_sub8x8_dct8_neon, export=1
261 ld1 {v16.8b}, [x1], x3
262 ld1 {v17.8b}, [x2], x4
263 ld1 {v18.8b}, [x1], x3
264 ld1 {v19.8b}, [x2], x4
265 usubl v0.8h, v16.8b, v17.8b
266 ld1 {v20.8b}, [x1], x3
267 ld1 {v21.8b}, [x2], x4
268 usubl v1.8h, v18.8b, v19.8b
269 ld1 {v22.8b}, [x1], x3
270 ld1 {v23.8b}, [x2], x4
271 usubl v2.8h, v20.8b, v21.8b
272 ld1 {v24.8b}, [x1], x3
273 ld1 {v25.8b}, [x2], x4
274 usubl v3.8h, v22.8b, v23.8b
275 ld1 {v26.8b}, [x1], x3
276 ld1 {v27.8b}, [x2], x4
277 usubl v4.8h, v24.8b, v25.8b
278 ld1 {v28.8b}, [x1], x3
279 ld1 {v29.8b}, [x2], x4
280 usubl v5.8h, v26.8b, v27.8b
281 ld1 {v30.8b}, [x1], x3
282 ld1 {v31.8b}, [x2], x4
283 usubl v6.8h, v28.8b, v29.8b
284 usubl v7.8h, v30.8b, v31.8b
// v30/v31 serve as scratch for the transpose.
287 transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
290 st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
291 st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
// 16x16 residual with 8x8 transforms: four calls of x264_sub8x8_dct8_neon,
// one per quadrant. The callee advances x0 by 128 bytes and x1/x2 by eight
// rows, so only the source pointers are adjusted between calls.
// NOTE(review): return-address save/restore and the pointer adjustment
// before the third call are in lines not visible in this excerpt.
295 function x264_sub16x16_dct8_neon, export=1
297 bl X(x264_sub8x8_dct8_neon)
// Back up eight rows and move eight pixels right (top-right quadrant),
// assuming the callee strides equal FENC_STRIDE / FDEC_STRIDE.
298 sub x1, x1, #FENC_STRIDE*8 - 8
299 sub x2, x2, #FDEC_STRIDE*8 - 8
300 bl X(x264_sub8x8_dct8_neon)
303 bl X(x264_sub8x8_dct8_neon)
305 sub x1, x1, #FENC_STRIDE*8 - 8
306 sub x2, x2, #FDEC_STRIDE*8 - 8
// Tail call for the last quadrant.
307 b X(x264_sub8x8_dct8_neon)
311 // First part of IDCT (minus final SUMSUB_BA)
// Outputs \d4..\d7, inputs \d0..\d3. Every caller in this file completes
// the pass with two SUMSUB_AB invocations on the outputs.
// Only the first butterfly is visible in this excerpt; the rest of the
// body and the closing .endm are outside the view.
312 .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
313 SUMSUB_AB \d4, \d5, \d0, \d2
// 4x4 inverse transform of the 16-bit coefficients at x1, added to the
// 4x4 pixel block at x0 (row stride x2, set in a line not visible here).
// Pixel-row loads are interleaved with the arithmetic.
320 function x264_add4x4_idct_neon, export=1
322 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
// First pass.
324 IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
325 ld1 {v28.s}[0], [x0], x2
326 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
327 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// From here on rows 2 and 3 live in v3 and v2 respectively (note the
// v3, v2 order in the transpose, the second IDCT_1D and the stores).
329 transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
// Second pass.
331 IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
332 ld1 {v29.s}[0], [x0], x2
333 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
334 SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// Rounding arithmetic shift right by 6.
336 srshr v0.4h, v0.4h, #6
337 srshr v1.4h, v1.4h, #6
// Pixel row 2 goes to v31 and row 3 to v30, matching the swapped v3/v2.
338 ld1 {v31.s}[0], [x0], x2
339 srshr v2.4h, v2.4h, #6
340 srshr v3.4h, v3.4h, #6
341 ld1 {v30.s}[0], [x0], x2
// Rewind x0 by the four rows just read, then add the pixels (widened).
343 sub x0, x0, x2, lsl #2
344 uaddw v0.8h, v0.8h, v28.8b
345 uaddw v1.8h, v1.8h, v29.8b
346 uaddw v2.8h, v2.8h, v30.8b
347 uaddw v3.8h, v3.8h, v31.8b
// NOTE(review): the narrowing back to bytes is presumably done on lines
// not visible here. Rows are stored in the order v0, v1, v3, v2.
353 st1 {v0.s}[0], [x0], x2
354 st1 {v1.s}[0], [x0], x2
355 st1 {v3.s}[0], [x0], x2
356 st1 {v2.s}[0], [x0], x2
// Inverse transform of two 4x4 coefficient blocks (64 bytes at x1) added
// to an 8-wide, 4-high pixel region at x0 with row stride x2.
// x1 is advanced by 64 bytes and x0 by four rows, so callers can invoke it
// repeatedly. x2 is set by the caller.
360 function x264_add8x4_idct_neon, export=1
361 ld1 {v0.8h,v1.8h}, [x1], #32
362 ld1 {v2.8h,v3.8h}, [x1], #32
// Pair up the 64-bit halves of the two blocks so each 8h register holds
// the same row of both blocks.
363 transpose v20.2d, v21.2d, v0.2d, v2.2d
364 transpose v22.2d, v23.2d, v1.2d, v3.2d
// First pass.
365 IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
366 SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
367 SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
369 transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
// Second pass.
371 IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
372 SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
373 SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
// Rounding shift right by 6, interleaved with the pixel-row loads.
375 srshr v0.8h, v0.8h, #6
376 ld1 {v28.8b}, [x0], x2
377 srshr v1.8h, v1.8h, #6
378 ld1 {v29.8b}, [x0], x2
379 srshr v2.8h, v2.8h, #6
380 ld1 {v30.8b}, [x0], x2
381 srshr v3.8h, v3.8h, #6
382 ld1 {v31.8b}, [x0], x2
// Rewind x0 by four rows and add the widened pixels.
384 sub x0, x0, x2, lsl #2
385 uaddw v0.8h, v0.8h, v28.8b
386 uaddw v1.8h, v1.8h, v29.8b
387 uaddw v2.8h, v2.8h, v30.8b
388 uaddw v3.8h, v3.8h, v31.8b
// NOTE(review): the narrowing to bytes is presumably on lines not visible
// in this excerpt.
392 st1 {v0.8b}, [x0], x2
394 st1 {v1.8b}, [x0], x2
396 st1 {v2.8b}, [x0], x2
397 st1 {v3.8b}, [x0], x2
// 8x8 inverse transform + add as two 8x4 calls; the callee advances x0 by
// four rows and x1 by 64 bytes.
// NOTE(review): stride setup (x2) and the save/restore of x30 around the
// bl are expected in lines not visible here.
401 function x264_add8x8_idct_neon, export=1
404 bl X(x264_add8x4_idct_neon)
// Tail call: the callee returns straight to our caller.
406 b X(x264_add8x4_idct_neon)
// 16x16 inverse transform + add as eight 8x4 calls (two per 8x8 quadrant).
// The callee advances x1 through the coefficients and x0 by four rows.
// NOTE(review): return-address handling, stride setup and the pointer
// adjustment between the second and third quadrant are in lines not
// visible in this excerpt.
409 function x264_add16x16_idct_neon, export=1
412 bl X(x264_add8x4_idct_neon)
413 bl X(x264_add8x4_idct_neon)
// Back up eight rows and move eight pixels right, assuming the callee
// stride x2 equals FDEC_STRIDE.
414 sub x0, x0, #8*FDEC_STRIDE-8
415 bl X(x264_add8x4_idct_neon)
416 bl X(x264_add8x4_idct_neon)
418 bl X(x264_add8x4_idct_neon)
419 bl X(x264_add8x4_idct_neon)
420 sub x0, x0, #8*FDEC_STRIDE-8
421 bl X(x264_add8x4_idct_neon)
// Tail call for the last 8x4 strip.
423 b X(x264_add8x4_idct_neon)
// One 1-D pass of an 8-point inverse DCT on v16-v23 (8h), results back in
// v16-v23. Clobbers v0-v3.
// NOTE(review): the .macro header and .endm enclosing these lines are not
// visible in this excerpt, and neither are any conditional directives that
// may surround the ld1 below.
427 SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2
// Loads the last two coefficient rows from x1 mid-computation.
429 ld1 {v22.8h,v23.8h}, [x1], #32
431 SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4
432 SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h
433 SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1
434 SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h
435 SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3
437 SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5
438 SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7
440 SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6
441 SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4
// Final butterflies producing the eight outputs in v16-v23.
443 SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h
444 SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h
445 SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h
446 SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h
// 8x8 inverse transform of the coefficients at x1, added to the 8x8 pixel
// block at x0 (row stride x2, set in a line not visible here).
// x0 ends up advanced by eight rows.
// NOTE(review): the 1-D pass invocations and the narrowing of the sums
// into v0-v7 are on lines not visible in this excerpt.
449 function x264_add8x8_idct8_neon, export=1
// Only six of the eight coefficient rows are loaded here; v22/v23 are
// loaded by the ld1 seen in the 8-point inverse DCT pass body.
451 ld1 {v16.8h,v17.8h}, [x1], #32
452 ld1 {v18.8h,v19.8h}, [x1], #32
453 ld1 {v20.8h,v21.8h}, [x1], #32
457 transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
// Rounding shift right by 6, interleaved with the pixel-row loads.
461 ld1 {v0.8b}, [x0], x2
462 srshr v16.8h, v16.8h, #6
463 ld1 {v1.8b}, [x0], x2
464 srshr v17.8h, v17.8h, #6
465 ld1 {v2.8b}, [x0], x2
466 srshr v18.8h, v18.8h, #6
467 ld1 {v3.8b}, [x0], x2
468 srshr v19.8h, v19.8h, #6
469 ld1 {v4.8b}, [x0], x2
470 srshr v20.8h, v20.8h, #6
471 ld1 {v5.8b}, [x0], x2
472 srshr v21.8h, v21.8h, #6
473 ld1 {v6.8b}, [x0], x2
474 srshr v22.8h, v22.8h, #6
475 ld1 {v7.8b}, [x0], x2
476 srshr v23.8h, v23.8h, #6
// Rewind x0 by the eight rows just read.
477 sub x0, x0, x2, lsl #3
// Add the widened pixels to the residual; stores are interleaved with the
// remaining adds.
479 uaddw v16.8h, v16.8h, v0.8b
480 uaddw v17.8h, v17.8h, v1.8b
481 uaddw v18.8h, v18.8h, v2.8b
485 uaddw v19.8h, v19.8h, v3.8b
486 st1 {v0.8b}, [x0], x2
487 uaddw v20.8h, v20.8h, v4.8b
488 st1 {v1.8b}, [x0], x2
489 uaddw v21.8h, v21.8h, v5.8b
490 st1 {v2.8b}, [x0], x2
493 uaddw v22.8h, v22.8h, v6.8b
494 uaddw v23.8h, v23.8h, v7.8b
495 st1 {v3.8b}, [x0], x2
497 st1 {v4.8b}, [x0], x2
500 st1 {v5.8b}, [x0], x2
501 st1 {v6.8b}, [x0], x2
502 st1 {v7.8b}, [x0], x2
// 16x16 inverse transform + add with 8x8 transforms: four calls of
// x264_add8x8_idct8_neon, one per quadrant. The callee advances x0 by
// eight rows and x1 through the coefficients.
// NOTE(review): return-address save/restore and the pointer adjustment
// before the third call are in lines not visible in this excerpt.
506 function x264_add16x16_idct8_neon, export=1
508 bl X(x264_add8x8_idct8_neon)
// Back up eight rows and move eight pixels right, assuming the callee
// stride equals FDEC_STRIDE.
509 sub x0, x0, #8*FDEC_STRIDE-8
510 bl X(x264_add8x8_idct8_neon)
512 bl X(x264_add8x8_idct8_neon)
513 sub x0, x0, #8*FDEC_STRIDE-8
// Tail call for the last quadrant.
515 b X(x264_add8x8_idct8_neon)
// Adds per-4x4-block DC offsets to the 8x8 pixel block at x0 (row stride
// x2). Signed offsets are applied to unsigned pixels as a saturating add
// of one vector followed by a saturating subtract of another.
// NOTE(review): the load of v16, the broadcast into v20-v23 and the
// creation of the subtrahends in v22/v23 are on lines not visible here;
// presumably v22/v23 hold the negated offsets at the narrowing step.
518 function x264_add8x8_idct_dc_neon, export=1
521 ld1 {v0.8b}, [x0], x2
// Rounding shift right by 6 of the four 16-bit values in v16.
522 srshr v16.4h, v16.4h, #6
523 ld1 {v1.8b}, [x0], x2
526 ld1 {v2.8b}, [x0], x2
529 ld1 {v3.8b}, [x0], x2
// v20 = low half of v20 : low half of v21 (offsets for the top row pair
// of blocks); v21 likewise from v22/v23 for the bottom pair.
530 trn1 v20.2d, v20.2d, v21.2d
531 ld1 {v4.8b}, [x0], x2
532 trn1 v21.2d, v22.2d, v23.2d
533 ld1 {v5.8b}, [x0], x2
535 ld1 {v6.8b}, [x0], x2
537 ld1 {v7.8b}, [x0], x2
// Rewind x0 by the eight rows read; this is only correct if x2 equals
// FDEC_STRIDE.
539 sub x0, x0, #8*FDEC_STRIDE
// Signed-to-unsigned saturating narrow: negative lanes become 0, so each
// vector keeps only its positive part.
541 sqxtun v20.8b, v20.8h
542 sqxtun v21.8b, v21.8h
543 sqxtun v22.8b, v22.8h
544 sqxtun v23.8b, v23.8h
// Rows 0-3 use v20/v22, rows 4-7 use v21/v23.
546 uqadd v0.8b, v0.8b, v20.8b
547 uqadd v1.8b, v1.8b, v20.8b
548 uqadd v2.8b, v2.8b, v20.8b
549 uqadd v3.8b, v3.8b, v20.8b
550 uqadd v4.8b, v4.8b, v21.8b
551 uqadd v5.8b, v5.8b, v21.8b
552 uqadd v6.8b, v6.8b, v21.8b
553 uqadd v7.8b, v7.8b, v21.8b
554 uqsub v0.8b, v0.8b, v22.8b
555 uqsub v1.8b, v1.8b, v22.8b
556 uqsub v2.8b, v2.8b, v22.8b
557 uqsub v3.8b, v3.8b, v22.8b
558 uqsub v4.8b, v4.8b, v23.8b
559 uqsub v5.8b, v5.8b, v23.8b
560 uqsub v6.8b, v6.8b, v23.8b
561 uqsub v7.8b, v7.8b, v23.8b
563 st1 {v0.8b}, [x0], x2
564 st1 {v1.8b}, [x0], x2
565 st1 {v2.8b}, [x0], x2
566 st1 {v3.8b}, [x0], x2
567 st1 {v4.8b}, [x0], x2
568 st1 {v5.8b}, [x0], x2
569 st1 {v6.8b}, [x0], x2
570 st1 {v7.8b}, [x0], x2
// Applies DC offsets to a 16-wide, 4-high pixel strip: rows are read
// through x0 and written through x2, both stepping by x3 per row.
// Clobbers v4-v7, v20, v21, v24-v27.
// NOTE(review): the lines that expand \dc into v24-v27 and create the
// subtrahends in v26/v27, as well as .endm, are not visible here.
574 .macro ADD16x4_IDCT_DC dc
575 ld1 {v4.16b}, [x0], x3
578 ld1 {v5.16b}, [x0], x3
581 ld1 {v6.16b}, [x0], x3
// Combine the low halves so v24/v25 each cover two 4-pixel-wide blocks.
582 trn1 v24.2d, v24.2d, v25.2d
583 ld1 {v7.16b}, [x0], x3
584 trn1 v25.2d, v26.2d, v27.2d
// Saturating narrow keeps only non-negative lanes: v20 is the 16-byte
// vector to add (from v24/v25), v21 the one to subtract (from v26/v27).
588 sqxtun v20.8b, v24.8h
589 sqxtun v21.8b, v26.8h
590 sqxtun2 v20.16b, v25.8h
591 sqxtun2 v21.16b, v27.8h
593 uqadd v4.16b, v4.16b, v20.16b
594 uqadd v5.16b, v5.16b, v20.16b
595 uqadd v6.16b, v6.16b, v20.16b
596 uqadd v7.16b, v7.16b, v20.16b
598 uqsub v4.16b, v4.16b, v21.16b
599 uqsub v5.16b, v5.16b, v21.16b
600 uqsub v6.16b, v6.16b, v21.16b
601 st1 {v4.16b}, [x2], x3
602 uqsub v7.16b, v7.16b, v21.16b
603 st1 {v5.16b}, [x2], x3
604 st1 {v6.16b}, [x2], x3
605 st1 {v7.16b}, [x2], x3
// DC-only inverse transform + add for a 16x16 block: loads sixteen 16-bit
// DC values from x1 and applies a rounding shift right by 6 to each.
// NOTE(review): the setup of x2/x3 and the ADD16x4_IDCT_DC invocations
// that consume v0-v3 are presumably on lines not visible in this excerpt.
608 function x264_add16x16_idct_dc_neon, export=1
612 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
613 srshr v0.4h, v0.4h, #6
614 srshr v1.4h, v1.4h, #6
617 srshr v2.4h, v2.4h, #6
619 srshr v3.4h, v3.4h, #6
// Column sums of the residual over an 8-wide, 4-high strip:
//   \dst.8h[c] = sum over the 4 rows of (x1 pixel - x2 pixel) in column c.
// Parameters are bare register names (v0, v16, ...); the arrangement
// suffix is appended inside the macro.
// x1/x2 are advanced by four rows (strides x3/x4). \t0-\t7 are clobbered.
// The closing .endm is outside the visible lines.
625 .macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
626 ld1 {\t0\().8b}, [x1], x3
627 ld1 {\t1\().8b}, [x2], x4
628 ld1 {\t2\().8b}, [x1], x3
629 ld1 {\t3\().8b}, [x2], x4
// \t0..\t3 are reused to hold the widened row differences.
630 usubl \t0\().8h, \t0\().8b, \t1\().8b
631 ld1 {\t4\().8b}, [x1], x3
632 ld1 {\t5\().8b}, [x2], x4
633 usubl \t1\().8h, \t2\().8b, \t3\().8b
634 ld1 {\t6\().8b}, [x1], x3
635 ld1 {\t7\().8b}, [x2], x4
636 add \dst\().8h, \t0\().8h, \t1\().8h
637 usubl \t2\().8h, \t4\().8b, \t5\().8b
638 usubl \t3\().8h, \t6\().8b, \t7\().8b
639 add \dst\().8h, \dst\().8h, \t2\().8h
640 add \dst\().8h, \dst\().8h, \t3\().8h
// DC-only transform of the 8x8 residual between the blocks at x1 and x2.
// NOTE(review): stride setup (x3/x4) and the final store of the result
// are on lines not visible in this excerpt.
643 function x264_sub8x8_dct_dc_neon, export=1
// v0 = column sums of rows 0-3, v1 = column sums of rows 4-7.
647 sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
648 sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
// Two butterfly levels on the 64-bit halves, i.e. across the four 4x4
// blocks, with the halves regrouped before and after each level.
650 transpose v2.2d, v3.2d, v0.2d, v1.2d
651 SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
652 transpose v2.2d, v3.2d, v0.2d, v1.2d
653 SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
654 transpose v2.2d, v3.2d, v0.2d, v1.2d
// Two pairwise adds fold each group of four column values into one lane.
656 addp v0.8h, v2.8h, v3.8h
657 addp v0.8h, v0.8h, v0.8h
// DC-only transform of the 8x16 residual between the blocks at x1 and x2.
// NOTE(review): stride setup (x3/x4) and the final pairwise add / store
// are on lines not visible in this excerpt.
663 function x264_sub8x16_dct_dc_neon, export=1
// v0-v3 = column sums of the four 8x4 strips, top to bottom. The
// temporaries are reused between invocations.
666 sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
667 sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
668 sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
669 sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31
// First pairwise reduction of adjacent columns.
671 addp v4.8h, v0.8h, v2.8h
672 addp v5.8h, v1.8h, v3.8h
// Three butterfly levels, regrouping at 32-bit and then 64-bit
// granularity between them.
674 transpose v2.4s, v3.4s, v4.4s, v5.4s
675 SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
677 transpose v2.4s, v3.4s, v0.4s, v1.4s
678 SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
680 transpose v2.2d, v3.2d, v0.2d, v1.2d
681 SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
// Note the swapped operand order in trn2: v3 = high(v1) : high(v0).
683 trn1 v2.2d, v0.2d, v1.2d
684 trn2 v3.2d, v1.2d, v0.2d
686 addp v0.8h, v2.8h, v3.8h
// De-interleaves the 64 16-bit coefficients at x1 into four 16-coefficient
// blocks at x0 (block k receives source coefficients k, k+4, k+8, ...) and
// writes one non-zero flag byte per block through x2.
// NOTE(review): the setup of x3 (address step between the second and third
// flag byte) and of v31 is on lines not visible in this excerpt. The
// compare and mask below assume v31 holds 1 in every 32-bit lane - confirm.
692 function x264_zigzag_interleave_8x8_cavlc_neon, export=1
// ld4 puts element i of each group of four into register i%4, so v0/v4
// hold block 0, v1/v5 block 1, v2/v6 block 2 and v3/v7 block 3.
695 ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
696 ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
// Unsigned maximum over each block: the result is non-zero iff any
// coefficient of the block is non-zero (negative values are large).
697 umax v16.8h, v0.8h, v4.8h
698 umax v17.8h, v1.8h, v5.8h
699 umax v18.8h, v2.8h, v6.8h
700 umax v19.8h, v3.8h, v7.8h
701 st1 {v0.8h}, [x0], #16
702 st1 {v4.8h}, [x0], #16
703 umaxp v16.8h, v16.8h, v17.8h
704 umaxp v18.8h, v18.8h, v19.8h
705 st1 {v1.8h}, [x0], #16
706 st1 {v5.8h}, [x0], #16
// After this fold each 32-bit lane k holds two partial maxima of block k.
707 umaxp v16.8h, v16.8h, v18.8h
708 st1 {v2.8h}, [x0], #16
709 st1 {v6.8h}, [x0], #16
// Unsigned >= rather than >: a lane whose value is exactly 1 (for example
// a block whose only non-zero coefficient is 1) must still be flagged.
710 cmhs v16.4s, v16.4s, v31.4s
711 st1 {v3.8h}, [x0], #16
// Reduce the all-ones compare mask to the flag value held in v31.
712 and v16.16b, v16.16b, v31.16b
713 st1 {v7.8h}, [x0], #16
// Flag bytes for blocks 0-3: two adjacent bytes, a step of x3, then two
// more adjacent bytes.
714 st1 {v16.b}[0], [x2], #1
715 st1 {v16.b}[4], [x2], x3
716 st1 {v16.b}[8], [x2], #1
717 st1 {v16.b}[12], [x2]
// Reorders the sixteen 16-bit coefficients at x1 into x0 using the
// scan4x4_frame byte-index table (one tbl per output register, both
// looking up in the full 32-byte input).
// The return instruction is outside the visible lines.
721 function x264_zigzag_scan_4x4_frame_neon, export=1
722 movrel x2, scan4x4_frame
723 ld1 {v0.16b,v1.16b}, [x1]
724 ld1 {v16.16b,v17.16b}, [x2]
725 tbl v2.16b, {v0.16b,v1.16b}, v16.16b
726 tbl v3.16b, {v0.16b,v1.16b}, v17.16b
727 st1 {v2.16b,v3.16b}, [x0]
// Generates x264_zigzag_sub_4x4<ac>_<f>_neon: gathers a 4x4 pixel block
// from x1 and one from x2, permutes both with the same tbl index vector
// (v16), and stores their widened difference (16 x 16 bit) at x0.
// The four rows read from x1 are also written out through x6 with the
// x2-side stride x4.
// NOTE(review): the setup of x9, x4, x6 and v16, the handling that depends
// on \ac, the return-value computation and the closing endfunc/.endm are
// on lines not visible in this excerpt. Presumably x6 is a copy of the
// incoming x2 - confirm.
731 .macro zigzag_sub_4x4 f ac
732 function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
// Gather four 4-byte rows of each block into one 16-byte register.
737 ld1 {v0.s}[0], [x1], x9
738 ld1 {v0.s}[1], [x1], x9
739 ld1 {v0.s}[2], [x1], x9
740 ld1 {v0.s}[3], [x1], x9
742 ld1 {v1.s}[0], [x2], x4
743 ld1 {v1.s}[1], [x2], x4
744 ld1 {v1.s}[2], [x2], x4
745 ld1 {v1.s}[3], [x2], x4
// Permute the pixels of both blocks, then subtract.
746 tbl v2.16b, {v0.16b}, v16.16b
747 tbl v3.16b, {v1.16b}, v16.16b
748 st1 {v0.s}[0], [x6], x4
749 usubl v4.8h, v2.8b, v3.8b
756 usubl2 v5.8h, v2.16b, v3.16b
757 st1 {v0.s}[1], [x6], x4
// Unsigned maximum of the differences, the start of a non-zero check
// (the rest of the reduction is not visible).
758 umax v6.8h, v4.8h, v5.8h
760 st1 {v0.s}[2], [x6], x4
762 st1 {v0.s}[3], [x6], x4
764 st1 {v4.8h,v5.8h}, [x0]
// Instantiations visible in this excerpt.
771 zigzag_sub_4x4 field, ac
773 zigzag_sub_4x4 frame, ac
// Field-scan reorder of the sixteen 16-bit coefficients at x1 into x0.
// Only the first register (coefficients 0-7) is permuted; v1 is stored
// unchanged.
// NOTE(review): the load of the index vector v16 (presumably from x2) and
// the return are on lines not visible in this excerpt.
775 function x264_zigzag_scan_4x4_field_neon, export=1
776 movrel x2, scan4x4_field
777 ld1 {v0.8h,v1.8h}, [x1]
779 tbl v0.16b, {v0.16b}, v16.16b
780 st1 {v0.8h,v1.8h}, [x0]
// Frame-scan reorder of the 64 16-bit coefficients at x1 into x0.
// A tbl can index at most four registers (64 bytes = 32 coefficients), so
// each output register is looked up in a 4-register window of the input
// and the few coefficients outside that window are inserted afterwards.
// The return instruction is outside the visible lines.
784 function x264_zigzag_scan_8x8_frame_neon, export=1
785 movrel x2, scan8x8_frame
// v0-v7 = the eight input rows of eight coefficients.
786 ld1 {v0.8h,v1.8h}, [x1], #32
787 ld1 {v2.8h,v3.8h}, [x1], #32
788 ld1 {v4.8h,v5.8h}, [x1], #32
789 ld1 {v6.8h,v7.8h}, [x1]
// v16-v23 = the eight 16-byte index vectors of the table.
790 ld1 {v16.16b,v17.16b}, [x2], #32
791 ld1 {v18.16b,v19.16b}, [x2], #32
792 ld1 {v20.16b,v21.16b}, [x2], #32
793 ld1 {v22.16b,v23.16b}, [x2], #32
// Windows: rows 0-3 for v24-v26 and v28, rows 3-6 for v27, rows 4-7 for
// v29-v31. Out-of-range indices produce 0 in tbl.
794 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
795 tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
796 tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
797 tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
798 tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
799 tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
800 tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
801 tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
// Patch in the coefficients that were outside the tbl window of their
// output register.
802 mov v25.h[6], v4.h[0]
803 mov v25.h[7], v5.h[0]
804 mov v26.h[0], v4.h[1]
805 mov v27.h[4], v7.h[0]
806 mov v28.h[7], v4.h[4]
807 mov v29.h[7], v3.h[6]
808 mov v30.h[0], v2.h[7]
809 mov v30.h[1], v3.h[7]
810 st1 {v24.8h,v25.8h}, [x0], #32
811 st1 {v26.8h,v27.8h}, [x0], #32
812 st1 {v28.8h,v29.8h}, [x0], #32
813 st1 {v30.8h,v31.8h}, [x0]
// Index table for x264_zigzag_scan_8x8_frame_neon.
// Z(z) expands to the two byte indices of 16-bit coefficient z.
// T(x,y) names the coefficient at row x, column y. T is redefined below so
// that the row is rebased to the first register of the tbl window used for
// the corresponding output register.
// NOTE(review): the #undef T lines that should precede each redefinition
// are not visible in this excerpt.
817 #define Z(z) 2*(z), 2*(z)+1
818 #define T(x,y) Z(x*8+y)
819 const scan8x8_frame, align=5
// Index vectors v16-v18: window starts at row 0. Entries for rows 4 and 5
// are out of range here and are patched in with mov by the function.
820 .byte T(0,0), T(1,0), T(0,1), T(0,2)
821 .byte T(1,1), T(2,0), T(3,0), T(2,1)
822 .byte T(1,2), T(0,3), T(0,4), T(1,3)
823 .byte T(2,2), T(3,1), T(4,0), T(5,0)
824 .byte T(4,1), T(3,2), T(2,3), T(1,4)
825 .byte T(0,5), T(0,6), T(1,5), T(2,4)
// Index vector v19: window starts at row 3.
827 #define T(x,y) Z((x-3)*8+y)
828 .byte T(3,3), T(4,2), T(5,1), T(6,0)
829 .byte T(7,0), T(6,1), T(5,2), T(4,3)
// Index vector v20: window starts at row 0.
831 #define T(x,y) Z((x-0)*8+y)
832 .byte T(3,4), T(2,5), T(1,6), T(0,7)
833 .byte T(1,7), T(2,6), T(3,5), T(4,4)
// Index vectors v21-v23: window starts at row 4.
835 #define T(x,y) Z((x-4)*8+y)
836 .byte T(5,3), T(6,2), T(7,1), T(7,2)
837 .byte T(6,3), T(5,4), T(4,5), T(3,6)
838 .byte T(2,7), T(3,7), T(4,6), T(5,5)
839 .byte T(6,4), T(7,3), T(7,4), T(6,5)
840 .byte T(5,6), T(4,7), T(5,7), T(6,6)
841 .byte T(7,5), T(7,6), T(6,7), T(7,7)
// Field-scan reorder of the 64 16-bit coefficients at x1 into x0, using a
// sliding tbl window over the input rows and the scan8x8_field table.
// NOTE(review): the load of the seventh index vector v22 and the return
// are on lines not visible in this excerpt.
844 function x264_zigzag_scan_8x8_field_neon, export=1
845 movrel x2, scan8x8_field
846 ld1 {v0.8h,v1.8h}, [x1], #32
847 ld1 {v2.8h,v3.8h}, [x1], #32
848 ld1 {v4.8h,v5.8h}, [x1], #32
849 ld1 {v6.8h,v7.8h}, [x1]
850 ld1 {v16.16b,v17.16b}, [x2], #32
851 ld1 {v18.16b,v19.16b}, [x2], #32
852 ld1 {v20.16b,v21.16b}, [x2], #32
// The last output register needs no table: rotate row 7 left by two
// coefficients here, then (below) shift in the last two of row 6, giving
// v31 = v6.h[6], v6.h[7], v7.h[2..7].
854 ext v31.16b, v7.16b, v7.16b, #4
// Each output register looks up in a window that starts one row later
// than the previous one (from the second tbl onwards).
855 tbl v24.16b, {v0.16b,v1.16b}, v16.16b
856 tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
857 tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
858 tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
859 tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
860 tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
861 tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
862 ext v31.16b, v6.16b, v31.16b, #12
863 st1 {v24.8h,v25.8h}, [x0], #32
864 st1 {v26.8h,v27.8h}, [x0], #32
865 st1 {v28.8h,v29.8h}, [x0], #32
866 st1 {v30.8h,v31.8h}, [x0]
// Generates x264_zigzag_sub_8x8_<f>_neon: gathers an 8x8 pixel block from
// x1 and one from x2, permutes both with the same 64-byte index table
// (read through x4), and stores their widened difference (64 x 16 bit) at
// x0. The rows read from x1 are also written out through x7 with the
// x2-side stride x6.
// NOTE(review): the setup of x4-x7, the loads/stores of the eighth row
// (v3.d[1], v7.d[1]), the final reduction for the return value and the
// closing endfunc/.endm are on lines not visible in this excerpt.
// Presumably x4 points at sub8x8_<f> and x7 is a copy of the incoming x2.
870 .macro zigzag_sub8x8 f
871 function x264_zigzag_sub_8x8_\f\()_neon, export=1
// Gather the x1 block, two 8-byte rows per register.
876 ld1 {v0.d}[0], [x1], x5
877 ld1 {v0.d}[1], [x1], x5
878 ld1 {v1.d}[0], [x1], x5
879 ld1 {v1.d}[1], [x1], x5
880 ld1 {v2.d}[0], [x1], x5
881 ld1 {v2.d}[1], [x1], x5
882 ld1 {v3.d}[0], [x1], x5
// Gather the x2 block the same way.
884 ld1 {v4.d}[0], [x2], x6
885 ld1 {v4.d}[1], [x2], x6
886 ld1 {v5.d}[0], [x2], x6
887 ld1 {v5.d}[1], [x2], x6
888 ld1 {v6.d}[0], [x2], x6
889 ld1 {v6.d}[1], [x2], x6
890 ld1 {v7.d}[0], [x2], x6
// 64 byte indices, one per pixel.
892 ld1 {v16.16b,v17.16b}, [x4], #32
893 ld1 {v18.16b,v19.16b}, [x4], #32
// Permute both blocks into scan order: v24-v27 from x1, v28-v31 from x2.
894 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
895 tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
896 tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
897 tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
898 tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
899 tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
900 tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
901 tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
// Widened differences. v4-v7 and v16-v19 are overwritten here, after
// their last use as inputs.
902 usubl v4.8h, v24.8b, v28.8b
903 usubl2 v5.8h, v24.16b, v28.16b
904 usubl v6.8h, v25.8b, v29.8b
905 usubl2 v7.8h, v25.16b, v29.16b
906 usubl v16.8h, v26.8b, v30.8b
907 usubl2 v17.8h, v26.16b, v30.16b
908 usubl v18.8h, v27.8b, v31.8b
909 usubl2 v19.8h, v27.16b, v31.16b
// Unsigned-maximum tree over all differences into v20: a non-zero lane
// means at least one coefficient is non-zero.
910 umax v20.8h, v4.8h, v5.8h
911 umax v21.8h, v6.8h, v7.8h
912 umax v22.8h, v16.8h, v17.8h
913 umax v23.8h, v18.8h, v19.8h
914 umax v20.8h, v20.8h, v21.8h
915 umax v21.8h, v22.8h, v23.8h
916 umax v20.8h, v20.8h, v21.8h
// Write the x1 rows out through x7.
918 st1 {v0.d}[0], [x7], x6
919 st1 {v0.d}[1], [x7], x6
920 st1 {v1.d}[0], [x7], x6
921 st1 {v1.d}[1], [x7], x6
922 st1 {v2.d}[0], [x7], x6
923 st1 {v2.d}[1], [x7], x6
924 st1 {v3.d}[0], [x7], x6
926 st1 {v4.8h,v5.8h}, [x0], #32
927 st1 {v6.8h,v7.8h}, [x0], #32
928 st1 {v16.8h,v17.8h}, [x0], #32
929 st1 {v18.8h,v19.8h}, [x0]
// Index table for x264_zigzag_scan_8x8_field_neon. Every 16-byte group
// (two .byte rows) is one tbl index vector. T(x,y) is rebased so that its
// row is relative to the first register of the tbl window used for that
// vector, which slides down by one row per redefinition.
// NOTE(review): the #undef T lines before each redefinition and the end of
// the table are not visible in this excerpt.
941 #define T(x,y) Z(x*8+y)
942 const scan8x8_field, align=5
// Index vectors v16 and v17: window starts at row 0.
943 .byte T(0,0), T(0,1), T(0,2), T(1,0)
944 .byte T(1,1), T(0,3), T(0,4), T(1,2)
945 .byte T(2,0), T(1,3), T(0,5), T(0,6)
946 .byte T(0,7), T(1,4), T(2,1), T(3,0)
// v18: window starts at row 1.
948 #define T(x,y) Z((x-1)*8+y)
949 .byte T(2,2), T(1,5), T(1,6), T(1,7)
950 .byte T(2,3), T(3,1), T(4,0), T(3,2)
// v19: window starts at row 2.
952 #define T(x,y) Z((x-2)*8+y)
953 .byte T(2,4), T(2,5), T(2,6), T(2,7)
954 .byte T(3,3), T(4,1), T(5,0), T(4,2)
// v20: window starts at row 3.
956 #define T(x,y) Z((x-3)*8+y)
957 .byte T(3,4), T(3,5), T(3,6), T(3,7)
958 .byte T(4,3), T(5,1), T(6,0), T(5,2)
// v21: window starts at row 4.
960 #define T(x,y) Z((x-4)*8+y)
961 .byte T(4,4), T(4,5), T(4,6), T(4,7)
962 .byte T(5,3), T(6,1), T(6,2), T(5,4)
// v22: window starts at row 5.
964 #define T(x,y) Z((x-5)*8+y)
965 .byte T(5,5), T(5,6), T(5,7), T(6,3)
966 .byte T(7,0), T(7,1), T(6,4), T(6,5)
// Frame-scan order for an 8x8 block as 64 T(row,col) entries, listed in
// the same order as scan8x8_frame.
// NOTE(review): T is presumably redefined on lines not visible here to
// emit one byte index per entry, because zigzag_sub8x8 permutes 8-bit
// pixels and loads exactly 64 bytes of indices - confirm.
972 const sub8x8_frame, align=5
973 .byte T(0,0), T(1,0), T(0,1), T(0,2)
974 .byte T(1,1), T(2,0), T(3,0), T(2,1)
975 .byte T(1,2), T(0,3), T(0,4), T(1,3)
976 .byte T(2,2), T(3,1), T(4,0), T(5,0)
977 .byte T(4,1), T(3,2), T(2,3), T(1,4)
978 .byte T(0,5), T(0,6), T(1,5), T(2,4)
979 .byte T(3,3), T(4,2), T(5,1), T(6,0)
980 .byte T(7,0), T(6,1), T(5,2), T(4,3)
981 .byte T(3,4), T(2,5), T(1,6), T(0,7)
982 .byte T(1,7), T(2,6), T(3,5), T(4,4)
983 .byte T(5,3), T(6,2), T(7,1), T(7,2)
984 .byte T(6,3), T(5,4), T(4,5), T(3,6)
985 .byte T(2,7), T(3,7), T(4,6), T(5,5)
986 .byte T(6,4), T(7,3), T(7,4), T(6,5)
987 .byte T(5,6), T(4,7), T(5,7), T(6,6)
988 .byte T(7,5), T(7,6), T(6,7), T(7,7)
// Field-scan order for an 8x8 block as 64 T(row,col) entries; the first 56
// entries match the visible part of scan8x8_field.
// NOTE(review): uses whatever definition of T is in effect for
// sub8x8_frame; that definition is not visible in this excerpt
// (presumably one byte index per entry - confirm).
991 const sub8x8_field, align=5
992 .byte T(0,0), T(0,1), T(0,2), T(1,0)
993 .byte T(1,1), T(0,3), T(0,4), T(1,2)
994 .byte T(2,0), T(1,3), T(0,5), T(0,6)
995 .byte T(0,7), T(1,4), T(2,1), T(3,0)
996 .byte T(2,2), T(1,5), T(1,6), T(1,7)
997 .byte T(2,3), T(3,1), T(4,0), T(3,2)
998 .byte T(2,4), T(2,5), T(2,6), T(2,7)
999 .byte T(3,3), T(4,1), T(5,0), T(4,2)
1000 .byte T(3,4), T(3,5), T(3,6), T(3,7)
1001 .byte T(4,3), T(5,1), T(6,0), T(5,2)
1002 .byte T(4,4), T(4,5), T(4,6), T(4,7)
1003 .byte T(5,3), T(6,1), T(6,2), T(5,4)
1004 .byte T(5,5), T(5,6), T(5,7), T(6,3)
1005 .byte T(7,0), T(7,1), T(6,4), T(6,5)
1006 .byte T(6,6), T(6,7), T(7,2), T(7,3)
1007 .byte T(7,4), T(7,5), T(7,6), T(7,7)