1 /*****************************************************************************
2 * predict.S: arm intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2016 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
8 * Martin Storsjo <martin@martin.st>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
@ p16weight: constant row of u16 weights {1..8}; multiplied against
@ edge-pixel differences when computing the H/V slopes in the plane
@ ("_p") prediction functions below.
33 p16weight: .short 1,2,3,4,5,6,7,8
@ ldcol.8: gather a vertical column of bytes into the lanes of \rd.
@   \rs = source pointer, advanced by stride \rt after each load
@   \n  = byte count (8 or 4); \hi selects the low (0) or high (1)
@         half of \rd when \n == 4.
@ NOTE(review): the matching .endif directives and the closing .endm
@ are missing from this excerpt (embedded numbering jumps 42->44 and
@ 48->52) — restore them from the upstream file before assembling.
37 .macro ldcol.8 rd, rs, rt, n=8, hi=0
38 .if \n == 8 || \hi == 0
39 vld1.8 {\rd[0]}, [\rs], \rt
40 vld1.8 {\rd[1]}, [\rs], \rt
41 vld1.8 {\rd[2]}, [\rs], \rt
42 vld1.8 {\rd[3]}, [\rs], \rt
44 .if \n == 8 || \hi == 1
45 vld1.8 {\rd[4]}, [\rs], \rt
46 vld1.8 {\rd[5]}, [\rs], \rt
47 vld1.8 {\rd[6]}, [\rs], \rt
48 vld1.8 {\rd[7]}, [\rs], \rt
@ ldcol.16: gather a 16-byte vertical column into \rd1 (rows 0-7) and
@ \rd2 (rows 8-15). \ru is a scratch pointer initialised to
@ \rs + 8*\rt so the two halves load interleaved; both pointers
@ advance by the stride \rt on every load.
@ NOTE(review): the closing .endm (orig. line 70) is missing from
@ this excerpt.
52 .macro ldcol.16 rd1, rd2, rs, rt, ru
53 add \ru, \rs, \rt, lsl #3
54 vld1.8 {\rd1[0]}, [\rs], \rt
55 vld1.8 {\rd2[0]}, [\ru], \rt
56 vld1.8 {\rd1[1]}, [\rs], \rt
57 vld1.8 {\rd2[1]}, [\ru], \rt
58 vld1.8 {\rd1[2]}, [\rs], \rt
59 vld1.8 {\rd2[2]}, [\ru], \rt
60 vld1.8 {\rd1[3]}, [\rs], \rt
61 vld1.8 {\rd2[3]}, [\ru], \rt
62 vld1.8 {\rd1[4]}, [\rs], \rt
63 vld1.8 {\rd2[4]}, [\ru], \rt
64 vld1.8 {\rd1[5]}, [\rs], \rt
65 vld1.8 {\rd2[5]}, [\ru], \rt
66 vld1.8 {\rd1[6]}, [\rs], \rt
67 vld1.8 {\rd2[6]}, [\ru], \rt
68 vld1.8 {\rd1[7]}, [\rs], \rt
69 vld1.8 {\rd2[7]}, [\ru], \rt
@ add16x8: sum 16 bytes. Widens \rl/\rh (8 u8 lanes each) into \dq
@ (8 u16 lanes), adds the two d halves, then two pairwise adds fold
@ the grand total into every 16-bit lane of \dl. Used by the DC
@ predictors to total the neighbour pixels.
@ NOTE(review): the closing .endm (orig. line 77) is missing from
@ this excerpt.
72 .macro add16x8 dq, dl, dh, rl, rh
73 vaddl.u8 \dq, \rl, \rh
74 vadd.u16 \dl, \dl, \dh
75 vpadd.u16 \dl, \dl, \dl
76 vpadd.u16 \dl, \dl, \dl
@ x264_predict_4x4_h: 4x4 horizontal intra prediction using ARMv6
@ core registers. r0 points at the destination block; the byte at
@ [r0 + row*FDEC_STRIDE - 1] is that row's left neighbour. Each left
@ byte x is expanded to x*0x01010101 with two shift-accumulate adds
@ (the "free shift" mentioned below) and stored as the whole row.
@ NOTE(review): the function's return (bx lr) and end marker are not
@ visible in this excerpt (numbering stops at 97).
80 // because gcc doesn't believe in using the free shift in add
81 function x264_predict_4x4_h_armv6
82 ldrb r1, [r0, #0*FDEC_STRIDE-1]
83 ldrb r2, [r0, #1*FDEC_STRIDE-1]
84 ldrb r3, [r0, #2*FDEC_STRIDE-1]
85 ldrb ip, [r0, #3*FDEC_STRIDE-1]
86 add r1, r1, r1, lsl #8 @ x -> x | x<<8
87 add r2, r2, r2, lsl #8
88 add r3, r3, r3, lsl #8
89 add ip, ip, ip, lsl #8
90 add r1, r1, r1, lsl #16 @ xx -> xxxx (4 copies of the byte)
91 str r1, [r0, #0*FDEC_STRIDE]
92 add r2, r2, r2, lsl #16
93 str r2, [r0, #1*FDEC_STRIDE]
94 add r3, r3, r3, lsl #16
95 str r3, [r0, #2*FDEC_STRIDE]
96 add ip, ip, ip, lsl #16
97 str ip, [r0, #3*FDEC_STRIDE]
@ x264_predict_4x4_v: 4x4 vertical prediction — the 4 neighbour bytes
@ directly above the block (one 32-bit load) are copied unchanged
@ into all 4 destination rows.
@ NOTE(review): the return (bx lr) and end marker are not visible in
@ this excerpt.
101 function x264_predict_4x4_v_armv6
102 ldr r1, [r0, #0 - 1 * FDEC_STRIDE]
103 str r1, [r0, #0 + 0 * FDEC_STRIDE]
104 str r1, [r0, #0 + 1 * FDEC_STRIDE]
105 str r1, [r0, #0 + 2 * FDEC_STRIDE]
106 str r1, [r0, #0 + 3 * FDEC_STRIDE]
@ x264_predict_4x4_dc: 4x4 DC prediction — averages the 4 top and 4
@ left neighbour pixels and broadcasts the rounded mean into every
@ byte of the block (mean smeared byte->word, stored per row).
@ NOTE(review): the interior accumulation lines are missing from this
@ excerpt (embedded numbering jumps 114->117->119->124), so the
@ visible instructions alone do not compute the DC value — recover
@ the missing lines from upstream before assembling.
110 function x264_predict_4x4_dc_armv6
112 ldr r1, [r0, #-FDEC_STRIDE] @ 4 top neighbour bytes packed in r1
113 ldrb r2, [r0, #0*FDEC_STRIDE-1]
114 ldrb r3, [r0, #1*FDEC_STRIDE-1]
117 ldrb ip, [r0, #2*FDEC_STRIDE-1]
119 ldrb r3, [r0, #3*FDEC_STRIDE-1]
124 add r1, r1, r1, lsl #8 @ smear DC byte into all 4 byte lanes
125 add r1, r1, r1, lsl #16
126 str r1, [r0, #0*FDEC_STRIDE]
127 str r1, [r0, #1*FDEC_STRIDE]
128 str r1, [r0, #2*FDEC_STRIDE]
129 str r1, [r0, #3*FDEC_STRIDE]
@ x264_predict_4x4_dc_top: 4x4 DC prediction from the top row only
@ (NEON). Loads the 4 bytes above the block, and stores the broadcast
@ DC word into the 4 destination rows with stride r12 = FDEC_STRIDE.
@ NOTE(review): the reduction/rounding instructions between the load
@ and the stores (orig. lines 137-140) are missing from this excerpt.
133 function x264_predict_4x4_dc_top_neon
134 mov r12, #FDEC_STRIDE
135 sub r1, r0, #FDEC_STRIDE
136 vld1.32 d1[], [r1,:32] @ replicate the 4 top bytes into all lanes
141 vst1.32 d1[0], [r0,:32], r12
142 vst1.32 d1[0], [r0,:32], r12
143 vst1.32 d1[0], [r0,:32], r12
144 vst1.32 d1[0], [r0,:32], r12
148 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
@ PRED4x4_LOWPASS: 3-tap {1,2,1} low-pass filter on packed bytes, the
@ core of the diagonal 4x4 predictors; \pb_1 is a scratch register.
@ NOTE(review): the macro body and .endm (orig. lines 150-161) are
@ entirely missing from this excerpt.
149 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
@ x264_predict_4x4_ddr: 4x4 diagonal-down-right prediction (ARMv6).
@ Packs the top-left, top and left neighbours into a byte pipeline
@ (r1..r6), low-pass filters it with PRED4x4_LOWPASS, then stores
@ shifted views of the filtered diagonal, one word per row.
@ NOTE(review): missing from this excerpt: the save of r4-r6 before
@ they are clobbered (no push is visible), part of the shuffle
@ between the filter and the stores (numbering jumps 177->182), and
@ the pop/return — recover from upstream before assembling.
162 function x264_predict_4x4_ddr_armv6
163 ldr r1, [r0, # -FDEC_STRIDE] @ 4 top bytes
164 ldrb r2, [r0, # -FDEC_STRIDE-1] @ top-left corner
165 ldrb r3, [r0, #0*FDEC_STRIDE-1] @ left column, row 0
167 add r2, r2, r1, lsl #8
168 ldrb r4, [r0, #1*FDEC_STRIDE-1]
169 add r3, r3, r2, lsl #8
170 ldrb r5, [r0, #2*FDEC_STRIDE-1]
171 ldrb r6, [r0, #3*FDEC_STRIDE-1]
172 add r4, r4, r3, lsl #8
173 add r5, r5, r4, lsl #8
174 add r6, r6, r5, lsl #8
176 PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
177 str r1, [r0, #0*FDEC_STRIDE]
182 add r2, r2, r4, lsr #24
183 str r2, [r0, #1*FDEC_STRIDE]
184 add r3, r3, r4, lsr #16
185 str r3, [r0, #2*FDEC_STRIDE]
186 add r5, r5, r4, lsr #8
187 str r5, [r0, #3*FDEC_STRIDE]
@ x264_predict_4x4_ddl: 4x4 diagonal-down-left prediction (NEON).
@ Filters the top/top-right neighbour bytes with the {1,2,1} low-pass,
@ then stores byte-shifted views (vext #1..#3) of the filtered vector
@ as the four rows; ip is presumably FDEC_STRIDE — set in lines not
@ visible here.
@ NOTE(review): the setup between the load and the stores (orig.
@ lines 192-199, including the low-pass computation itself) is only
@ partially present in this excerpt.
191 function x264_predict_4x4_ddl_neon
194 vld1.64 {d0}, [r0], ip
196 vext.8 d1, d0, d0, #1
197 vext.8 d2, d0, d3, #2
200 vst1.32 {d0[0]}, [r0,:32], ip
201 vext.8 d1, d0, d0, #1
202 vext.8 d2, d0, d0, #2
203 vst1.32 {d1[0]}, [r0,:32], ip
204 vext.8 d3, d0, d0, #3
205 vst1.32 {d2[0]}, [r0,:32], ip
206 vst1.32 {d3[0]}, [r0,:32], ip
@ x264_predict_8x8_dc: 8x8 DC prediction. r1 points at the packed
@ edge array (left + top neighbours prepared by the caller). The
@ usada8 instructions accumulate byte sums — presumably against a
@ zeroed ip so each adds 4 edge bytes into r2/r3; confirm upstream,
@ as the instruction that zeroes ip is not visible here.
@ NOTE(review): most of this function (edge loads, averaging,
@ broadcast, remaining store iterations) is missing from this excerpt.
210 function x264_predict_8x8_dc_neon
212 ldrd r2, r3, [r1, #8]
214 ldrd r4, r5, [r1, #16]
219 usada8 r2, r4, ip, r2
221 usada8 r3, r5, ip, r3
229 vst1.64 {d0}, [r0,:64], ip @ store one 8-byte row of the DC value
@ x264_predict_8x8_h: 8x8 horizontal prediction (NEON). Each d0..d7
@ presumably holds one left-neighbour byte replicated 8 times
@ (the vld1.8 {dN[]} loads are not visible in this excerpt); each is
@ stored as one destination row with stride ip.
@ NOTE(review): the interleaved load instructions (odd original line
@ numbers 235..251) are missing from this excerpt.
234 function x264_predict_8x8_h_neon
240 vst1.64 {d0}, [r0,:64], ip
242 vst1.64 {d1}, [r0,:64], ip
244 vst1.64 {d2}, [r0,:64], ip
246 vst1.64 {d3}, [r0,:64], ip
248 vst1.64 {d4}, [r0,:64], ip
250 vst1.64 {d5}, [r0,:64], ip
252 vst1.64 {d6}, [r0,:64], ip
253 vst1.64 {d7}, [r0,:64], ip
@ x264_predict_8x8_v: 8x8 vertical prediction — the 8 bytes above the
@ block (loaded from r1) are stored into each destination row.
@ NOTE(review): the initialisation of r1, the repetition of the store
@ for all 8 rows, and the return are missing from this excerpt (only
@ one store instruction of the loop body is visible).
257 function x264_predict_8x8_v_neon
259 mov r12, #FDEC_STRIDE
260 vld1.8 {d0}, [r1,:64]
262 vst1.8 {d0}, [r0,:64], r12
@ x264_predict_8x8_ddl: 8x8 diagonal-down-left prediction (NEON).
@ Loads 16 edge bytes, builds shifted copies (vext) for the {1,2,1}
@ low-pass filter, then stores successive one-byte-shifted views of
@ the filtered diagonal as the 8 rows.
@ NOTE(review): the low-pass arithmetic between the vext setup and
@ the store sequence (orig. lines 270-271, 274, 276) is missing from
@ this excerpt — the visible code does not produce the filtered d0/d1
@ on its own.
267 function x264_predict_8x8_ddl_neon
269 vld1.8 {d0, d1}, [r1,:128]
272 vext.8 q8, q3, q0, #15 @ edge shifted left by one byte
273 vext.8 q2, q0, q1, #1 @ edge shifted right by one byte
275 mov r12, #FDEC_STRIDE
277 vext.8 d2, d0, d1, #1
278 vext.8 d3, d0, d1, #2
279 vst1.8 d2, [r0,:64], r12
280 vext.8 d2, d0, d1, #3
281 vst1.8 d3, [r0,:64], r12
282 vext.8 d3, d0, d1, #4
283 vst1.8 d2, [r0,:64], r12
284 vext.8 d2, d0, d1, #5
285 vst1.8 d3, [r0,:64], r12
286 vext.8 d3, d0, d1, #6
287 vst1.8 d2, [r0,:64], r12
288 vext.8 d2, d0, d1, #7
289 vst1.8 d3, [r0,:64], r12
290 vst1.8 d2, [r0,:64], r12
291 vst1.8 d1, [r0,:64], r12
@ x264_predict_8x8_ddr: 8x8 diagonal-down-right prediction (NEON).
@ Loads 32 edge bytes, builds +/-1-byte shifted copies for the
@ {1,2,1} low-pass, then writes the rows bottom-up (negative stride
@ r12) as successively shifted views of the filtered diagonal.
@ NOTE(review): the low-pass arithmetic between the vext setup and
@ the stores (orig. lines 299-303) is missing from this excerpt.
295 function x264_predict_8x8_ddr_neon
296 vld1.8 {d0-d3}, [r1,:128]
297 vext.8 q2, q0, q1, #7
298 vext.8 q3, q0, q1, #9
304 add r0, #7*FDEC_STRIDE @ start at the last row
305 mov r12, #-1*FDEC_STRIDE @ walk upwards
307 vext.8 d2, d0, d1, #1
308 vst1.8 {d0}, [r0,:64], r12
309 vext.8 d4, d0, d1, #2
310 vst1.8 {d2}, [r0,:64], r12
311 vext.8 d5, d0, d1, #3
312 vst1.8 {d4}, [r0,:64], r12
313 vext.8 d4, d0, d1, #4
314 vst1.8 {d5}, [r0,:64], r12
315 vext.8 d5, d0, d1, #5
316 vst1.8 {d4}, [r0,:64], r12
317 vext.8 d4, d0, d1, #6
318 vst1.8 {d5}, [r0,:64], r12
319 vext.8 d5, d0, d1, #7
320 vst1.8 {d4}, [r0,:64], r12
321 vst1.8 {d5}, [r0,:64], r12
@ x264_predict_8x8_vl: 8x8 vertical-left prediction (NEON). Even rows
@ come from the averaged edge (d6/d7), odd rows from the low-passed
@ edge (d0/d1), each shifted one byte further per row pair.
@ NOTE(review): the averaging/low-pass computation that produces
@ d0/d1 and d6/d7 (orig. lines 332-337) is missing from this excerpt.
325 function x264_predict_8x8_vl_neon
327 mov r12, #FDEC_STRIDE
329 vld1.8 {d0, d1}, [r1,:128]
330 vext.8 q1, q1, q0, #15
331 vext.8 q2, q0, q2, #1
338 vext.8 d2, d0, d1, #1
339 vst1.8 {d6}, [r0,:64], r12
340 vext.8 d3, d6, d7, #1
341 vst1.8 {d2}, [r0,:64], r12
342 vext.8 d2, d0, d1, #2
343 vst1.8 {d3}, [r0,:64], r12
344 vext.8 d3, d6, d7, #2
345 vst1.8 {d2}, [r0,:64], r12
346 vext.8 d2, d0, d1, #3
347 vst1.8 {d3}, [r0,:64], r12
348 vext.8 d3, d6, d7, #3
349 vst1.8 {d2}, [r0,:64], r12
350 vext.8 d2, d0, d1, #4
351 vst1.8 {d3}, [r0,:64], r12
352 vst1.8 {d2}, [r0,:64], r12
@ x264_predict_8x8_vr: 8x8 vertical-right prediction (NEON). Rows
@ alternate between the averaged (d5/d0) and low-passed (d1/d2)
@ edge vectors, each successive row pair shifted one byte (vext #7
@ down to #5) to follow the diagonal.
@ NOTE(review): the averaging/low-pass arithmetic that produces
@ d0..d5 (orig. lines 363-369, 371) is missing from this excerpt.
356 function x264_predict_8x8_vr_neon
358 mov r12, #FDEC_STRIDE
359 vld1.8 {d4,d5}, [r1,:64]
361 vext.8 q1, q2, q2, #14
362 vext.8 q0, q2, q2, #15
370 vst1.8 {d5}, [r0,:64], r12
372 vst1.8 {d1}, [r0,:64], r12
373 vext.8 d6, d0, d5, #7
374 vext.8 d3, d2, d1, #7
375 vst1.8 {d6}, [r0,:64], r12
376 vst1.8 {d3}, [r0,:64], r12
377 vext.8 d6, d0, d5, #6
378 vext.8 d3, d2, d1, #6
379 vst1.8 {d6}, [r0,:64], r12
380 vst1.8 {d3}, [r0,:64], r12
381 vext.8 d6, d0, d5, #5
382 vext.8 d3, d2, d1, #5
383 vst1.8 {d6}, [r0,:64], r12
384 vst1.8 {d3}, [r0,:64], r12
@ x264_predict_8x8_hd: 8x8 horizontal-down prediction (NEON). Stores
@ rows as two-byte-shifted views (vext #6,#4,#2) of the interleaved
@ average/low-pass vectors d0/d1 and d16.
@ NOTE(review): the edge load and the filtering/zip that produce
@ d0, d1 and d16 (orig. lines 390-392 and 395-402) are missing from
@ this excerpt.
388 function x264_predict_8x8_hd_neon
389 mov r12, #FDEC_STRIDE
393 vext.8 q3, q1, q1, #1
394 vext.8 q2, q1, q1, #2
403 vext.8 d2, d0, d1, #6
404 vext.8 d3, d0, d1, #4
405 vst1.8 {d2}, [r0,:64], r12
406 vext.8 d2, d0, d1, #2
407 vst1.8 {d3}, [r0,:64], r12
408 vst1.8 {d2}, [r0,:64], r12
409 vext.8 d2, d16, d0, #6
410 vst1.8 {d0}, [r0,:64], r12
411 vext.8 d3, d16, d0, #4
412 vst1.8 {d2}, [r0,:64], r12
413 vext.8 d2, d16, d0, #2
414 vst1.8 {d3}, [r0,:64], r12
415 vst1.8 {d2}, [r0,:64], r12
416 vst1.8 {d16}, [r0,:64], r12
@ x264_predict_8x8_hu: 8x8 horizontal-up prediction (NEON). The
@ averaged (d0/d4/d6/d16) and low-passed (d1/d5/d7/d17) edge vectors
@ are stored as interleaved rows, each shifted 2 bytes further along
@ the diagonal (vext #2/#4/#6).
@ NOTE(review): the edge load, the zip/interleave, and most of the
@ filter arithmetic (orig. lines 423-427, 430-432, 434-438, 446) are
@ missing from this excerpt.
421 function x264_predict_8x8_hu_neon
422 mov r12, #FDEC_STRIDE
428 vext.8 d4, d7, d6, #2
429 vext.8 d2, d7, d6, #1
433 vrhadd.u8 d1, d16, d2 @ rounding average for the filtered row
439 vext.8 q2, q0, q1, #2
440 vext.8 q3, q0, q1, #4
441 vext.8 q8, q0, q1, #6
442 vst1.8 {d0}, [r0,:64], r12
443 vst1.8 {d4}, [r0,:64], r12
444 vst1.8 {d6}, [r0,:64], r12
445 vst1.8 {d16}, [r0,:64], r12
447 vst1.8 {d1}, [r0,:64], r12
448 vst1.8 {d5}, [r0,:64], r12
449 vst1.8 {d7}, [r0,:64], r12
450 vst1.8 {d17}, [r0,:64]
@ x264_predict_8x8c_dc_top: 8x8 chroma DC prediction from the top row
@ only. Loads the 8 bytes above the block; vrshrn by 2 turns each
@ 4-pixel sum into a rounded average (two DC values, left/right half).
@ NOTE(review): the pairwise-add that forms the 4-pixel sums, the
@ broadcast, and the store loop are missing from this excerpt.
454 function x264_predict_8x8c_dc_top_neon
455 sub r2, r0, #FDEC_STRIDE
457 vld1.8 {d0}, [r2,:64]
460 vrshrn.u16 d0, q0, #2
@ x264_predict_8x8c_dc_left: 8x8 chroma DC prediction from the left
@ column only; vrshrn by 2 produces the rounded 4-pixel averages.
@ NOTE(review): everything except the function label and this one
@ narrowing shift (orig. lines 468-472 and 474+: column load, sums,
@ broadcast, stores) is missing from this excerpt.
467 function x264_predict_8x8c_dc_left_neon
473 vrshrn.u16 d0, q0, #2
@ x264_predict_8x8c_dc: full 8x8 chroma DC prediction. Computes the
@ four H.264 chroma DC values (corners use top+left sums >> 3 via
@ vrshrn #3, edges use single-side sums >> 2 via vrshrn #2) and
@ stores d0/d1 as the top/bottom 4-row halves through two pointers.
@ NOTE(review): the left-column load, the pairwise sums, the
@ transpose/arrangement into d0/d1, and the store-loop repetition
@ (orig. lines 481, 483-488, 491-496, 498, 501+) are missing here.
479 function x264_predict_8x8c_dc_neon
480 sub r2, r0, #FDEC_STRIDE
482 vld1.8 {d0}, [r2,:64]
489 vrshrn.u16 d2, q0, #3 @ (top+left sums + 4) >> 3
490 vrshrn.u16 d3, q0, #2 @ (single-side sums + 2) >> 2
497 add r2, r0, r1, lsl #2 @ r2 = row 4 (second half)
499 vst1.8 {d0}, [r0,:64], r1
500 vst1.8 {d1}, [r2,:64], r1
@ x264_predict_8x8c_h: 8x8 chroma horizontal prediction — each row is
@ its left-neighbour byte replicated 8 times (vld1.8 {dN[]} loads a
@ single byte into every lane).
@ NOTE(review): the setup of r1/ip and the remaining 3 load/store
@ pairs of the unrolled loop (orig. lines 506-508, 513+) are missing.
505 function x264_predict_8x8c_h_neon
509 vld1.8 {d0[]}, [r1], ip
510 vld1.8 {d2[]}, [r1], ip
511 vst1.64 {d0}, [r0,:64], ip
512 vst1.64 {d2}, [r0,:64], ip
@ x264_predict_8x8c_v: 8x8 chroma vertical prediction — the 8 bytes
@ above the block are copied into every row.
@ NOTE(review): the ip initialisation and the repetition of the store
@ for all 8 rows (orig. lines 519, 521, 523+) are missing from this
@ excerpt.
517 function x264_predict_8x8c_v_neon
518 sub r0, r0, #FDEC_STRIDE
520 vld1.64 {d0}, [r0,:64], ip
522 vst1.64 {d0}, [r0,:64], ip
@ x264_predict_8x8c_p: 8x8 chroma plane prediction. Computes the H/V
@ gradients from the edge pixels (weighted by p16weight), derives the
@ a/b/c plane parameters, then generates each row as a clipped
@ (vqshrun #5) linear ramp, stepping the accumulator per row.
@ NOTE(review): large parts are missing from this excerpt — the edge
@ reversal/subtraction, the weight multiply, the b/c scaling, and the
@ per-row loop body around the final store (orig. 529-531, 535,
@ 537-541, 543-548, 550-555, 557-558, 560, 562-566, 568, 570+).
527 function x264_predict_8x8c_p_neon
528 sub r3, r0, #FDEC_STRIDE
532 vld1.32 {d0[0]}, [r3]
533 vld1.32 {d2[0]}, [r2,:32], r1
534 ldcol.8 d0, r3, r1, 4, hi=1 @ left column, upper 4 rows -> d0[4..7]
536 ldcol.8 d3, r3, r1, 4 @ left column, lower 4 rows
542 vld1.16 {q0}, [r3,:128]
549 vrshrn.s32 d4, q2, #5 @ rounded gradient -> b (or c) parameter
556 vadd.i16 d16, d16, d0
559 vext.16 q0, q0, q0, #7 @ rotate lane order for the ramp
561 vmul.i16 q0, q0, d4[0]
567 vqshrun.s16 d0, q1, #5 @ clip to u8: (val + 16) >> 5 saturated
569 vst1.8 {d0}, [r0,:64], r1
@ x264_predict_8x16c_dc_top: 8x16 chroma DC prediction from the top
@ row only. Forms the two rounded 4-pixel averages (vrshrn #2) and
@ stores them down all 16 rows in 4-row groups via two pointers.
@ NOTE(review): the pairwise sums/broadcast (orig. 578, 580-581,
@ 583-586) and the repetitions of the 4-row store group (orig. 588,
@ 591, 594, 597+) are missing from this excerpt.
576 function x264_predict_8x16c_dc_top_neon
577 sub r2, r0, #FDEC_STRIDE
579 vld1.8 {d0}, [r2,:64]
582 vrshrn.u16 d0, q0, #2
587 add r2, r0, r1, lsl #2 @ second pointer 4 rows below
589 vst1.8 {d0}, [r0,:64], r1
590 vst1.8 {d1}, [r2,:64], r1
592 add r2, r2, r1, lsl #2
593 add r0, r0, r1, lsl #2
595 vst1.8 {d0}, [r0,:64], r1
596 vst1.8 {d1}, [r2,:64], r1
@ x264_predict_8x16c_h: 8x16 chroma horizontal prediction — each of
@ the 16 rows is its left-neighbour byte replicated 8 times.
@ NOTE(review): the r1/ip setup and the remaining 7 load/store pairs
@ of the unrolled loop (orig. 602-604, 609+) are missing from this
@ excerpt.
601 function x264_predict_8x16c_h_neon
605 vld1.8 {d0[]}, [r1], ip
606 vld1.8 {d2[]}, [r1], ip
607 vst1.64 {d0}, [r0,:64], ip
608 vst1.64 {d2}, [r0,:64], ip
@ x264_predict_8x16c_p: 8x16 chroma plane prediction. Derives the H
@ (horizontal) and V (vertical) gradients from the edge pixels, forms
@ the plane parameters a, b = (17*H + 16) >> 5, c = (5*V + 32) >> 6,
@ then emits each row as a clipped linear ramp (vqshrun #5).
@ NOTE(review): several interior lines are missing from this excerpt
@ (orig. 615-617, 620-629, 631-634, 636, 638, 640, 644-645, 647, 650,
@ 655, 657-661, 663, 665+), including the edge gathering, weight
@ multiplies, and the per-row loop control.
613 function x264_predict_8x16c_p_neon
614 sub r3, r0, #FDEC_STRIDE
618 vld1.32 {d0[0]}, [r3]
619 vld1.32 {d2[0]}, [r2,:32], r1
630 vld1.16 {q0}, [r3,:128]
635 vpaddl.s16 d4, d4 @ d4[0] = H
637 vpadd.s32 d6, d6 @ d6[0] = V
639 vadd.s32 d4, d4, d5 @ d4[0] = 17*H
641 vrshrn.s32 d4, q2, #5 @ d4[0] = b
642 vadd.s32 d6, d6, d7 @ d6[0] = 5*V
643 vrshrn.s32 d6, q3, #6 @ d6[0] = c
646 vsub.i16 d3, d3, d4 @ d2[0] = 3 * b
648 vadd.i16 d3, d3, d2 @ d2[0] = 3 * b + 8 * c
649 vsub.i16 d3, d3, d6 @ d2[0] = 3 * b + 7 * c
651 vadd.i16 d16, d16, d0 @ d16[0] = src[]+src[] + 1
652 vshl.i16 d2, d16, #4 @ d3[0] = a + 16
653 vsub.i16 d2, d2, d3 @ i00
654 vext.16 q0, q0, q0, #7
656 vmul.i16 q0, q0, d4[0]
662 vqshrun.s16 d0, q1, #5 @ clip ramp to u8
664 vst1.8 {d0}, [r0,:64], r1
@ x264_predict_16x16_dc_top: 16x16 DC prediction from the top row.
@ add16x8 totals the 16 bytes above the block; vrshrn #4 yields the
@ rounded mean, which the (missing) tail broadcasts and stores.
@ NOTE(review): the broadcast and the 16-row store loop (orig. 677+)
@ are missing from this excerpt.
671 function x264_predict_16x16_dc_top_neon
672 sub r2, r0, #FDEC_STRIDE
674 vld1.8 {q0}, [r2,:128]
675 add16x8 q0, d0, d1, d0, d1
676 vrshrn.u16 d0, q0, #4
@ x264_predict_16x16_dc_left: 16x16 DC prediction from the left
@ column; add16x8 totals the 16 neighbour bytes, vrshrn #4 rounds.
@ NOTE(review): the column load before the sum (orig. 682-685) and
@ the broadcast/store tail (orig. 688+) are missing from this excerpt.
681 function x264_predict_16x16_dc_left_neon
686 add16x8 q0, d0, d1, d0, d1
687 vrshrn.u16 d0, q0, #4
@ x264_predict_16x16_dc: full 16x16 DC prediction — sums the 16 top
@ bytes (q0) plus the 16 left-column bytes gathered one ldrb at a
@ time, averages, and stores the broadcast mean into all 16 rows.
@ NOTE(review): missing from this excerpt: the accumulating adds
@ between the ldrb loads (orig. 697, 699-702, 704, 706, 708-709, 711,
@ 713-714), the mean computation/broadcast (orig. 716-724), and the
@ store-loop repetition (orig. 726+). As shown, ldrb writes into
@ r1/r2/r3 would be lost without those adds.
692 function x264_predict_16x16_dc_neon
693 sub r3, r0, #FDEC_STRIDE
695 vld1.64 {d0-d1}, [r3,:128]
696 ldrb ip, [r0], #FDEC_STRIDE
698 ldrb r1, [r0], #FDEC_STRIDE
703 ldrb r2, [r0], #FDEC_STRIDE
705 ldrb r3, [r0], #FDEC_STRIDE
707 ldrb r1, [r0], #FDEC_STRIDE
710 ldrb r2, [r0], #FDEC_STRIDE
712 ldrb r3, [r0], #FDEC_STRIDE
715 sub r0, r0, #FDEC_STRIDE*16 @ rewind to the block origin
725 vst1.64 {d0-d1}, [r0,:128], r1
@ x264_predict_16x16_h: 16x16 horizontal prediction — each row is its
@ left-neighbour byte replicated 16 times (single-byte load broadcast
@ across q0/q1 lanes, stored as a full 16-byte row).
@ NOTE(review): the r1/ip setup, the vdup to the upper d registers,
@ and the loop repetition (orig. 731-733, 735, 737, 740+) are missing
@ from this excerpt.
730 function x264_predict_16x16_h_neon
734 vld1.8 {d0[]}, [r1], ip
736 vld1.8 {d2[]}, [r1], ip
738 vst1.64 {d0-d1}, [r0,:128], ip
739 vst1.64 {d2-d3}, [r0,:128], ip
@ x264_predict_16x16_v: 16x16 vertical prediction — the 16 bytes
@ above the block are copied into every row.
@ NOTE(review): the ip initialisation and the 16-row store loop
@ (orig. 746, 748, 750+) are missing from this excerpt.
744 function x264_predict_16x16_v_neon
745 sub r0, r0, #FDEC_STRIDE
747 vld1.64 {d0-d1}, [r0,:128], ip
749 vst1.64 {d0-d1}, [r0,:128], ip
@ x264_predict_16x16_p: 16x16 plane prediction. Derives the H/V
@ gradients from the edge pixels (weighted by p16weight), forms the
@ a/b/c plane parameters, then generates each row as a clipped
@ (vqshrun #5) 16-pixel linear ramp.
@ NOTE(review): most of the interior is missing from this excerpt
@ (orig. 756-759, 761-768, 770-777, 779-784, 786-788, 790-791,
@ 793-800, 802, 804, 806+): edge gathering, weight multiplies, the
@ accumulator setup, and the per-row loop control.
754 function x264_predict_16x16_p_neon
755 sub r3, r0, #FDEC_STRIDE
760 vld1.8 {d2}, [r2,:64], r1
769 vld1.8 {q0}, [r3,:128]
778 vrshrn.s32 d4, q2, #6 @ rounded gradient -> plane parameter
785 vadd.i16 d16, d16, d0
789 vext.16 q0, q0, q0, #7 @ rotate lane order for the ramp
792 vmul.i16 q0, q0, d4[0]
801 vqshrun.s16 d0, q1, #5 @ clip left half of the row to u8
803 vqshrun.s16 d1, q1, #5 @ clip right half of the row to u8
805 vst1.8 {q0}, [r0,:128], r1