1 /*****************************************************************************
2 * predict.S: arm intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Table of eight 16-bit constants 1..8.
// NOTE(review): presumably the per-column weights used by the plane
// predictors in this file; the instruction that takes this label's address is
// not visible in this view -- confirm against the full source.
32 p16weight: .short 1,2,3,4,5,6,7,8
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gathers bytes from memory into individual byte lanes of \rd.
//   \rd  destination D register (lanes are written one at a time)
//   \rs  address register; post-incremented by \rt after every load
//   \rt  register holding the step between consecutive bytes
//   \n   8 selects all eight lanes; any other value selects four lanes
//   \hi  when \n != 8: 0 fills lanes 0-3, 1 fills lanes 4-7
// NOTE(review): the closing .endif/.endm lines are not visible in this view.
36 .macro ldcol.8 rd, rs, rt, n=8, hi=0
// Lower half: lanes 0-3.
37 .if \n == 8 || \hi == 0
38 vld1.8 {\rd[0]}, [\rs], \rt
39 vld1.8 {\rd[1]}, [\rs], \rt
40 vld1.8 {\rd[2]}, [\rs], \rt
41 vld1.8 {\rd[3]}, [\rs], \rt
// Upper half: lanes 4-7.
43 .if \n == 8 || \hi == 1
44 vld1.8 {\rd[4]}, [\rs], \rt
45 vld1.8 {\rd[5]}, [\rs], \rt
46 vld1.8 {\rd[6]}, [\rs], \rt
47 vld1.8 {\rd[7]}, [\rs], \rt
// add16x8 dq, dl, dh, rl, rh
// Horizontal sum of the sixteen unsigned bytes held in \rl and \rh.
//   \dq      Q register receiving the widened pairwise sums
//   \dl,\dh  D registers that are reduced; the call sites in this file pass
//            the low and high halves of \dq
//   \rl,\rh  D registers holding the 2 x 8 input bytes
// With \dl/\dh being the halves of \dq, every 16-bit lane of \dl ends up
// holding the total of all sixteen bytes.
// NOTE(review): the closing .endm is not visible in this view.
51 .macro add16x8 dq, dl, dh, rl, rh
// Widen to 16 bits while adding byte i of \rl to byte i of \rh.
52 vaddl.u8 \dq, \rl, \rh
// Fold 8 partial sums to 4, then two pairwise adds broadcast the total.
53 vadd.u16 \dl, \dl, \dh
54 vpadd.u16 \dl, \dl, \dl
55 vpadd.u16 \dl, \dl, \dl
59 // because gcc doesn't believe in using the free shift in add
// 4x4 horizontal prediction using core ARM instructions only.
// In: r0 = address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// Each row is filled with four copies of the byte located immediately to the
// left of that row (offset -1).
// Clobbers r1, r2, r3, ip.
// NOTE(review): no return instruction is visible in this view.
60 function x264_predict_4x4_h_armv6
// Fetch the left-neighbour byte of rows 0..3.
61 ldrb r1, [r0, #0*FDEC_STRIDE-1]
62 ldrb r2, [r0, #1*FDEC_STRIDE-1]
63 ldrb r3, [r0, #2*FDEC_STRIDE-1]
64 ldrb ip, [r0, #3*FDEC_STRIDE-1]
// Duplicate each byte into the low halfword: x -> x | x<<8.
65 add r1, r1, r1, lsl #8
66 add r2, r2, r2, lsl #8
67 add r3, r3, r3, lsl #8
68 add ip, ip, ip, lsl #8
// Duplicate the halfword into the full word and store one word per row.
69 add r1, r1, r1, lsl #16
70 str r1, [r0, #0*FDEC_STRIDE]
71 add r2, r2, r2, lsl #16
72 str r2, [r0, #1*FDEC_STRIDE]
73 add r3, r3, r3, lsl #16
74 str r3, [r0, #2*FDEC_STRIDE]
75 add ip, ip, ip, lsl #16
76 str ip, [r0, #3*FDEC_STRIDE]
// 4x4 vertical prediction using core ARM instructions only.
// In: r0 = address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// Copies the 4-byte word of the row directly above the block into each of the
// four rows. Clobbers r1.
// NOTE(review): no return instruction is visible in this view.
80 function x264_predict_4x4_v_armv6
// r1 = the four bytes of the row above (offset -FDEC_STRIDE).
81 ldr r1, [r0, #0 - 1 * FDEC_STRIDE]
82 str r1, [r0, #0 + 0 * FDEC_STRIDE]
83 str r1, [r0, #0 + 1 * FDEC_STRIDE]
84 str r1, [r0, #0 + 2 * FDEC_STRIDE]
85 str r1, [r0, #0 + 3 * FDEC_STRIDE]
// 4x4 DC prediction using core ARM instructions only.
// In: r0 = address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// Reads the word above the block and the byte to the left of each row, then
// fills all four rows with one replicated byte value.
// NOTE(review): the steps that combine the loaded neighbours into the value
// in r1 are not visible in this view (r3 is loaded twice with no visible use
// in between), and neither is the return -- confirm against the full source.
89 function x264_predict_4x4_dc_armv6
// r1 = four bytes of the row above.
91 ldr r1, [r0, #-FDEC_STRIDE]
// Left-neighbour bytes of rows 0..3.
92 ldrb r2, [r0, #0*FDEC_STRIDE-1]
93 ldrb r3, [r0, #1*FDEC_STRIDE-1]
96 ldrb ip, [r0, #2*FDEC_STRIDE-1]
98 ldrb r3, [r0, #3*FDEC_STRIDE-1]
// Replicate the low byte of r1 across the word (x | x<<8, then | <<16).
103 add r1, r1, r1, lsl #8
104 add r1, r1, r1, lsl #16
// Write the same word to every row.
105 str r1, [r0, #0*FDEC_STRIDE]
106 str r1, [r0, #1*FDEC_STRIDE]
107 str r1, [r0, #2*FDEC_STRIDE]
108 str r1, [r0, #3*FDEC_STRIDE]
// 4x4 DC prediction from the top neighbours only (NEON).
// In: r0 = address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// r0 is advanced by four rows as a side effect of the stores.
// NOTE(review): the arithmetic between the load and the stores, and the
// return, are not visible in this view.
112 function x264_predict_4x4_dc_top_neon
// r12 = row step, r1 = address of the row above the block.
113 mov r12, #FDEC_STRIDE
114 sub r1, r0, #FDEC_STRIDE
// Load the 32-bit word above and replicate it to both lanes of d1
// (the :32 qualifier asserts 4-byte alignment).
115 vld1.32 d1[], [r1,:32]
// Store lane 0 of d1 to each of the four rows.
120 vst1.32 d1[0], [r0,:32], r12
121 vst1.32 d1[0], [r0,:32], r12
122 vst1.32 d1[0], [r0,:32], r12
123 vst1.32 d1[0], [r0,:32], r12
// PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// Two independent 3-tap (1,2,1) low-pass filters; results replace \a1 and \a2.
// NOTE(review): the macro body and its .endm are not visible in this view, so
// the role of \pb_1 (a scratch/constant register by its name) is unconfirmed.
127 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
128 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// 4x4 diagonal-down-right prediction using core ARM instructions only.
// In: r0 = address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// Neighbours used: the word above, the top-left byte, the four left bytes.
// NOTE(review): r4-r6 are callee-saved under the AAPCS; their save/restore,
// some intermediate shifts before the stores, and the return are not visible
// in this view -- confirm against the full source.
141 function x264_predict_4x4_ddr_armv6
// r1 = top row word, r2 = top-left byte, r3 = left byte of row 0.
142 ldr r1, [r0, # -FDEC_STRIDE]
143 ldrb r2, [r0, # -FDEC_STRIDE-1]
144 ldrb r3, [r0, #0*FDEC_STRIDE-1]
// Build sliding 4-byte windows along the edge: each register is the previous
// one shifted up a byte with the next neighbour inserted in the low byte.
146 add r2, r2, r1, lsl #8
147 ldrb r4, [r0, #1*FDEC_STRIDE-1]
148 add r3, r3, r2, lsl #8
149 ldrb r5, [r0, #2*FDEC_STRIDE-1]
150 ldrb r6, [r0, #3*FDEC_STRIDE-1]
151 add r4, r4, r3, lsl #8
152 add r5, r5, r4, lsl #8
153 add r6, r6, r5, lsl #8
// Filter the windows; ip is passed as the macro's pb_1 operand.
155 PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
156 str r1, [r0, #0*FDEC_STRIDE]
// Rows 1-3 merge in the top 1, 2 and 3 bytes of r4 respectively.
161 add r2, r2, r4, lsr #24
162 str r2, [r0, #1*FDEC_STRIDE]
163 add r3, r3, r4, lsr #16
164 str r3, [r0, #2*FDEC_STRIDE]
165 add r5, r5, r4, lsr #8
166 str r5, [r0, #3*FDEC_STRIDE]
// 4x4 diagonal-down-left prediction (NEON).
// In: r0 = block address (adjusted before the load in lines not shown).
// NOTE(review): the set-up of r0/ip/d3, the filtering between the first vext
// pair and the stores, and the return are not visible in this view.
170 function x264_predict_4x4_ddl_neon
// Load eight neighbour bytes; r0 then advances by ip.
173 vld1.64 {d0}, [r0], ip
// d1 = d0 rotated down by one byte; d2 = bytes 2..9 of the pair d0:d3.
175 vext.8 d1, d0, d0, #1
176 vext.8 d2, d0, d3, #2
// Each row is the low 4 bytes of d0 taken one byte further along.
179 vst1.32 {d0[0]}, [r0,:32], ip
180 vext.8 d1, d0, d0, #1
181 vext.8 d2, d0, d0, #2
182 vst1.32 {d1[0]}, [r0,:32], ip
183 vext.8 d3, d0, d0, #3
184 vst1.32 {d2[0]}, [r0,:32], ip
185 vst1.32 {d3[0]}, [r0,:32], ip
// 8x8 DC prediction.
// In: r0 = destination block, r1 = base of a neighbour buffer that is read at
//     byte offsets 8 and 16.
// NOTE(review): r4/r5 are callee-saved under the AAPCS; their save/restore,
// the value of ip at each use, the other loads, the final reduction into d0,
// the remaining row stores and the return are not visible in this view.
189 function x264_predict_8x8_dc_neon
// Two 8-byte groups of neighbours loaded into core register pairs.
191 ldrd r2, r3, [r1, #8]
193 ldrd r4, r5, [r1, #16]
// usada8 Rd, Rn, Rm, Ra: Rd = Ra + sum of |Rn.byte[i] - Rm.byte[i]|.
// Presumably ip is zero here so this adds the four bytes of r4/r5 -- confirm.
198 usada8 r2, r4, ip, r2
200 usada8 r3, r5, ip, r3
// Store one 8-byte row (64-bit aligned) and step r0 by ip.
208 vst1.64 {d0}, [r0,:64], ip
// 8x8 horizontal prediction (NEON).
// In: r0 = destination block (64-bit aligned rows), ip = row step.
// Writes d0..d7 to eight consecutive rows, one D register per row.
// NOTE(review): the code that fills d0-d7 and sets ip, and the return, are
// not visible in this view.
213 function x264_predict_8x8_h_neon
219 vst1.64 {d0}, [r0,:64], ip
221 vst1.64 {d1}, [r0,:64], ip
223 vst1.64 {d2}, [r0,:64], ip
225 vst1.64 {d3}, [r0,:64], ip
227 vst1.64 {d4}, [r0,:64], ip
229 vst1.64 {d5}, [r0,:64], ip
231 vst1.64 {d6}, [r0,:64], ip
232 vst1.64 {d7}, [r0,:64], ip
// 8x8 vertical prediction (NEON).
// In: r0 = destination block, r1 = address of the 8 source bytes
//     (must be 64-bit aligned, per the :64 qualifier).
// NOTE(review): only one row store is visible; the repetition over the
// remaining rows, any adjustment of r1, and the return are not visible here.
236 function x264_predict_8x8_v_neon
// r12 = distance between rows.
238 mov r12, #FDEC_STRIDE
239 vld1.8 {d0}, [r1,:64]
// Write the 8 bytes to the current row and advance r0 to the next one.
241 vst1.8 {d0}, [r0,:64], r12
// 8x8 diagonal-down-left prediction (NEON).
// In: r0 = destination block, r1 = 128-bit aligned address of 16 neighbours.
// Row k (k = 0..7) receives the 8-byte window starting at byte k+1 of d0:d1.
// NOTE(review): the initialisation of q1/q3, the filtering that produces the
// final d0:d1, any adjustment of r1, and the return are not visible here.
246 function x264_predict_8x8_ddl_neon
248 vld1.8 {d0, d1}, [r1,:128]
// Neighbour vectors shifted by one byte in each direction (filter inputs).
251 vext.8 q8, q3, q0, #15
252 vext.8 q2, q0, q1, #1
254 mov r12, #FDEC_STRIDE
// d2/d3 are used alternately so each vext overlaps the previous store.
256 vext.8 d2, d0, d1, #1
257 vext.8 d3, d0, d1, #2
258 vst1.8 d2, [r0,:64], r12
259 vext.8 d2, d0, d1, #3
260 vst1.8 d3, [r0,:64], r12
261 vext.8 d3, d0, d1, #4
262 vst1.8 d2, [r0,:64], r12
263 vext.8 d2, d0, d1, #5
264 vst1.8 d3, [r0,:64], r12
265 vext.8 d3, d0, d1, #6
266 vst1.8 d2, [r0,:64], r12
267 vext.8 d2, d0, d1, #7
268 vst1.8 d3, [r0,:64], r12
269 vst1.8 d2, [r0,:64], r12
// Last row is the upper half as-is (window at byte 8).
270 vst1.8 d1, [r0,:64], r12
// 8x8 diagonal-down-right prediction (NEON).
// In: r0 = destination block, r1 = 128-bit aligned address of 32 neighbours.
// Rows are written bottom-up: row 7 gets d0, and each row above it gets the
// 8-byte window of d0:d1 starting one byte later (row 0 = offset 7).
// NOTE(review): the filtering between the vext pair and the stores, and the
// return, are not visible in this view.
274 function x264_predict_8x8_ddr_neon
275 vld1.8 {d0-d3}, [r1,:128]
// Windows at bytes 7 and 9 of q0:q1 (filter inputs around byte 8).
276 vext.8 q2, q0, q1, #7
277 vext.8 q3, q0, q1, #9
// Point r0 at the last row and walk upwards with a negative step.
283 add r0, #7*FDEC_STRIDE
284 mov r12, #-1*FDEC_STRIDE
286 vext.8 d2, d0, d1, #1
287 vst1.8 {d0}, [r0,:64], r12
288 vext.8 d4, d0, d1, #2
289 vst1.8 {d2}, [r0,:64], r12
290 vext.8 d5, d0, d1, #3
291 vst1.8 {d4}, [r0,:64], r12
292 vext.8 d4, d0, d1, #4
293 vst1.8 {d5}, [r0,:64], r12
294 vext.8 d5, d0, d1, #5
295 vst1.8 {d4}, [r0,:64], r12
296 vext.8 d4, d0, d1, #6
297 vst1.8 {d5}, [r0,:64], r12
298 vext.8 d5, d0, d1, #7
299 vst1.8 {d4}, [r0,:64], r12
300 vst1.8 {d5}, [r0,:64], r12
// 8x8 vertical-left prediction (NEON).
// In: r0 = destination block, r1 = 128-bit aligned address of 16 neighbours.
// Even rows take 8-byte windows of d6:d7, odd rows take windows of d0:d1;
// the window offset grows by one byte every two rows.
// NOTE(review): the initialisation of q1/q2, the filtering that produces
// q0 and q3, any adjustment of r1, and the return are not visible here.
304 function x264_predict_8x8_vl_neon
306 mov r12, #FDEC_STRIDE
308 vld1.8 {d0, d1}, [r1,:128]
// Neighbour vectors shifted by one byte in each direction (filter inputs).
309 vext.8 q1, q1, q0, #15
310 vext.8 q2, q0, q2, #1
// Rows 0/1: d6 and d0:d1 at offset 1.
317 vext.8 d2, d0, d1, #1
318 vst1.8 {d6}, [r0,:64], r12
319 vext.8 d3, d6, d7, #1
320 vst1.8 {d2}, [r0,:64], r12
// Rows 2/3: d6:d7 at offset 1 and d0:d1 at offset 2.
321 vext.8 d2, d0, d1, #2
322 vst1.8 {d3}, [r0,:64], r12
323 vext.8 d3, d6, d7, #2
324 vst1.8 {d2}, [r0,:64], r12
// Rows 4/5: d6:d7 at offset 2 and d0:d1 at offset 3.
325 vext.8 d2, d0, d1, #3
326 vst1.8 {d3}, [r0,:64], r12
327 vext.8 d3, d6, d7, #3
328 vst1.8 {d2}, [r0,:64], r12
// Rows 6/7: d6:d7 at offset 3 and d0:d1 at offset 4.
329 vext.8 d2, d0, d1, #4
330 vst1.8 {d3}, [r0,:64], r12
331 vst1.8 {d2}, [r0,:64], r12
// 8x8 vertical-right prediction (NEON).
// In: r0 = destination block, r1 = 64-bit aligned address of 16 neighbours.
// Rows 0 and 1 are d5 and d1; each following pair of rows is taken from
// d0:d5 and d2:d1 with the window starting one byte earlier (7, 6, 5).
// NOTE(review): any adjustment of r1, the filtering between the vext pair and
// the first store, and the return are not visible in this view.
335 function x264_predict_8x8_vr_neon
337 mov r12, #FDEC_STRIDE
338 vld1.8 {d4,d5}, [r1,:64]
// Same-register vext is a byte rotation: q1/q0 = q2 rotated by 14/15 bytes.
340 vext.8 q1, q2, q2, #14
341 vext.8 q0, q2, q2, #15
349 vst1.8 {d5}, [r0,:64], r12
351 vst1.8 {d1}, [r0,:64], r12
// Rows 2/3.
352 vext.8 d6, d0, d5, #7
353 vext.8 d3, d2, d1, #7
354 vst1.8 {d6}, [r0,:64], r12
355 vst1.8 {d3}, [r0,:64], r12
// Rows 4/5.
356 vext.8 d6, d0, d5, #6
357 vext.8 d3, d2, d1, #6
358 vst1.8 {d6}, [r0,:64], r12
359 vst1.8 {d3}, [r0,:64], r12
// Rows 6/7.
360 vext.8 d6, d0, d5, #5
361 vext.8 d3, d2, d1, #5
362 vst1.8 {d6}, [r0,:64], r12
363 vst1.8 {d3}, [r0,:64], r12
// 8x8 horizontal-down prediction (NEON).
// In: r0 = destination block (64-bit aligned rows).
// The rows are 8-byte windows of the 24-byte sequence d16:d0:d1; the window
// moves back by two bytes per row, from byte 14 (row 0) to byte 0 (row 7).
// NOTE(review): the neighbour loads, the filtering that produces d16/d0/d1,
// and the return are not visible in this view.
367 function x264_predict_8x8_hd_neon
368 mov r12, #FDEC_STRIDE
// Same-register vext is a byte rotation: q3/q2 = q1 rotated by 1/2 bytes.
372 vext.8 q3, q1, q1, #1
373 vext.8 q2, q1, q1, #2
// Rows 0-2: windows of d0:d1 at offsets 6, 4, 2. Row 3: d0 itself.
382 vext.8 d2, d0, d1, #6
383 vext.8 d3, d0, d1, #4
384 vst1.8 {d2}, [r0,:64], r12
385 vext.8 d2, d0, d1, #2
386 vst1.8 {d3}, [r0,:64], r12
387 vst1.8 {d2}, [r0,:64], r12
// Rows 4-6: windows of d16:d0 at offsets 6, 4, 2. Row 7: d16 itself.
388 vext.8 d2, d16, d0, #6
389 vst1.8 {d0}, [r0,:64], r12
390 vext.8 d3, d16, d0, #4
391 vst1.8 {d2}, [r0,:64], r12
392 vext.8 d2, d16, d0, #2
393 vst1.8 {d3}, [r0,:64], r12
394 vst1.8 {d2}, [r0,:64], r12
395 vst1.8 {d16}, [r0,:64], r12
// 8x8 horizontal-up prediction (NEON).
// In: r0 = destination block (64-bit aligned rows).
// Rows 0-3 are the low halves of q0 and of the windows of q0:q1 at byte
// offsets 2, 4 and 6; rows 4-7 are the corresponding high halves.
// NOTE(review): the neighbour loads, most of the filtering that builds q0/q1,
// and the return are not visible in this view.
400 function x264_predict_8x8_hu_neon
401 mov r12, #FDEC_STRIDE
// Windows of d7:d6 at byte offsets 2 and 1.
407 vext.8 d4, d7, d6, #2
408 vext.8 d2, d7, d6, #1
// Rounding average: d1[i] = (d16[i] + d2[i] + 1) >> 1.
412 vrhadd.u8 d1, d16, d2
// q2, q3, q8 = q0:q1 advanced by 2, 4 and 6 bytes.
418 vext.8 q2, q0, q1, #2
419 vext.8 q3, q0, q1, #4
420 vext.8 q8, q0, q1, #6
// Rows 0-3: low halves.
421 vst1.8 {d0}, [r0,:64], r12
422 vst1.8 {d4}, [r0,:64], r12
423 vst1.8 {d6}, [r0,:64], r12
424 vst1.8 {d16}, [r0,:64], r12
// Rows 4-7: high halves; the final store leaves r0 unchanged.
426 vst1.8 {d1}, [r0,:64], r12
427 vst1.8 {d5}, [r0,:64], r12
428 vst1.8 {d7}, [r0,:64], r12
429 vst1.8 {d17}, [r0,:64]
// 8x8 chroma DC prediction from the top neighbours only (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): the summation that fills q0 before the narrowing, the stores
// to the block, and the return are not visible in this view.
433 function x264_predict_8x8c_dc_top_neon
// r2 = address of the row above the block.
434 sub r2, r0, #FDEC_STRIDE
// Load the 8 bytes above the block (64-bit aligned).
436 vld1.8 {d0}, [r2,:64]
// Per 16-bit lane: (x + 2) >> 2, narrowed to 8 bits.
439 vrshrn.u16 d0, q0, #2
// 8x8 chroma DC prediction from the left neighbours only (NEON).
// NOTE(review): the loads, the summation into q0, the stores and the return
// are not visible in this view; only the final rounding step is shown.
446 function x264_predict_8x8c_dc_left_neon
// Per 16-bit lane: (x + 2) >> 2, narrowed to 8 bits.
452 vrshrn.u16 d0, q0, #2
// 8x8 chroma DC prediction using top and left neighbours (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): the left-column loads, the sums placed in q0, the assembly of
// d0/d1 from d2/d3, the value given to r1 (presumably the row stride), the
// repetition of the stores, and the return are not visible in this view.
458 function x264_predict_8x8c_dc_neon
// r2 = address of the row above the block.
459 sub r2, r0, #FDEC_STRIDE
461 vld1.8 {d0}, [r2,:64]
// Two roundings of the same sums: (x + 4) >> 3 and (x + 2) >> 2.
468 vrshrn.u16 d2, q0, #3
469 vrshrn.u16 d3, q0, #2
// r2 = r0 + 4*r1, i.e. a second write pointer four r1-steps further on.
476 add r2, r0, r1, lsl #2
// d0 goes to the upper pointer, d1 to the lower; both step by r1.
478 vst1.8 {d0}, [r0,:64], r1
479 vst1.8 {d1}, [r2,:64], r1
// 8x8 chroma horizontal prediction (NEON).
// In: r0 = destination block (64-bit aligned rows).
// Each row is filled with eight copies of one byte read through r1.
// NOTE(review): the set-up of r1 and ip, the repetition over the remaining
// rows, and the return are not visible in this view.
484 function x264_predict_8x8c_h_neon
// Load one byte and replicate it to all 8 lanes; r1 then advances by ip.
488 vld1.8 {d0[]}, [r1], ip
489 vld1.8 {d2[]}, [r1], ip
// Two rows per iteration.
490 vst1.64 {d0}, [r0,:64], ip
491 vst1.64 {d2}, [r0,:64], ip
// 8x8 chroma vertical prediction (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): the instruction setting ip, the repetition of the store over
// the remaining rows, and the return are not visible in this view.
496 function x264_predict_8x8c_v_neon
// Step back to the row above the block.
497 sub r0, r0, #FDEC_STRIDE
// Load that row; the post-increment by ip moves r0 on to the next row.
499 vld1.64 {d0}, [r0,:64], ip
501 vst1.64 {d0}, [r0,:64], ip
// 8x8 chroma plane prediction (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): most of this routine is not visible in this view (set-up of
// r1/r2, re-pointing of r3 between loads, gradient computation, the row loop
// and the return). Comments below describe single instructions only.
506 function x264_predict_8x8c_p_neon
// r3 = address of the row above the block.
507 sub r3, r0, #FDEC_STRIDE
// Four bytes into lane 0 of d0 and of d2.
511 vld1.32 {d0[0]}, [r3]
512 vld1.32 {d2[0]}, [r2,:32], r1
// Column gathers: four bytes into lanes 4-7 of d0, then lanes 0-3 of d3,
// reading through r3 with step r1.
513 ldcol.8 d0, r3, r1, 4, hi=1
515 ldcol.8 d3, r3, r1, 4
// Eight 16-bit values from a 128-bit aligned address in r3.
521 vld1.16 {q0}, [r3,:128]
// Per 32-bit lane: (x + 16) >> 5, narrowed to 16 bits.
528 vrshrn.s32 d4, q2, #5
535 vadd.i16 d16, d16, d0
// Rotate q0 by seven 16-bit lanes (same-register vext).
539 vext.16 q0, q0, q0, #7
// Scale every lane of q0 by the scalar in d4[0].
542 vmul.i16 q0, q0, d4[0]
// Per lane: x >> 5 with unsigned saturation to 8 bits.
551 vqshrun.s16 d0, q1, #5
// Store one 8-byte row and step r0 by r1.
553 vst1.8 {d0}, [r0,:64], r1
// 16x16 DC prediction from the top neighbours only (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// Computes (sum of the 16 bytes above + 8) >> 4 into the lanes of d0.
// NOTE(review): the replication of the result, the stores to the block, and
// the return are not visible in this view.
560 function x264_predict_16x16_dc_top_neon
// r2 = address of the row above the block.
561 sub r2, r0, #FDEC_STRIDE
563 vld1.8 {q0}, [r2,:128]
// Every 16-bit lane of d0 = sum of the 16 loaded bytes.
564 add16x8 q0, d0, d1, d0, d1
// Rounding divide by 16, narrowed to 8 bits.
565 vrshrn.u16 d0, q0, #4
// 16x16 DC prediction from the left neighbours only (NEON).
// NOTE(review): the loads that fill d0/d1, the stores to the block, and the
// return are not visible in this view.
570 function x264_predict_16x16_dc_left_neon
// Every 16-bit lane of d0 = sum of the 16 bytes in d0 and d1.
575 add16x8 q0, d0, d1, d0, d1
// Rounding divide by 16, narrowed to 8 bits.
576 vrshrn.u16 d0, q0, #4
// 16x16 DC prediction using top and left neighbours (NEON + core registers).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): any adjustment of r0 before the byte loads, the accumulation
// of the loaded bytes, the remaining column loads, the final DC computation,
// the set-up of r1, the repetition of the store, and the return are not
// visible in this view.
581 function x264_predict_16x16_dc_neon
// r3 = address of the row above the block; load its 16 bytes.
582 sub r3, r0, #FDEC_STRIDE
584 vld1.64 {d0-d1}, [r3,:128]
// Walk down one row per load: each ldrb post-increments r0 by FDEC_STRIDE.
585 ldrb ip, [r0], #FDEC_STRIDE
587 ldrb r1, [r0], #FDEC_STRIDE
592 ldrb r2, [r0], #FDEC_STRIDE
594 ldrb r3, [r0], #FDEC_STRIDE
596 ldrb r1, [r0], #FDEC_STRIDE
599 ldrb r2, [r0], #FDEC_STRIDE
601 ldrb r3, [r0], #FDEC_STRIDE
// Rewind r0 by sixteen rows.
604 sub r0, r0, #FDEC_STRIDE*16
// Store one 16-byte row (128-bit aligned) and step r0 by r1.
614 vst1.64 {d0-d1}, [r0,:128], r1
// 16x16 horizontal prediction (NEON).
// In: r0 = destination block (128-bit aligned rows).
// NOTE(review): the set-up of r1 and ip, the fill of d1/d3 (the visible loads
// write d0 and d2 only), the repetition over the remaining rows, and the
// return are not visible in this view.
619 function x264_predict_16x16_h_neon
// Load one byte and replicate it to all 8 lanes; r1 then advances by ip.
623 vld1.8 {d0[]}, [r1], ip
625 vld1.8 {d2[]}, [r1], ip
// Two 16-byte rows per iteration.
627 vst1.64 {d0-d1}, [r0,:128], ip
628 vst1.64 {d2-d3}, [r0,:128], ip
// 16x16 vertical prediction (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): the instruction setting ip, the repetition of the store over
// the remaining rows, and the return are not visible in this view.
633 function x264_predict_16x16_v_neon
// Step back to the row above the block.
634 sub r0, r0, #FDEC_STRIDE
// Load its 16 bytes; the post-increment by ip moves r0 on to the next row.
636 vld1.64 {d0-d1}, [r0,:128], ip
638 vst1.64 {d0-d1}, [r0,:128], ip
// 16x16 plane prediction (NEON).
// In: r0 = address of the block; rows are FDEC_STRIDE bytes apart.
// NOTE(review): most of this routine is not visible in this view (set-up of
// r1/r2, re-pointing of r3, gradient computation, the row loop, the updates
// of q1 between the two narrowing shifts, and the return). Comments below
// describe single instructions only.
643 function x264_predict_16x16_p_neon
// r3 = address of the row above the block.
644 sub r3, r0, #FDEC_STRIDE
// Eight bytes from a 64-bit aligned address; r2 then advances by r1.
649 vld1.8 {d2}, [r2,:64], r1
// Sixteen bytes from a 128-bit aligned address in r3.
658 vld1.8 {q0}, [r3,:128]
// Per 32-bit lane: (x + 32) >> 6, narrowed to 16 bits.
667 vrshrn.s32 d4, q2, #6
674 vadd.i16 d16, d16, d0
// Rotate q0 by seven 16-bit lanes (same-register vext).
678 vext.16 q0, q0, q0, #7
// Scale every lane of q0 by the scalar in d4[0].
681 vmul.i16 q0, q0, d4[0]
// Per lane: x >> 5 with unsigned saturation to 8 bits; d0 and d1 form the
// two halves of one 16-byte row.
690 vqshrun.s16 d0, q1, #5
692 vqshrun.s16 d1, q1, #5
// Store one 16-byte row and step r0 by r1.
694 vst1.8 {q0}, [r0,:128], r1