1 /*****************************************************************************
2 * predict.S: arm intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Table of eight 16-bit constants 1..8.
// NOTE(review): no reference to this label is visible in this listing;
// presumably it supplies the per-position weights for the plane (_p_)
// predictors - confirm against the full file. Any section/alignment
// directives that precede it are also not visible here.
34 p16weight: .short 1,2,3,4,5,6,7,8
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gathers single bytes from memory into individual byte lanes of \rd.
//   rd - destination D register (lanes are written one at a time)
//   rs - address register; post-incremented by \rt after every load
//   rt - register holding the distance between successive bytes
//   n  - 8 loads all eight lanes; any other value loads only one half
//   hi - when n != 8: 0 selects lanes 0-3, 1 selects lanes 4-7
// NOTE(review): the closing .endif/.endm lines are not visible in this listing.
38 .macro ldcol.8 rd, rs, rt, n=8, hi=0
// Lanes 0-3: taken for the full 8-byte form or for the low half.
39 .if \n == 8 || \hi == 0
40 vld1.8 {\rd[0]}, [\rs], \rt
41 vld1.8 {\rd[1]}, [\rs], \rt
42 vld1.8 {\rd[2]}, [\rs], \rt
43 vld1.8 {\rd[3]}, [\rs], \rt
// Lanes 4-7: taken for the full 8-byte form or for the high half.
45 .if \n == 8 || \hi == 1
46 vld1.8 {\rd[4]}, [\rs], \rt
47 vld1.8 {\rd[5]}, [\rs], \rt
48 vld1.8 {\rd[6]}, [\rs], \rt
49 vld1.8 {\rd[7]}, [\rs], \rt
// add16x8 dq, dl, dh, rl, rh
// Horizontal sum of the sixteen unsigned bytes held in \rl and \rh.
//   dq     - Q register receiving the eight widened (u16) pairwise sums
//   dl, dh - D registers that are added and then folded; the visible call
//            sites pass the low/high halves of \dq (q0, d0, d1)
//   rl, rh - D registers holding the sixteen source bytes
// When \dl/\dh are the halves of \dq, every 16-bit lane of \dl ends up
// holding the total of all sixteen bytes.
// NOTE(review): the closing .endm is not visible in this listing.
53 .macro add16x8 dq, dl, dh, rl, rh
// Widen while adding: \dq[i] = \rl[i] + \rh[i] as u16.
54 vaddl.u8 \dq, \rl, \rh
// Fold eight partial sums to four, then two pairwise adds fold to one.
55 vadd.u16 \dl, \dl, \dh
56 vpadd.u16 \dl, \dl, \dl
57 vpadd.u16 \dl, \dl, \dl
61 // because gcc doesn't believe in using the free shift in add
// 4x4 horizontal prediction: for each of the four rows, the byte located
// one position before the row start (offset -1) is replicated into all
// four bytes of that row.
//   r0 - address of row 0 of the 4x4 block; rows are FDEC_STRIDE bytes
//        apart (FDEC_STRIDE is defined outside this listing)
// Uses r1, r2, r3 and ip as scratch.
// NOTE(review): the return instruction and endfunc are not visible here.
62 function x264_predict_4x4_h_armv6
// Fetch the left-neighbour byte of rows 0..3.
63 ldrb r1, [r0, #0*FDEC_STRIDE-1]
64 ldrb r2, [r0, #1*FDEC_STRIDE-1]
65 ldrb r3, [r0, #2*FDEC_STRIDE-1]
66 ldrb ip, [r0, #3*FDEC_STRIDE-1]
// b -> b * 0x0101: duplicate the byte into the low halfword.
67 add r1, r1, r1, lsl #8
68 add r2, r2, r2, lsl #8
69 add r3, r3, r3, lsl #8
70 add ip, ip, ip, lsl #8
// halfword -> word (b * 0x01010101), then write one 32-bit word per row.
71 add r1, r1, r1, lsl #16
72 str r1, [r0, #0*FDEC_STRIDE]
73 add r2, r2, r2, lsl #16
74 str r2, [r0, #1*FDEC_STRIDE]
75 add r3, r3, r3, lsl #16
76 str r3, [r0, #2*FDEC_STRIDE]
77 add ip, ip, ip, lsl #16
78 str ip, [r0, #3*FDEC_STRIDE]
// 4x4 vertical prediction: copies the four bytes of the row directly above
// the block into each of the block's four rows.
//   r0 - address of row 0 of the 4x4 block; rows are FDEC_STRIDE bytes apart
// Uses r1 as scratch.
// NOTE(review): the return instruction and endfunc are not visible here.
82 function x264_predict_4x4_v_armv6
// One 32-bit load of the row above (offset -FDEC_STRIDE) ...
83 ldr r1, [r0, #0 - 1 * FDEC_STRIDE]
// ... stored unchanged to rows 0..3.
84 str r1, [r0, #0 + 0 * FDEC_STRIDE]
85 str r1, [r0, #0 + 1 * FDEC_STRIDE]
86 str r1, [r0, #0 + 2 * FDEC_STRIDE]
87 str r1, [r0, #0 + 3 * FDEC_STRIDE]
// 4x4 DC prediction: fills the whole 4x4 block with a single byte value.
// The visible lines read the four bytes above the block and the four bytes
// to its left, then splat the low byte of r1 across a word and store it to
// all four rows.
//   r0 - address of row 0 of the 4x4 block; rows are FDEC_STRIDE bytes apart
// NOTE(review): the instructions that combine the loaded neighbours into
// the DC value in r1 are not visible in this listing (r3 is loaded twice
// below with no visible use in between), nor are the return and endfunc.
// Presumably r1 = rounded mean of the 8 neighbours - confirm.
91 function x264_predict_4x4_dc_armv6
// Top row as one word, then the left-neighbour bytes of rows 0..3.
93 ldr r1, [r0, #-FDEC_STRIDE]
94 ldrb r2, [r0, #0*FDEC_STRIDE-1]
95 ldrb r3, [r0, #1*FDEC_STRIDE-1]
98 ldrb ip, [r0, #2*FDEC_STRIDE-1]
100 ldrb r3, [r0, #3*FDEC_STRIDE-1]
// Replicate r1 into all four byte positions (r1 * 0x01010101 when r1 < 256).
105 add r1, r1, r1, lsl #8
106 add r1, r1, r1, lsl #16
// Same word to every row.
107 str r1, [r0, #0*FDEC_STRIDE]
108 str r1, [r0, #1*FDEC_STRIDE]
109 str r1, [r0, #2*FDEC_STRIDE]
110 str r1, [r0, #3*FDEC_STRIDE]
// 4x4 DC prediction from the top neighbours only (NEON): loads the four
// bytes above the block and writes the 32-bit lane d1[0] to each of the
// four rows.
//   r0 - address of row 0 of the 4x4 block (32-bit aligned, per the :32
//        hints); advanced by 4*FDEC_STRIDE by the post-incrementing stores
// Uses r1, r12 and d1 as scratch.
// NOTE(review): the instructions that reduce the loaded bytes to the DC
// value between the load and the stores are not visible in this listing,
// nor are the return and endfunc.
114 function x264_predict_4x4_dc_top_neon
115 mov r12, #FDEC_STRIDE
// r1 -> row above the block.
116 sub r1, r0, #FDEC_STRIDE
// Load one 32-bit word and replicate it to both lanes of d1.
117 vld1.32 d1[], [r1,:32]
// One 32-bit store per row, stepping r0 by the stride each time.
122 vst1.32 d1[0], [r0,:32], r12
123 vst1.32 d1[0], [r0,:32], r12
124 vst1.32 d1[0], [r0,:32], r12
125 vst1.32 d1[0], [r0,:32], r12
// PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// Per the original note below, applies a 3-tap (1,2,1) rounded low-pass
// filter to two operand triples and leaves the results in \a1 and \a2.
// NOTE(review): only the macro header is visible in this listing; the body
// and .endm are missing, so the role of \pb_1 and whether the arithmetic is
// per packed byte cannot be confirmed from here (the visible caller passes
// ip for \pb_1).
129 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
130 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// 4x4 diagonal-down-right prediction (ARMv6 integer code). Packs the
// neighbouring pixels into 32-bit words shifted by one byte each, runs them
// through PRED4x4_LOWPASS, and stores one word per row.
//   r0 - address of row 0 of the 4x4 block; rows are FDEC_STRIDE bytes apart
// NOTE(review): r4-r6 are callee-saved under the AAPCS and are written
// here, but no save/restore is visible in this listing - confirm it exists
// in the full file. The load of ip (passed as pb_1), the set-up of
// r2/r3/r4/r5 before the final adds, and the return/endfunc are likewise
// not visible.
143 function x264_predict_4x4_ddr_armv6
// r1 = four bytes above the block, r2 = top-left corner byte,
// r3 = left neighbour of row 0.
144 ldr r1, [r0, # -FDEC_STRIDE]
145 ldrb r2, [r0, # -FDEC_STRIDE-1]
146 ldrb r3, [r0, #0*FDEC_STRIDE-1]
// Each step shifts the previous packed word up one byte and inserts the
// next neighbour (top-left, then left of rows 0..3) into the low byte.
148 add r2, r2, r1, lsl #8
149 ldrb r4, [r0, #1*FDEC_STRIDE-1]
150 add r3, r3, r2, lsl #8
151 ldrb r5, [r0, #2*FDEC_STRIDE-1]
152 ldrb r6, [r0, #3*FDEC_STRIDE-1]
153 add r4, r4, r3, lsl #8
154 add r5, r5, r4, lsl #8
155 add r6, r6, r5, lsl #8
// Filter the two triples (r1,r2,r3) and (r4,r5,r6); results in r1 and r4.
157 PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
// Row 0 is the first filtered word as-is.
158 str r1, [r0, #0*FDEC_STRIDE]
// Rows 1..3: high bytes of r4 are merged into r2, r3 and r5.
163 add r2, r2, r4, lsr #24
164 str r2, [r0, #1*FDEC_STRIDE]
165 add r3, r3, r4, lsr #16
166 str r3, [r0, #2*FDEC_STRIDE]
167 add r5, r5, r4, lsr #8
168 str r5, [r0, #3*FDEC_STRIDE]
// 4x4 diagonal-down-left prediction (NEON). Loads eight bytes, derives a
// filtered vector in d0, and writes four rows where each row is the
// previous one shifted left by one byte.
//   r0 - block address; post-incremented by ip after the load and after
//        every store
// NOTE(review): the initial adjustment of r0, the set-up of ip and d3, the
// filtering arithmetic between the vext pair and the first store, and the
// return/endfunc are not visible in this listing.
172 function x264_predict_4x4_ddl_neon
175 vld1.64 {d0}, [r0], ip
// d1 = d0 rotated by one byte; d2 = bytes 2..7 of d0 followed by the
// first two bytes of d3.
177 vext.8 d1, d0, d0, #1
178 vext.8 d2, d0, d3, #2
// Row 0 = low 32 bits of d0; rows 1..3 = d0 rotated by 1, 2 and 3 bytes.
181 vst1.32 {d0[0]}, [r0,:32], ip
182 vext.8 d1, d0, d0, #1
183 vext.8 d2, d0, d0, #2
184 vst1.32 {d1[0]}, [r0,:32], ip
185 vext.8 d3, d0, d0, #3
186 vst1.32 {d2[0]}, [r0,:32], ip
187 vst1.32 {d3[0]}, [r0,:32], ip
// 8x8 DC prediction. The visible lines read sixteen bytes from the buffer
// addressed by r1 (offsets 8..23), accumulate byte sums with usada8, and
// store an 8-byte row from d0.
//   r0 - destination row address (64-bit aligned); post-incremented by ip
//   r1 - base of a neighbour buffer - presumably the 8x8 "edge" array;
//        confirm against the caller
// NOTE(review): r4/r5 are callee-saved under the AAPCS; their save/restore,
// the initialisation of ip, the rest of the reduction, the remaining row
// stores and the return/endfunc are not visible in this listing.
191 function x264_predict_8x8_dc_neon
193 ldrd r2, r3, [r1, #8]
195 ldrd r4, r5, [r1, #16]
// rN += sum over 4 bytes of |byte(r4/r5) - byte(ip)|; this is a plain byte
// sum if ip is zero at this point (not visible - confirm).
200 usada8 r2, r4, ip, r2
202 usada8 r3, r5, ip, r3
210 vst1.64 {d0}, [r0,:64], ip
// 8x8 horizontal prediction (NEON): writes d0..d7 as eight consecutive
// 8-byte rows.
//   r0 - address of row 0 (64-bit aligned); post-incremented by ip per row
// NOTE(review): the instructions that fill d0..d7 (presumably one
// replicated left-neighbour byte per row) and set ip, plus the
// return/endfunc, are not visible in this listing.
215 function x264_predict_8x8_h_neon
221 vst1.64 {d0}, [r0,:64], ip
223 vst1.64 {d1}, [r0,:64], ip
225 vst1.64 {d2}, [r0,:64], ip
227 vst1.64 {d3}, [r0,:64], ip
229 vst1.64 {d4}, [r0,:64], ip
231 vst1.64 {d5}, [r0,:64], ip
233 vst1.64 {d6}, [r0,:64], ip
234 vst1.64 {d7}, [r0,:64], ip
// 8x8 vertical prediction (NEON): loads eight bytes from [r1] and stores
// them as a row at [r0].
//   r0 - destination row address (64-bit aligned); post-incremented by r12
//   r1 - address of the eight source bytes (64-bit aligned)
// NOTE(review): only one row store is visible; any adjustment of r1, the
// repetition over the remaining rows and the return/endfunc are not
// visible in this listing.
238 function x264_predict_8x8_v_neon
240 mov r12, #FDEC_STRIDE
241 vld1.8 {d0}, [r1,:64]
243 vst1.8 {d0}, [r0,:64], r12
// 8x8 diagonal-down-left prediction (NEON). Loads sixteen bytes from [r1],
// builds neighbour-shifted copies, and writes eight rows where row k is the
// 8-byte window starting at byte k+1 of the vector in d0:d1.
//   r0 - address of row 0 (64-bit aligned); post-incremented by r12 per row
//   r1 - address of the sixteen source bytes (128-bit aligned)
// NOTE(review): any adjustment of r1, the set-up of q3 and q1, the
// filtering arithmetic that produces the final q0, and the return/endfunc
// are not visible in this listing.
248 function x264_predict_8x8_ddl_neon
250 vld1.8 {d0, d1}, [r1,:128]
// q8 = last byte of q3 followed by bytes 0..14 of q0 (q0 shifted up one).
253 vext.8 q8, q3, q0, #15
// q2 = bytes 1..15 of q0 followed by byte 0 of q1 (q0 shifted down one).
254 vext.8 q2, q0, q1, #1
256 mov r12, #FDEC_STRIDE
// Rows 0..6: windows of d0:d1 at byte offsets 1..7, with d2/d3 used
// alternately so each extract overlaps the previous store.
258 vext.8 d2, d0, d1, #1
259 vext.8 d3, d0, d1, #2
260 vst1.8 d2, [r0,:64], r12
261 vext.8 d2, d0, d1, #3
262 vst1.8 d3, [r0,:64], r12
263 vext.8 d3, d0, d1, #4
264 vst1.8 d2, [r0,:64], r12
265 vext.8 d2, d0, d1, #5
266 vst1.8 d3, [r0,:64], r12
267 vext.8 d3, d0, d1, #6
268 vst1.8 d2, [r0,:64], r12
269 vext.8 d2, d0, d1, #7
270 vst1.8 d3, [r0,:64], r12
271 vst1.8 d2, [r0,:64], r12
// Row 7: the high half itself (offset 8).
272 vst1.8 d1, [r0,:64], r12
// 8x8 diagonal-down-right prediction (NEON). Loads 32 bytes from [r1],
// builds two shifted copies, and writes the eight rows bottom-up: the last
// row receives d0 and each row above it receives the window of d0:d1
// starting one byte later.
//   r0 - address of row 0 (64-bit aligned); moved to row 7 and then
//        stepped backwards by FDEC_STRIDE per store
//   r1 - address of the 32 source bytes (128-bit aligned)
// NOTE(review): the filtering arithmetic between the vext pair and the row
// stores, and the return/endfunc, are not visible in this listing.
276 function x264_predict_8x8_ddr_neon
277 vld1.8 {d0-d3}, [r1,:128]
// q2 / q3 = 16-byte windows of q0:q1 starting at byte 7 and byte 9.
278 vext.8 q2, q0, q1, #7
279 vext.8 q3, q0, q1, #9
// Start at the bottom row and walk upwards (negative stride in r12).
285 add r0, #7*FDEC_STRIDE
286 mov r12, #-1*FDEC_STRIDE
// Row 7 = d0; rows 6..0 = windows of d0:d1 at byte offsets 1..7.
288 vext.8 d2, d0, d1, #1
289 vst1.8 {d0}, [r0,:64], r12
290 vext.8 d4, d0, d1, #2
291 vst1.8 {d2}, [r0,:64], r12
292 vext.8 d5, d0, d1, #3
293 vst1.8 {d4}, [r0,:64], r12
294 vext.8 d4, d0, d1, #4
295 vst1.8 {d5}, [r0,:64], r12
296 vext.8 d5, d0, d1, #5
297 vst1.8 {d4}, [r0,:64], r12
298 vext.8 d4, d0, d1, #6
299 vst1.8 {d5}, [r0,:64], r12
300 vext.8 d5, d0, d1, #7
301 vst1.8 {d4}, [r0,:64], r12
302 vst1.8 {d5}, [r0,:64], r12
// 8x8 vertical-left prediction (NEON). Loads sixteen bytes from [r1] and
// writes eight rows that alternate between two source vectors: even rows
// are windows of d6:d7 at byte offsets 0..3, odd rows are windows of d0:d1
// at byte offsets 1..4.
//   r0 - address of row 0 (64-bit aligned); post-incremented by r12 per row
//   r1 - address of the sixteen source bytes (128-bit aligned)
// NOTE(review): any adjustment of r1, the arithmetic that produces q3
// (d6:d7) and the final q0, and the return/endfunc are not visible in this
// listing. q1 and q2 are read by the first two vext instructions without a
// visible initialisation; only one byte of each enters the result.
306 function x264_predict_8x8_vl_neon
308 mov r12, #FDEC_STRIDE
310 vld1.8 {d0, d1}, [r1,:128]
// q1 = last byte of old q1 followed by bytes 0..14 of q0.
311 vext.8 q1, q1, q0, #15
// q2 = bytes 1..15 of q0 followed by byte 0 of old q2.
312 vext.8 q2, q0, q2, #1
// Interleaved extract/store so each vext overlaps the previous store.
319 vext.8 d2, d0, d1, #1
320 vst1.8 {d6}, [r0,:64], r12
321 vext.8 d3, d6, d7, #1
322 vst1.8 {d2}, [r0,:64], r12
323 vext.8 d2, d0, d1, #2
324 vst1.8 {d3}, [r0,:64], r12
325 vext.8 d3, d6, d7, #2
326 vst1.8 {d2}, [r0,:64], r12
327 vext.8 d2, d0, d1, #3
328 vst1.8 {d3}, [r0,:64], r12
329 vext.8 d3, d6, d7, #3
330 vst1.8 {d2}, [r0,:64], r12
331 vext.8 d2, d0, d1, #4
332 vst1.8 {d3}, [r0,:64], r12
333 vst1.8 {d2}, [r0,:64], r12
// 8x8 vertical-right prediction (NEON). Loads sixteen bytes from [r1],
// builds rotated copies, and writes eight rows in pairs: rows 0/1 come
// from d5 and d1, and each later pair is a window taken one byte earlier
// (offsets 7, 6, 5) across d0:d5 and d2:d1 respectively.
//   r0 - address of row 0 (64-bit aligned); post-incremented by r12 per row
//   r1 - address of the sixteen source bytes (64-bit aligned)
// NOTE(review): any adjustment of r1, the filtering arithmetic and lane
// shuffles between the vext pair and the stores, and the return/endfunc are
// not visible in this listing.
337 function x264_predict_8x8_vr_neon
339 mov r12, #FDEC_STRIDE
340 vld1.8 {d4,d5}, [r1,:64]
// q1 / q0 = q2 rotated by 14 and by 15 bytes.
342 vext.8 q1, q2, q2, #14
343 vext.8 q0, q2, q2, #15
// Rows 0 and 1.
351 vst1.8 {d5}, [r0,:64], r12
353 vst1.8 {d1}, [r0,:64], r12
// Rows 2/3, 4/5, 6/7: same two sources, window start moved back by one
// byte per pair.
354 vext.8 d6, d0, d5, #7
355 vext.8 d3, d2, d1, #7
356 vst1.8 {d6}, [r0,:64], r12
357 vst1.8 {d3}, [r0,:64], r12
358 vext.8 d6, d0, d5, #6
359 vext.8 d3, d2, d1, #6
360 vst1.8 {d6}, [r0,:64], r12
361 vst1.8 {d3}, [r0,:64], r12
362 vext.8 d6, d0, d5, #5
363 vext.8 d3, d2, d1, #5
364 vst1.8 {d6}, [r0,:64], r12
365 vst1.8 {d3}, [r0,:64], r12
// 8x8 horizontal-down prediction (NEON). Writes eight rows, each an 8-byte
// window into the 24-byte sequence d16:d0:d1, with the window start moving
// back by two bytes per row (row 0 starts at byte 6 of d0, row 7 is d16).
//   r0 - address of row 0 (64-bit aligned); post-incremented by r12 per row
// NOTE(review): the load into q1, the arithmetic that produces d0, d1 and
// d16, and the return/endfunc are not visible in this listing.
369 function x264_predict_8x8_hd_neon
370 mov r12, #FDEC_STRIDE
// q3 / q2 = q1 rotated by one and by two bytes.
374 vext.8 q3, q1, q1, #1
375 vext.8 q2, q1, q1, #2
// Rows 0..2: windows of d0:d1 at byte offsets 6, 4, 2.
384 vext.8 d2, d0, d1, #6
385 vext.8 d3, d0, d1, #4
386 vst1.8 {d2}, [r0,:64], r12
387 vext.8 d2, d0, d1, #2
388 vst1.8 {d3}, [r0,:64], r12
389 vst1.8 {d2}, [r0,:64], r12
// Row 3: d0 itself; rows 4..6: windows of d16:d0 at offsets 6, 4, 2.
390 vext.8 d2, d16, d0, #6
391 vst1.8 {d0}, [r0,:64], r12
392 vext.8 d3, d16, d0, #4
393 vst1.8 {d2}, [r0,:64], r12
394 vext.8 d2, d16, d0, #2
395 vst1.8 {d3}, [r0,:64], r12
396 vst1.8 {d2}, [r0,:64], r12
// Row 7: d16 itself.
397 vst1.8 {d16}, [r0,:64], r12
// 8x8 horizontal-up prediction (NEON). Writes eight rows taken from 16-byte
// windows of q0:q1 at byte offsets 0, 2, 4 and 6: rows 0..3 are the low
// halves of those windows, rows 4..7 the high halves.
//   r0 - address of row 0 (64-bit aligned); post-incremented by r12 after
//        every store except the last
// NOTE(review): the loads into d6/d7, most of the filtering arithmetic that
// produces q0/q1, and the return/endfunc are not visible in this listing.
402 function x264_predict_8x8_hu_neon
403 mov r12, #FDEC_STRIDE
// d4 / d2 = windows of d7:d6 starting at byte 2 and byte 1.
409 vext.8 d4, d7, d6, #2
410 vext.8 d2, d7, d6, #1
// Rounded average: d1[i] = (d16[i] + d2[i] + 1) >> 1.
414 vrhadd.u8 d1, d16, d2
// q2, q3, q8 = q0:q1 windows at byte offsets 2, 4, 6.
420 vext.8 q2, q0, q1, #2
421 vext.8 q3, q0, q1, #4
422 vext.8 q8, q0, q1, #6
// Rows 0..3: low halves (d0, d4, d6, d16).
423 vst1.8 {d0}, [r0,:64], r12
424 vst1.8 {d4}, [r0,:64], r12
425 vst1.8 {d6}, [r0,:64], r12
426 vst1.8 {d16}, [r0,:64], r12
// Rows 4..7: high halves (d1, d5, d7, d17).
428 vst1.8 {d1}, [r0,:64], r12
429 vst1.8 {d5}, [r0,:64], r12
430 vst1.8 {d7}, [r0,:64], r12
431 vst1.8 {d17}, [r0,:64]
// 8x8 chroma DC prediction from the top neighbours only (NEON). The
// visible lines load the eight bytes above the block and narrow 16-bit
// sums to bytes with a rounding shift by 2.
//   r0 - address of row 0 of the block; rows are FDEC_STRIDE bytes apart
// NOTE(review): the summation between the load and the narrowing shift,
// everything after it (presumably replicating the result and storing the
// rows) and the return/endfunc are not visible in this listing.
435 function x264_predict_8x8c_dc_top_neon
// r2 -> row above the block.
436 sub r2, r0, #FDEC_STRIDE
438 vld1.8 {d0}, [r2,:64]
// d0[i] = (q0[i] + 2) >> 2, narrowed from 16 to 8 bits.
441 vrshrn.u16 d0, q0, #2
// 8x8 chroma DC prediction from the left neighbours only (NEON).
// NOTE(review): only the final narrowing step is visible in this listing;
// the loads, the summation, the stores and the return/endfunc are missing,
// so the contract cannot be described from here.
448 function x264_predict_8x8c_dc_left_neon
// d0[i] = (q0[i] + 2) >> 2, narrowed from 16 to 8 bits.
454 vrshrn.u16 d0, q0, #2
// 8x8 chroma DC prediction (NEON). The visible lines load the eight bytes
// above the block, narrow 16-bit sums with two different rounding shifts
// (by 3 and by 2), and store d0 and d1 as 8-byte rows through two
// pointers, r0 and r0 + 4*r1.
//   r0 - address of row 0 of the block (64-bit aligned)
// NOTE(review): the set-up of r1 (presumably the row stride), the left
// column load, the summation, the shuffles that build d0/d1 from d2/d3,
// the remaining row stores and the return/endfunc are not visible in this
// listing.
460 function x264_predict_8x8c_dc_neon
// r2 -> row above the block.
461 sub r2, r0, #FDEC_STRIDE
463 vld1.8 {d0}, [r2,:64]
// d2[i] = (q0[i] + 4) >> 3; d3[i] = (q0[i] + 2) >> 2.
470 vrshrn.u16 d2, q0, #3
471 vrshrn.u16 d3, q0, #2
// r2 = r0 + 4*r1: a second write pointer four r1-steps below r0.
478 add r2, r0, r1, lsl #2
480 vst1.8 {d0}, [r0,:64], r1
481 vst1.8 {d1}, [r2,:64], r1
// 8x8 chroma horizontal prediction (NEON): each row is filled with a
// single byte read through r1. Two rows are handled per visible group.
//   r0 - destination row address (64-bit aligned); post-incremented by ip
//   r1 - address of the byte to replicate; post-incremented by ip
// NOTE(review): the set-up of r1 and ip, the repetition over the remaining
// rows (loop or unroll) and the return/endfunc are not visible in this
// listing.
486 function x264_predict_8x8c_h_neon
// Load one byte and replicate it to all eight lanes, for two rows.
490 vld1.8 {d0[]}, [r1], ip
491 vld1.8 {d2[]}, [r1], ip
492 vst1.64 {d0}, [r0,:64], ip
493 vst1.64 {d2}, [r0,:64], ip
// 8x8 chroma vertical prediction (NEON): reads the eight bytes of the row
// above the block and stores them as a row of the block.
//   r0 - address of row 0 of the block (64-bit aligned); moved up one row
//        for the load, then post-incremented by ip per access
// NOTE(review): the set-up of ip (presumably FDEC_STRIDE), the repetition
// over the remaining rows and the return/endfunc are not visible in this
// listing.
498 function x264_predict_8x8c_v_neon
// r0 -> row above the block.
499 sub r0, r0, #FDEC_STRIDE
// The post-increment leaves r0 one ip-step further on for the store.
501 vld1.64 {d0}, [r0,:64], ip
503 vst1.64 {d0}, [r0,:64], ip
// 8x8 chroma plane prediction (NEON). The visible lines gather neighbour
// pixels from the row above and from a column, load eight 16-bit values
// through r3, derive a scaled gradient, and write rows of eight saturated
// bytes.
//   r0 - address of row 0 of the block (64-bit aligned); post-incremented
//        by r1 per stored row
// NOTE(review): most of this routine is not visible in this listing: the
// set-up of r1 and r2, the adjustments of r3 between uses, the gradient
// sums, the row loop and the return/endfunc. The 16-bit table loaded via r3
// is presumably p16weight - confirm.
508 function x264_predict_8x8c_p_neon
// r3 -> row above the block.
509 sub r3, r0, #FDEC_STRIDE
// Four bytes into the low 32-bit lane of d0 and of d2.
513 vld1.32 {d0[0]}, [r3]
514 vld1.32 {d2[0]}, [r2,:32], r1
// Column gathers via ldcol.8 with n=4: lanes 4..7 of d0 (hi=1), then
// lanes 0..3 of d3 (hi defaults to 0).
515 ldcol.8 d0, r3, r1, 4, hi=1
517 ldcol.8 d3, r3, r1, 4
// Eight 16-bit values from a 128-bit aligned address.
523 vld1.16 {q0}, [r3,:128]
// d4[i] = (q2[i] + 16) >> 5, narrowed from 32 to 16 bits.
530 vrshrn.s32 d4, q2, #5
537 vadd.i16 d16, d16, d0
// Rotate q0 by seven 16-bit lanes, then scale every lane by d4[0].
541 vext.16 q0, q0, q0, #7
544 vmul.i16 q0, q0, d4[0]
// Shift right by 5 and saturate the signed 16-bit lanes to unsigned bytes.
553 vqshrun.s16 d0, q1, #5
555 vst1.8 {d0}, [r0,:64], r1
// 16x16 DC prediction from the top neighbours only (NEON). Loads the
// sixteen bytes above the block, sums them with add16x8, and narrows with
// a rounding shift by 4, i.e. (sum + 8) >> 4 in the low bytes of d0.
//   r0 - address of row 0 of the block; rows are FDEC_STRIDE bytes apart
// NOTE(review): everything after the narrowing shift (presumably
// replicating the value and storing the sixteen rows) and the
// return/endfunc are not visible in this listing.
562 function x264_predict_16x16_dc_top_neon
// r2 -> row above the block.
563 sub r2, r0, #FDEC_STRIDE
565 vld1.8 {q0}, [r2,:128]
// Every 16-bit lane of d0 = sum of the sixteen loaded bytes.
566 add16x8 q0, d0, d1, d0, d1
567 vrshrn.u16 d0, q0, #4
// 16x16 DC prediction from the left neighbours only (NEON). The visible
// lines sum the sixteen bytes held in d0/d1 with add16x8 and narrow with a
// rounding shift by 4, i.e. (sum + 8) >> 4 in the low bytes of d0.
// NOTE(review): the column load that fills d0/d1, the stores and the
// return/endfunc are not visible in this listing.
572 function x264_predict_16x16_dc_left_neon
577 add16x8 q0, d0, d1, d0, d1
578 vrshrn.u16 d0, q0, #4
// 16x16 DC prediction (NEON + integer). The visible lines load the sixteen
// bytes above the block into q0, read bytes down a column through r0 using
// post-indexed loads, rewind r0 by sixteen rows, and store a 16-byte row
// from q0.
//   r0 - block address; stepped by FDEC_STRIDE per column byte, then
//        rewound by 16*FDEC_STRIDE; post-incremented by r1 per stored row
// NOTE(review): only seven of the column loads are visible, and the
// instructions that accumulate the loaded bytes, any initial adjustment of
// r0, the final averaging/replication, the set-up of r1 as a stride, the
// row loop and the return/endfunc are all missing from this listing.
583 function x264_predict_16x16_dc_neon
// r3 -> row above the block (r3 is reused as a scratch byte further down).
584 sub r3, r0, #FDEC_STRIDE
586 vld1.64 {d0-d1}, [r3,:128]
// Post-indexed byte loads: read [r0], then r0 += FDEC_STRIDE.
587 ldrb ip, [r0], #FDEC_STRIDE
589 ldrb r1, [r0], #FDEC_STRIDE
594 ldrb r2, [r0], #FDEC_STRIDE
596 ldrb r3, [r0], #FDEC_STRIDE
598 ldrb r1, [r0], #FDEC_STRIDE
601 ldrb r2, [r0], #FDEC_STRIDE
603 ldrb r3, [r0], #FDEC_STRIDE
// Undo sixteen row steps.
606 sub r0, r0, #FDEC_STRIDE*16
616 vst1.64 {d0-d1}, [r0,:128], r1
// 16x16 horizontal prediction (NEON): each 16-byte row is written from a
// register pair whose low half is a single byte, read through r1,
// replicated to eight lanes. Two rows are handled per visible group.
//   r0 - destination row address (128-bit aligned); post-incremented by ip
//   r1 - address of the byte to replicate; post-incremented by ip
// NOTE(review): the set-up of r1 and ip, the instructions that fill the
// high halves d1 and d3 (presumably copies of d0 and d2), the repetition
// over the remaining rows and the return/endfunc are not visible in this
// listing.
621 function x264_predict_16x16_h_neon
625 vld1.8 {d0[]}, [r1], ip
627 vld1.8 {d2[]}, [r1], ip
629 vst1.64 {d0-d1}, [r0,:128], ip
630 vst1.64 {d2-d3}, [r0,:128], ip
// 16x16 vertical prediction (NEON): reads the sixteen bytes of the row
// above the block and stores them as a row of the block.
//   r0 - address of row 0 of the block (128-bit aligned); moved up one row
//        for the load, then post-incremented by ip per access
// NOTE(review): the set-up of ip (presumably FDEC_STRIDE), the repetition
// over the remaining rows and the return/endfunc are not visible in this
// listing.
635 function x264_predict_16x16_v_neon
// r0 -> row above the block.
636 sub r0, r0, #FDEC_STRIDE
// The post-increment leaves r0 one ip-step further on for the store.
638 vld1.64 {d0-d1}, [r0,:128], ip
640 vst1.64 {d0-d1}, [r0,:128], ip
// 16x16 plane prediction (NEON). The visible lines load neighbour pixels
// and eight 16-bit values, derive a scaled gradient, and write rows of
// sixteen saturated bytes.
//   r0 - address of row 0 of the block (128-bit aligned); post-incremented
//        by r1 per stored row
// NOTE(review): most of this routine is not visible in this listing: the
// set-up of r1 and r2, the column loads, the gradient sums, the updates of
// q1 between the two narrowing shifts, the row loop and the return/endfunc.
// The definition probably continues past the last visible line. The 16-bit
// table loaded via r3 is presumably p16weight - confirm.
645 function x264_predict_16x16_p_neon
// r3 -> row above the block.
646 sub r3, r0, #FDEC_STRIDE
651 vld1.8 {d2}, [r2,:64], r1
// Eight 16-bit values from a 128-bit aligned address.
660 vld1.8 {q0}, [r3,:128]
// d4[i] = (q2[i] + 32) >> 6, narrowed from 32 to 16 bits.
669 vrshrn.s32 d4, q2, #6
676 vadd.i16 d16, d16, d0
// Rotate q0 by seven 16-bit lanes, then scale every lane by d4[0].
680 vext.16 q0, q0, q0, #7
683 vmul.i16 q0, q0, d4[0]
// Shift right by 5 and saturate to unsigned bytes: low then high half of
// the output row.
692 vqshrun.s16 d0, q1, #5
694 vqshrun.s16 d1, q1, #5
696 vst1.8 {q0}, [r0,:128], r1