1 /*****************************************************************************
2 * predict.S: arm intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
34 p16weight: .short 1,2,3,4,5,6,7,8
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gather a column of pixels into the byte lanes of NEON register \rd:
// each vld1.8 loads one byte from [\rs] and post-increments \rs by the
// stride register \rt. \n selects how many lanes to fill (4 or 8);
// for \n == 4, \hi selects whether the low (0) or high (1) half of
// \rd is filled.
38 .macro ldcol.8 rd, rs, rt, n=8, hi=0
39 .if \n == 8 || \hi == 0
40 vld1.8 {\rd[0]}, [\rs], \rt
41 vld1.8 {\rd[1]}, [\rs], \rt
42 vld1.8 {\rd[2]}, [\rs], \rt
43 vld1.8 {\rd[3]}, [\rs], \rt
45 .if \n == 8 || \hi == 1
46 vld1.8 {\rd[4]}, [\rs], \rt
47 vld1.8 {\rd[5]}, [\rs], \rt
48 vld1.8 {\rd[6]}, [\rs], \rt
49 vld1.8 {\rd[7]}, [\rs], \rt
// NOTE(review): the closing .endif directives and .endm (original
// lines 44 and 50+) are not visible in this chunk.
// add16x8 dq, dl, dh, rl, rh
// Horizontal sum of 16 bytes: widen-add the two 8-byte vectors \rl and
// \rh into eight u16 lanes (\dq = \dl:\dh), then fold 8 -> 4 lanes,
// then pairwise-add twice so every lane of \dl (in particular lane 0)
// holds the total.
53 .macro add16x8 dq, dl, dh, rl, rh
54 vaddl.u8 \dq, \rl, \rh
55 vadd.u16 \dl, \dl, \dh
56 vpadd.u16 \dl, \dl, \dl
57 vpadd.u16 \dl, \dl, \dl
// NOTE(review): .endm (original line 58) is not visible in this chunk.
61 // because gcc doesn't believe in using the free shift in add
// Horizontal 4x4 intra prediction (ARMv6): each of the 4 rows is
// filled with the pixel immediately to its left (column -1).
// r0 = block pointer inside the decoded-frame buffer; rows are
// FDEC_STRIDE bytes apart (FDEC_STRIDE is defined elsewhere in the
// project headers).
62 function x264_predict_4x4_h_armv6
// Fetch each row's left neighbour.
63 ldrb r1, [r0, #0*FDEC_STRIDE-1]
64 ldrb r2, [r0, #1*FDEC_STRIDE-1]
65 ldrb r3, [r0, #2*FDEC_STRIDE-1]
66 ldrb ip, [r0, #3*FDEC_STRIDE-1]
// Replicate each byte across its 32-bit register: x += x<<8 puts the
// byte in two positions, then x += x<<16 fills all four — this is the
// "free shift in add" the comment above refers to.
67 add r1, r1, r1, lsl #8
68 add r2, r2, r2, lsl #8
69 add r3, r3, r3, lsl #8
70 add ip, ip, ip, lsl #8
71 add r1, r1, r1, lsl #16
// Store one replicated word per row.
72 str r1, [r0, #0*FDEC_STRIDE]
73 add r2, r2, r2, lsl #16
74 str r2, [r0, #1*FDEC_STRIDE]
75 add r3, r3, r3, lsl #16
76 str r3, [r0, #2*FDEC_STRIDE]
77 add ip, ip, ip, lsl #16
78 str ip, [r0, #3*FDEC_STRIDE]
// NOTE(review): function footer (return + endfunc, original lines 79+)
// is not visible in this chunk.
// DC 4x4 intra prediction (ARMv6): fill the block with the rounded
// average of the 4 top and 4 left neighbour pixels.
82 function x264_predict_4x4_dc_armv6
// r1 = all four top-neighbour bytes packed in one word.
84 ldr r1, [r0, #-FDEC_STRIDE]
85 ldrb r2, [r0, #0*FDEC_STRIDE-1]
86 ldrb r3, [r0, #1*FDEC_STRIDE-1]
// NOTE(review): original lines 87-88 (accumulation of the bytes read
// above) are missing from this chunk.
89 ldrb ip, [r0, #2*FDEC_STRIDE-1]
// NOTE(review): original line 90 missing; r3 is reloaded below, so the
// earlier r3 value must have been consumed by the missing code.
91 ldrb r3, [r0, #3*FDEC_STRIDE-1]
// NOTE(review): original lines 92-95 (final sum + rounding shift) are
// missing; at this point r1 presumably holds the DC value in its low
// byte — confirm against the full file.
// Replicate the DC byte across a word and store all four rows.
96 add r1, r1, r1, lsl #8
97 add r1, r1, r1, lsl #16
98 str r1, [r0, #0*FDEC_STRIDE]
99 str r1, [r0, #1*FDEC_STRIDE]
100 str r1, [r0, #2*FDEC_STRIDE]
101 str r1, [r0, #3*FDEC_STRIDE]
// DC-top 4x4 intra prediction (NEON): average only the 4 pixels above
// the block and fill all rows with that value.
105 function x264_predict_4x4_dc_top_neon
106 mov r12, #FDEC_STRIDE
107 sub r1, r0, #FDEC_STRIDE
// Load the 4 top bytes as one 32-bit element replicated to both lanes.
108 vld1.32 d1[], [r1,:32]
// NOTE(review): original lines 109-112 (horizontal add + rounding
// average producing the DC byte in d1) are missing from this chunk.
// Write the same word to all four rows, advancing by the stride.
113 vst1.32 d1[0], [r0,:32], r12
114 vst1.32 d1[0], [r0,:32], r12
115 vst1.32 d1[0], [r0,:32], r12
116 vst1.32 d1[0], [r0,:32], r12
// 3-tap lowpass filter used by the diagonal 4x4 predictors; computes
// two filtered results at once per the formula below. \pb_1 is
// presumably a register holding the byte constant 1 — confirm, the
// macro body is not in view.
120 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
121 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// NOTE(review): the macro body and .endm (original lines 122+) are
// not visible in this chunk.
// Down-right diagonal 4x4 intra prediction (ARMv6). Packs the top
// row, top-left corner and left column into byte-shifted words, runs
// the 3-tap lowpass filter, then emits one byte-shifted word per row.
134 function x264_predict_4x4_ddr_armv6
135 ldr r1, [r0, # -FDEC_STRIDE]
136 ldrb r2, [r0, # -FDEC_STRIDE-1]
137 ldrb r3, [r0, #0*FDEC_STRIDE-1]
// NOTE(review): original line 138 is missing — r4-r6 are clobbered
// below, so it presumably saves them (push); confirm the matching pop
// in the missing footer.
// Build overlapping 4-byte windows of the neighbour sequence by
// shifting the previous word up a byte and inserting the next pixel.
139 add r2, r2, r1, lsl #8
140 ldrb r4, [r0, #1*FDEC_STRIDE-1]
141 add r3, r3, r2, lsl #8
142 ldrb r5, [r0, #2*FDEC_STRIDE-1]
143 ldrb r6, [r0, #3*FDEC_STRIDE-1]
144 add r4, r4, r3, lsl #8
145 add r5, r5, r4, lsl #8
146 add r6, r6, r5, lsl #8
// NOTE(review): original line 147 missing (likely loads the constant
// for the filter's \pb_1 argument into ip).
148 PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
149 str r1, [r0, #0*FDEC_STRIDE]
// NOTE(review): original lines 150-153 missing (shifts preparing
// r2/r3/r5 before the diagonal-byte inserts below).
154 add r2, r2, r4, lsr #24
155 str r2, [r0, #1*FDEC_STRIDE]
156 add r3, r3, r4, lsr #16
157 str r3, [r0, #2*FDEC_STRIDE]
158 add r5, r5, r4, lsr #8
159 str r5, [r0, #3*FDEC_STRIDE]
// Down-left diagonal 4x4 intra prediction (NEON): lowpass-filter the
// row above, then each output row is the filtered row shifted left by
// one more byte.
163 function x264_predict_4x4_ddl_neon
// NOTE(review): original lines 164-165 missing (presumably set
// ip = FDEC_STRIDE and point r0 at the row above the block).
166 vld1.64 {d0}, [r0], ip
// NOTE(review): original line 167 missing (likely builds d3, used as
// the shift-in source at line 169).
168 vext.8 d1, d0, d0, #1
169 vext.8 d2, d0, d3, #2
// NOTE(review): original lines 170-171 missing (the 3-tap lowpass
// producing the filtered pixels in d0).
// Emit 4 rows: the filtered vector shifted by 0..3 bytes.
172 vst1.32 {d0[0]}, [r0,:32], ip
173 vext.8 d1, d0, d0, #1
174 vext.8 d2, d0, d0, #2
175 vst1.32 {d1[0]}, [r0,:32], ip
176 vext.8 d3, d0, d0, #3
177 vst1.32 {d2[0]}, [r0,:32], ip
178 vst1.32 {d3[0]}, [r0,:32], ip
// DC 8x8 intra prediction. Sums neighbour bytes from the edge buffer
// in r1 using usada8 (per-byte absolute-difference accumulate; with a
// zero operand this is a byte sum — ip is presumably zeroed in a
// missing line, confirm).
182 function x264_predict_8x8_dc_neon
// NOTE(review): this function is heavily gapped — original lines 183,
// 185, 187-190, 192, 194-200 and 202+ are missing (push/zeroing, the
// remaining accumulations, the averaging, vdup, and the other 7 row
// stores + endfunc).
184 ldrd r2, r3, [r1, #8]
186 ldrd r4, r5, [r1, #16]
191 usada8 r2, r4, ip, r2
193 usada8 r3, r5, ip, r3
201 vst1.64 {d0}, [r0,:64], ip
// Horizontal 8x8 intra prediction: each row i is filled with a whole
// register (d0..d7) that presumably holds row i's left neighbour
// replicated 8 times — the loads/duplication (original lines 207-211
// and the interleaved odd lines 213..223) are missing from this chunk.
// ip is the row stride (set in a missing line; presumably FDEC_STRIDE).
206 function x264_predict_8x8_h_neon
212 vst1.64 {d0}, [r0,:64], ip
214 vst1.64 {d1}, [r0,:64], ip
216 vst1.64 {d2}, [r0,:64], ip
218 vst1.64 {d3}, [r0,:64], ip
220 vst1.64 {d4}, [r0,:64], ip
222 vst1.64 {d5}, [r0,:64], ip
224 vst1.64 {d6}, [r0,:64], ip
225 vst1.64 {d7}, [r0,:64], ip
// Vertical 8x8 intra prediction: copy the 8 bytes above the block
// (read via r1, presumably the neighbour/edge pointer — confirm) into
// every row.
229 function x264_predict_8x8_v_neon
// NOTE(review): original lines 230 and 233 are missing; only one row
// store is visible — the full file presumably repeats the store for
// all 8 rows.
231 mov r12, #FDEC_STRIDE
232 vld1.8 {d0}, [r1,:64]
234 vst1.8 {d0}, [r0,:64], r12
// Down-left diagonal 8x8 intra prediction (NEON). Loads 16 neighbour
// bytes from the edge buffer r1, lowpass-filters them (filter lines
// missing below), then each row is the filtered q0:d1 sequence shifted
// by one more byte.
239 function x264_predict_8x8_ddl_neon
241 vld1.8 {d0, d1}, [r1,:128]
// NOTE(review): original lines 240, 242-243 and 246, 248 are missing
// (the vdup/setup for q3 used below and the 3-tap lowpass combining
// q8/q2 into q0).
244 vext.8 q8, q3, q0, #15
245 vext.8 q2, q0, q1, #1
247 mov r12, #FDEC_STRIDE
// Emit rows 0..7 as 1..8-byte shifts of the filtered 16-byte vector,
// ping-ponging between d2 and d3 so each vext overlaps the next store.
249 vext.8 d2, d0, d1, #1
250 vext.8 d3, d0, d1, #2
251 vst1.8 d2, [r0,:64], r12
252 vext.8 d2, d0, d1, #3
253 vst1.8 d3, [r0,:64], r12
254 vext.8 d3, d0, d1, #4
255 vst1.8 d2, [r0,:64], r12
256 vext.8 d2, d0, d1, #5
257 vst1.8 d3, [r0,:64], r12
258 vext.8 d3, d0, d1, #6
259 vst1.8 d2, [r0,:64], r12
260 vext.8 d2, d0, d1, #7
261 vst1.8 d3, [r0,:64], r12
262 vst1.8 d2, [r0,:64], r12
263 vst1.8 d1, [r0,:64], r12
// Down-right diagonal 8x8 intra prediction (NEON). Loads 32 edge
// bytes, lowpass-filters (filter lines missing), then writes rows
// bottom-up (negative stride) as successive byte shifts.
267 function x264_predict_8x8_ddr_neon
268 vld1.8 {d0-d3}, [r1,:128]
// q2/q3 = the edge sequence shifted -1/+1 byte: the a/c taps of the
// 3-tap filter around q0/q1.
269 vext.8 q2, q0, q1, #7
270 vext.8 q3, q0, q1, #9
// NOTE(review): original lines 271-275 and 278 are missing (the
// lowpass combine producing the filtered pixels in d0/d1 and related
// setup).
// Start at the last row and walk upwards.
276 add r0, #7*FDEC_STRIDE
277 mov r12, #-1*FDEC_STRIDE
279 vext.8 d2, d0, d1, #1
280 vst1.8 {d0}, [r0,:64], r12
281 vext.8 d4, d0, d1, #2
282 vst1.8 {d2}, [r0,:64], r12
283 vext.8 d5, d0, d1, #3
284 vst1.8 {d4}, [r0,:64], r12
285 vext.8 d4, d0, d1, #4
286 vst1.8 {d5}, [r0,:64], r12
287 vext.8 d5, d0, d1, #5
288 vst1.8 {d4}, [r0,:64], r12
289 vext.8 d4, d0, d1, #6
290 vst1.8 {d5}, [r0,:64], r12
291 vext.8 d5, d0, d1, #7
292 vst1.8 {d4}, [r0,:64], r12
293 vst1.8 {d5}, [r0,:64], r12
// Vertical-left 8x8 intra prediction (NEON). Even rows come from one
// filtered vector (d0/d1), odd rows from another (d6/d7), each shifted
// by an increasing byte count.
297 function x264_predict_8x8_vl_neon
299 mov r12, #FDEC_STRIDE
301 vld1.8 {d0, d1}, [r1,:128]
// Shifted copies form the filter taps around the loaded edge bytes.
302 vext.8 q1, q1, q0, #15
303 vext.8 q2, q0, q2, #1
// NOTE(review): original lines 298, 300, 304-309 are missing (the
// averaging/lowpass instructions producing q0 and q3 = d6:d7 consumed
// below).
// Interleave vext with the row stores: rows alternate between the two
// filtered vectors, advancing the shift every second row.
310 vext.8 d2, d0, d1, #1
311 vst1.8 {d6}, [r0,:64], r12
312 vext.8 d3, d6, d7, #1
313 vst1.8 {d2}, [r0,:64], r12
314 vext.8 d2, d0, d1, #2
315 vst1.8 {d3}, [r0,:64], r12
316 vext.8 d3, d6, d7, #2
317 vst1.8 {d2}, [r0,:64], r12
318 vext.8 d2, d0, d1, #3
319 vst1.8 {d3}, [r0,:64], r12
320 vext.8 d3, d6, d7, #3
321 vst1.8 {d2}, [r0,:64], r12
322 vext.8 d2, d0, d1, #4
323 vst1.8 {d3}, [r0,:64], r12
324 vst1.8 {d2}, [r0,:64], r12
// Vertical-right 8x8 intra prediction (NEON). Two filtered vectors
// feed alternating rows; successive row pairs shift right by one byte
// (vext counts 7, 6, 5 pulling in bytes from d0/d2).
328 function x264_predict_8x8_vr_neon
330 mov r12, #FDEC_STRIDE
331 vld1.8 {d4,d5}, [r1,:64]
// q1/q0 = the edge bytes shifted by 2 and 1 — taps for the filter.
333 vext.8 q1, q2, q2, #14
334 vext.8 q0, q2, q2, #15
// NOTE(review): original lines 329, 332, 335-341, 343 are missing
// (the rounding averages producing d5 and d1 consumed below, and any
// register shuffling between stores).
342 vst1.8 {d5}, [r0,:64], r12
344 vst1.8 {d1}, [r0,:64], r12
345 vext.8 d6, d0, d5, #7
346 vext.8 d3, d2, d1, #7
347 vst1.8 {d6}, [r0,:64], r12
348 vst1.8 {d3}, [r0,:64], r12
349 vext.8 d6, d0, d5, #6
350 vext.8 d3, d2, d1, #6
351 vst1.8 {d6}, [r0,:64], r12
352 vst1.8 {d3}, [r0,:64], r12
353 vext.8 d6, d0, d5, #5
354 vext.8 d3, d2, d1, #5
355 vst1.8 {d6}, [r0,:64], r12
356 vst1.8 {d3}, [r0,:64], r12
// Horizontal-down 8x8 intra prediction (NEON). Rows are built by
// taking 2-byte steps through two interleaved filtered vectors
// (d0/d1 then d16/d0), stored top to bottom.
360 function x264_predict_8x8_hd_neon
361 mov r12, #FDEC_STRIDE
// q3/q2 = edge bytes shifted by 1 and 2 — taps for the lowpass filter.
365 vext.8 q3, q1, q1, #1
366 vext.8 q2, q1, q1, #2
// NOTE(review): original lines 362-364 and 367-374 are missing (the
// edge loads into q1 and the filter/zip producing d0, d1 and d16
// consumed below).
375 vext.8 d2, d0, d1, #6
376 vext.8 d3, d0, d1, #4
377 vst1.8 {d2}, [r0,:64], r12
378 vext.8 d2, d0, d1, #2
379 vst1.8 {d3}, [r0,:64], r12
380 vst1.8 {d2}, [r0,:64], r12
381 vext.8 d2, d16, d0, #6
382 vst1.8 {d0}, [r0,:64], r12
383 vext.8 d3, d16, d0, #4
384 vst1.8 {d2}, [r0,:64], r12
385 vext.8 d2, d16, d0, #2
386 vst1.8 {d3}, [r0,:64], r12
387 vst1.8 {d2}, [r0,:64], r12
388 vst1.8 {d16}, [r0,:64], r12
// Horizontal-up 8x8 intra prediction (NEON). Builds the output
// sequence from the (reversed) left column, then stores rows as
// 0/2/4/6-byte shifts of q0:q1 followed by their odd-pixel partners.
393 function x264_predict_8x8_hu_neon
394 mov r12, #FDEC_STRIDE
// d4/d2 = the reversed column shifted by 2/1 bytes — filter taps.
400 vext.8 d4, d7, d6, #2
401 vext.8 d2, d7, d6, #1
// NOTE(review): original lines 395-399 and 402-404, 406-410 are
// missing (column load/reverse into d6/d7, the lowpass producing
// d16/q0/q1, and padding of the tail pixels).
// d1 = rounding average — one half of the two-tap output pixels.
405 vrhadd.u8 d1, d16, d2
// Shifted windows of the output sequence, one per row pair.
411 vext.8 q2, q0, q1, #2
412 vext.8 q3, q0, q1, #4
413 vext.8 q8, q0, q1, #6
414 vst1.8 {d0}, [r0,:64], r12
415 vst1.8 {d4}, [r0,:64], r12
416 vst1.8 {d6}, [r0,:64], r12
417 vst1.8 {d16}, [r0,:64], r12
// NOTE(review): original line 418 missing between the two store runs.
419 vst1.8 {d1}, [r0,:64], r12
420 vst1.8 {d5}, [r0,:64], r12
421 vst1.8 {d7}, [r0,:64], r12
422 vst1.8 {d17}, [r0,:64]
// DC-top 8x8 chroma prediction: average the 8 pixels above the block
// (in two 4-pixel halves, per the H.264 chroma DC rule — confirm) and
// fill the block.
426 function x264_predict_8x8c_dc_top_neon
427 sub r2, r0, #FDEC_STRIDE
429 vld1.8 {d0}, [r2,:64]
// Rounding narrow by 2: (sum_of_4 + 2) >> 2 per half.
432 vrshrn.u16 d0, q0, #2
// NOTE(review): original lines 428, 430-431 and 433+ are missing
// (pairwise adds feeding the shift above, the lane duplication, and
// the row stores/branch to a shared store tail).
// DC-left 8x8 chroma prediction: average the 8 left-column pixels
// (two 4-pixel halves) and fill the block.
439 function x264_predict_8x8c_dc_left_neon
// NOTE(review): only the rounding-narrow survives in this chunk —
// original lines 440-444 (column load + pairwise sums) and 446+
// (duplication and row stores) are missing.
445 vrshrn.u16 d0, q0, #2
// Full DC 8x8 chroma prediction: combines top and left sums into the
// four quadrant DC values (top-left/top-right use different divisors,
// hence the two shifts below — confirm against the full file).
451 function x264_predict_8x8c_dc_neon
452 sub r2, r0, #FDEC_STRIDE
454 vld1.8 {d0}, [r2,:64]
// NOTE(review): original lines 453, 455-460, 463-468, 470 are missing
// (left-column gather, pairwise sums, transpose/combine into q0, the
// d0/d1 row-value setup, and the store-loop control).
// (sum8 + 4) >> 3 and (sum4 + 2) >> 2 — the two chroma DC divisors.
461 vrshrn.u16 d2, q0, #3
462 vrshrn.u16 d3, q0, #2
// r2 points 4 rows below r0 so the top and bottom halves store in
// parallel, both advancing by the stride in r1.
469 add r2, r0, r1, lsl #2
471 vst1.8 {d0}, [r0,:64], r1
472 vst1.8 {d1}, [r2,:64], r1
// Horizontal 8x8 chroma prediction: each row is its left-neighbour
// byte replicated 8 times (vld1.8 {d[]} replicates the loaded byte
// into every lane). r1 walks down the left column by the stride ip.
477 function x264_predict_8x8c_h_neon
// NOTE(review): original lines 478-480 missing (setup of r1 and ip);
// only one row pair is visible — the full file presumably repeats
// this 4 times.
481 vld1.8 {d0[]}, [r1], ip
482 vld1.8 {d2[]}, [r1], ip
483 vst1.64 {d0}, [r0,:64], ip
484 vst1.64 {d2}, [r0,:64], ip
// Vertical 8x8 chroma prediction: copy the 8 pixels above the block
// into every row. The load at 492 also advances r0 by ip to the first
// output row.
489 function x264_predict_8x8c_v_neon
490 sub r0, r0, #FDEC_STRIDE
// NOTE(review): original lines 491 and 493 missing (ip setup,
// presumably #FDEC_STRIDE, and loop control); only one store is
// visible — the full file presumably repeats it 8 times.
492 vld1.64 {d0}, [r0,:64], ip
494 vst1.64 {d0}, [r0,:64], ip
// Planar 8x8 chroma prediction: fit a plane a + b*x + c*y through the
// border pixels and clip each predicted row to 0..255 (vqshrun).
// Heavily gapped — only the skeleton survives here.
499 function x264_predict_8x8c_p_neon
500 sub r3, r0, #FDEC_STRIDE
// Gather the top row (d0) and left column (d2/d3 via ldcol.8).
504 vld1.32 {d0[0]}, [r3]
505 vld1.32 {d2[0]}, [r2,:32], r1
506 ldcol.8 d0, r3, r1, 4, hi=1
508 ldcol.8 d3, r3, r1, 4
// NOTE(review): original lines 501-503, 507, 509-513 missing
// (pointer setup, reversal of one border for the H/V gradient sums).
// r3 presumably points at p16weight-style constants here — confirm.
514 vld1.16 {q0}, [r3,:128]
// (scaled gradient + rounding) >> 5 -> the b/c plane coefficients.
521 vrshrn.s32 d4, q2, #5
// NOTE(review): lines 515-520, 522-527, 529-531, 533-534, 536-543,
// 545, 547+ missing (weight multiplies, i00 computation, the per-row
// accumulation loop, and endfunc).
528 vadd.i16 d16, d16, d0
532 vext.16 q0, q0, q0, #7
535 vmul.i16 q0, q0, d4[0]
// Saturating narrow with >>5: clamps the 16-bit plane values to u8.
544 vqshrun.s16 d0, q1, #5
546 vst1.8 {d0}, [r0,:64], r1
// DC-top 16x16 prediction: sum the 16 pixels above the block with the
// add16x8 helper, round-average with >>4, and fill all rows.
553 function x264_predict_16x16_dc_top_neon
554 sub r2, r0, #FDEC_STRIDE
556 vld1.8 {q0}, [r2,:128]
// Horizontal sum of the 16 top bytes into d0 lane 0.
557 add16x8 q0, d0, d1, d0, d1
// (sum + 8) >> 4 = rounded average of 16 pixels.
558 vrshrn.u16 d0, q0, #4
// NOTE(review): original lines 555 and 559+ missing (lane duplication
// across q0 and the 16 row stores / shared store tail).
// DC-left 16x16 prediction: same reduction as dc_top but over the 16
// left-column pixels (the column gather, original lines 564-567, is
// missing from this chunk, as are the duplication and stores after
// line 569).
563 function x264_predict_16x16_dc_left_neon
568 add16x8 q0, d0, d1, d0, d1
569 vrshrn.u16 d0, q0, #4
// Full DC 16x16 prediction: average all 32 neighbours (16 top via
// NEON, 16 left via scalar ldrb walk) and fill the block.
574 function x264_predict_16x16_dc_neon
575 sub r3, r0, #FDEC_STRIDE
577 vld1.64 {d0-d1}, [r3,:128]
// Walk down the left column one byte at a time; each ldrb
// post-increments r0 by a full row. Accumulation instructions between
// the loads (odd original lines 579-595) are missing from this chunk.
578 ldrb ip, [r0], #FDEC_STRIDE
580 ldrb r1, [r0], #FDEC_STRIDE
585 ldrb r2, [r0], #FDEC_STRIDE
587 ldrb r3, [r0], #FDEC_STRIDE
589 ldrb r1, [r0], #FDEC_STRIDE
592 ldrb r2, [r0], #FDEC_STRIDE
594 ldrb r3, [r0], #FDEC_STRIDE
// NOTE(review): only 7 of the 16 column loads are visible.
// Rewind r0 to the top of the block after the column walk.
596 sub r0, r0, #FDEC_STRIDE*16
// NOTE(review): lines 598-606 missing (combining sums, rounding,
// vdup, and setting r1 to the store stride — presumably FDEC_STRIDE).
607 vst1.64 {d0-d1}, [r0,:128], r1
// Horizontal 16x16 prediction: replicate each row's left neighbour
// across 16 bytes (d0 load replicates the byte; d1 presumably copied
// from d0 in the missing odd lines) and store a full row at a time.
612 function x264_predict_16x16_h_neon
// NOTE(review): original lines 613-615, 617, 619 missing (r1/ip setup
// and the vmov duplications into d1/d3); only one row pair visible —
// the full file presumably repeats this 8 times.
616 vld1.8 {d0[]}, [r1], ip
618 vld1.8 {d2[]}, [r1], ip
620 vst1.64 {d0-d1}, [r0,:128], ip
621 vst1.64 {d2-d3}, [r0,:128], ip
// Vertical 16x16 prediction: copy the 16 pixels above the block into
// every row. The load also advances r0 past the neighbour row.
626 function x264_predict_16x16_v_neon
627 sub r0, r0, #FDEC_STRIDE
// NOTE(review): original lines 628 and 630 missing (ip setup,
// presumably #FDEC_STRIDE, and loop control); one store visible —
// presumably repeated 16 times in the full file.
629 vld1.64 {d0-d1}, [r0,:128], ip
631 vst1.64 {d0-d1}, [r0,:128], ip
// Planar 16x16 prediction: plane fit a + b*x + c*y over the border
// pixels, clipped per pixel via vqshrun. Heavily gapped and the
// definition runs past the end of this chunk (the row loop tail and
// endfunc are beyond line 687).
636 function x264_predict_16x16_p_neon
637 sub r3, r0, #FDEC_STRIDE
// Load border pixels for the gradient sums.
642 vld1.8 {d2}, [r2,:64], r1
651 vld1.8 {q0}, [r3,:128]
// NOTE(review): original lines 638-641, 643-650, 652-659, 661-666,
// 668-670, 672-673, 675-682, 684, 686 are missing (pointer setup,
// reversed-border loads, weight multiplies with presumably p16weight,
// and the per-row accumulation).
// (scaled gradient + rounding) >> 6 -> plane coefficients (note: >>6
// here vs >>5 in the 8x8c variant — the 16x16 scaling differs).
660 vrshrn.s32 d4, q2, #6
667 vadd.i16 d16, d16, d0
671 vext.16 q0, q0, q0, #7
674 vmul.i16 q0, q0, d4[0]
// Saturate the two 8-pixel halves of the row to u8 and store.
683 vqshrun.s16 d0, q1, #5
685 vqshrun.s16 d1, q1, #5
687 vst1.8 {q0}, [r0,:128], r1