1 /*****************************************************************************
2 * predict.S: arm intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2011 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Table of eight 16-bit constants 1..8.
// NOTE(review): presumably the weights used by the plane (_p) predictors;
// the code that loads this label is not visible in this excerpt - confirm.
34 p16weight: .short 1,2,3,4,5,6,7,8
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gathers bytes into individual lanes of the D register \rd, one byte per
// load, reading from [\rs] and post-incrementing \rs by \rt after each load.
//   n == 8          : fills all eight lanes 0-7
//   n != 8, hi == 0 : fills lanes 0-3 only
//   n != 8, hi == 1 : fills lanes 4-7 only
// Side effect: \rs is advanced by \rt once per byte loaded.
// NOTE(review): the closing .endif/.endm lines are not visible in this excerpt.
38 .macro ldcol.8 rd, rs, rt, n=8, hi=0
// First half: lanes 0-3.
39 .if \n == 8 || \hi == 0
40 vld1.8 {\rd[0]}, [\rs], \rt
41 vld1.8 {\rd[1]}, [\rs], \rt
42 vld1.8 {\rd[2]}, [\rs], \rt
43 vld1.8 {\rd[3]}, [\rs], \rt
// Second half: lanes 4-7.
45 .if \n == 8 || \hi == 1
46 vld1.8 {\rd[4]}, [\rs], \rt
47 vld1.8 {\rd[5]}, [\rs], \rt
48 vld1.8 {\rd[6]}, [\rs], \rt
49 vld1.8 {\rd[7]}, [\rs], \rt
// add16x8 dq, dl, dh, rl, rh
// Horizontal sum of the sixteen unsigned bytes held in \rl and \rh.
// The visible call sites pass \dl and \dh as the low and high halves of \dq
// (q0, d0, d1); the macro relies on that aliasing.
// Result: every 16-bit lane of \dl holds the total of all sixteen bytes.
// NOTE(review): the closing .endm is not visible in this excerpt.
53 .macro add16x8 dq, dl, dh, rl, rh
// Widen to 16 bits while adding: eight partial sums rl[i] + rh[i].
54 vaddl.u8 \dq, \rl, \rh
// Fold eight partial sums down to four.
55 vadd.u16 \dl, \dl, \dh
// Two pairwise adds fold four sums to one, replicated in every lane.
56 vpadd.u16 \dl, \dl, \dl
57 vpadd.u16 \dl, \dl, \dl
61 // because gcc doesn't believe in using the free shift in add
// 4x4 horizontal prediction using core ARM instructions only.
// In:  r0 = base address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// For each of the four rows, the byte immediately left of the row
// (offset row*FDEC_STRIDE-1) is replicated into all four bytes of that row.
// Uses r1, r2, r3 and ip as scratch.
// NOTE(review): the return instruction is not visible in this excerpt.
62 function x264_predict_4x4_h_armv6
// Fetch the left neighbour of each row.
63 ldrb r1, [r0, #0*FDEC_STRIDE-1]
64 ldrb r2, [r0, #1*FDEC_STRIDE-1]
65 ldrb r3, [r0, #2*FDEC_STRIDE-1]
66 ldrb ip, [r0, #3*FDEC_STRIDE-1]
// Duplicate the byte into the low halfword: x -> x | x<<8.
67 add r1, r1, r1, lsl #8
68 add r2, r2, r2, lsl #8
69 add r3, r3, r3, lsl #8
70 add ip, ip, ip, lsl #8
// Duplicate the halfword into the full word and store one row at a time.
71 add r1, r1, r1, lsl #16
72 str r1, [r0, #0*FDEC_STRIDE]
73 add r2, r2, r2, lsl #16
74 str r2, [r0, #1*FDEC_STRIDE]
75 add r3, r3, r3, lsl #16
76 str r3, [r0, #2*FDEC_STRIDE]
77 add ip, ip, ip, lsl #16
78 str ip, [r0, #3*FDEC_STRIDE]
// 4x4 DC prediction using core ARM instructions.
// In:  r0 = base address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// Reads the four bytes of the row above and the left neighbour of each of
// the four rows, then fills all four rows with a single replicated byte.
// NOTE(review): the embedded listing numbers skip between the loads, so the
// instructions that combine the neighbours into the value left in r1 are
// not visible in this excerpt; the return is not visible either.
82 function x264_predict_4x4_dc_armv6
// Four bytes of the row above, loaded as one word.
84 ldr r1, [r0, #-FDEC_STRIDE]
// Left neighbours of rows 0 and 1.
85 ldrb r2, [r0, #0*FDEC_STRIDE-1]
86 ldrb r3, [r0, #1*FDEC_STRIDE-1]
// Left neighbours of rows 2 and 3. r3 is reloaded here, so its earlier value
// is presumably consumed by a line not shown.
89 ldrb ip, [r0, #2*FDEC_STRIDE-1]
91 ldrb r3, [r0, #3*FDEC_STRIDE-1]
// Replicate r1 into every byte of the word. This only yields four equal
// bytes if r1 is at most 255 at this point - not verifiable from here.
96 add r1, r1, r1, lsl #8
97 add r1, r1, r1, lsl #16
// Write the same word to all four rows.
98 str r1, [r0, #0*FDEC_STRIDE]
99 str r1, [r0, #1*FDEC_STRIDE]
100 str r1, [r0, #2*FDEC_STRIDE]
101 str r1, [r0, #3*FDEC_STRIDE]
105 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
// Takes two operand triples (a1,b1,c1) and (a2,b2,c2) plus a seventh
// operand pb_1; per the comment above, the results are left in a1 and a2.
// NOTE(review): the macro body and its .endm are not visible in this
// excerpt. The name pb_1 suggests a register holding 0x01 in every byte -
// confirm against the body.
106 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// 4x4 diagonal-down-right prediction using core ARM instructions.
// In:  r0 = base address of the 4x4 block; rows are FDEC_STRIDE bytes apart.
// Reads the four bytes above, the top-left corner byte and the four left
// neighbours, filters them with PRED4x4_LOWPASS and writes four rows.
// NOTE(review): r4-r6 are used here, but their save/restore, the setup of
// ip (passed as pb_1) and several shifts between the first and second store
// are not visible in this excerpt (the embedded numbers skip).
119 function x264_predict_4x4_ddr_armv6
// r1 = four bytes of the row above, r2 = top-left byte, r3 = left of row 0.
120 ldr r1, [r0, # -FDEC_STRIDE]
121 ldrb r2, [r0, # -FDEC_STRIDE-1]
122 ldrb r3, [r0, #0*FDEC_STRIDE-1]
// Build a chain of words, each being the previous one shifted up by one
// byte with the next neighbour inserted in the low byte.
124 add r2, r2, r1, lsl #8
125 ldrb r4, [r0, #1*FDEC_STRIDE-1]
126 add r3, r3, r2, lsl #8
127 ldrb r5, [r0, #2*FDEC_STRIDE-1]
128 ldrb r6, [r0, #3*FDEC_STRIDE-1]
129 add r4, r4, r3, lsl #8
130 add r5, r5, r4, lsl #8
131 add r6, r6, r5, lsl #8
// Filter both triples (r1,r2,r3) and (r4,r5,r6).
133 PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
// Row 0 is the first filtered word as is.
134 str r1, [r0, #0*FDEC_STRIDE]
// Rows 1-3: add the top 1, 2 or 3 bytes of r4 into r2, r3 and r5, then store.
139 add r2, r2, r4, lsr #24
140 str r2, [r0, #1*FDEC_STRIDE]
141 add r3, r3, r4, lsr #16
142 str r3, [r0, #2*FDEC_STRIDE]
143 add r5, r5, r4, lsr #8
144 str r5, [r0, #3*FDEC_STRIDE]
// 4x4 diagonal-down-left prediction using NEON.
// Loads eight bytes through r0, derives shifted copies and stores four
// 32-bit rows, each row being the previous one shifted by one byte.
// NOTE(review): the setup of r0 and ip, the initialisation of d3 used by
// the second vext, the filtering between the vext pair and the first
// store, and the return are not visible in this excerpt.
148 function x264_predict_4x4_ddl_neon
// Load eight source bytes and advance r0 by ip.
151 vld1.64 {d0}, [r0], ip
// d1 = d0 rotated down by one byte; d2 = d0 shifted down by two bytes with
// the top two bytes taken from d3.
153 vext.8 d1, d0, d0, #1
154 vext.8 d2, d0, d3, #2
// Row 0 = low four bytes of d0.
157 vst1.32 {d0[0]}, [r0,:32], ip
// Rows 1-3 = d0 shifted down by one, two and three bytes respectively.
158 vext.8 d1, d0, d0, #1
159 vext.8 d2, d0, d0, #2
160 vst1.32 {d1[0]}, [r0,:32], ip
161 vext.8 d3, d0, d0, #3
162 vst1.32 {d2[0]}, [r0,:32], ip
163 vst1.32 {d3[0]}, [r0,:32], ip
// 8x8 DC prediction.
// NOTE(review): only three instructions of this function are visible in
// this excerpt; loads, the final averaging, remaining stores and the return
// are not shown, so the comments below describe the visible lines only.
167 function x264_predict_8x8_dc_neon
// Accumulate into r2 and r3 the sum of absolute byte differences between
// r4/r5 and ip. If ip is zero this is a plain sum of the four bytes -
// presumably the intent; confirm against the full source.
176 usada8 r2, r4, ip, r2
178 usada8 r3, r5, ip, r3
// Store one eight-byte row and advance r0 by ip.
186 vst1.64 {d0}, [r0,:64], ip
// 8x8 horizontal prediction using NEON.
// Writes d0..d7 to eight consecutive rows, one eight-byte row per register,
// advancing r0 by ip after every store. The :64 qualifier asserts that each
// row address is 8-byte aligned.
// NOTE(review): the instructions that fill d0-d7 and set up r0/ip, and the
// return, are not visible in this excerpt.
191 function x264_predict_8x8_h_neon
197 vst1.64 {d0}, [r0,:64], ip
199 vst1.64 {d1}, [r0,:64], ip
201 vst1.64 {d2}, [r0,:64], ip
203 vst1.64 {d3}, [r0,:64], ip
205 vst1.64 {d4}, [r0,:64], ip
207 vst1.64 {d5}, [r0,:64], ip
209 vst1.64 {d6}, [r0,:64], ip
210 vst1.64 {d7}, [r0,:64], ip
// 8x8 chroma DC prediction from the top neighbours only.
// In:  r0 = base address of the block; the row above is at r0 - FDEC_STRIDE.
// NOTE(review): the summation between the load and the narrowing shift, the
// stores and the return are not visible in this excerpt.
215 function x264_predict_8x8c_dc_top_neon
// r2 = address of the row above the block.
216 sub r2, r0, #FDEC_STRIDE
// Load the eight top bytes (address asserted 8-byte aligned).
218 vld1.8 {d0}, [r2,:64]
// Rounding shift right by 2 and narrow to bytes: (x + 2) >> 2 per lane.
221 vrshrn.u16 d0, q0, #2
// 8x8 chroma DC prediction from the left neighbours only.
// NOTE(review): only the final narrowing step is visible in this excerpt;
// loads, summation, stores and the return are not shown.
228 function x264_predict_8x8c_dc_left_neon
// Rounding shift right by 2 and narrow to bytes: (x + 2) >> 2 per lane.
234 vrshrn.u16 d0, q0, #2
// 8x8 chroma DC prediction.
// In:  r0 = base address of the block; the row above is at r0 - FDEC_STRIDE.
// NOTE(review): the left-column loads, the summation, the setup of r1, the
// assembly of d0/d1 from d2/d3 and the return are not visible in this
// excerpt.
240 function x264_predict_8x8c_dc_neon
// r2 = address of the row above the block; load its eight bytes.
241 sub r2, r0, #FDEC_STRIDE
243 vld1.8 {d0}, [r2,:64]
// Two rounded, narrowed versions of the sums in q0: divided by 8 into d2
// and divided by 4 into d3.
250 vrshrn.u16 d2, q0, #3
251 vrshrn.u16 d3, q0, #2
// r2 = r0 + 4*r1. If r1 holds the row stride this is the address of row 4 -
// presumably so; confirm against the full source.
258 add r2, r0, r1, lsl #2
// Store d0 through r0 and d1 through r2, advancing each pointer by r1.
260 vst1.8 {d0}, [r0,:64], r1
261 vst1.8 {d1}, [r2,:64], r1
// 8x8 chroma horizontal prediction using NEON.
// Each visible load reads one byte through r1 and broadcasts it to all
// eight lanes of a D register; each store writes one eight-byte row via r0.
// Both pointers advance by ip per access.
// NOTE(review): the setup of r1 and ip, any repetition covering the other
// rows, and the return are not visible in this excerpt.
266 function x264_predict_8x8c_h_neon
// Broadcast one source byte each into d0 and d2.
270 vld1.8 {d0[]}, [r1], ip
271 vld1.8 {d2[]}, [r1], ip
// Write the two filled rows.
272 vst1.64 {d0}, [r0,:64], ip
273 vst1.64 {d2}, [r0,:64], ip
// 8x8 chroma vertical prediction using NEON.
// In:  r0 = base address of the block.
// Copies the eight bytes of the row above the block into the block rows.
// NOTE(review): the setup of ip, any repetition of the store for the
// remaining rows, and the return are not visible in this excerpt.
278 function x264_predict_8x8c_v_neon
// Step back to the row above the block.
279 sub r0, r0, #FDEC_STRIDE
// Load that row; the post-increment by ip moves r0 on from it.
281 vld1.64 {d0}, [r0,:64], ip
// Store the row and advance r0 by ip again.
283 vst1.64 {d0}, [r0,:64], ip
// 8x8 chroma plane prediction using NEON.
// In:  r0 = base address of the block; the row above is at r0 - FDEC_STRIDE.
// NOTE(review): most of this function is not visible in this excerpt (the
// embedded listing numbers skip heavily). The setup of r1 and r2, the
// reassignment of r3 before the 128-bit load, the gradient arithmetic, the
// row loop and the return are all missing. The comments below describe
// only what each visible instruction does.
288 function x264_predict_8x8c_p_neon
// r3 = address of the row above the block.
289 sub r3, r0, #FDEC_STRIDE
// Load four bytes through r3 into the low word of d0, and four bytes
// through r2 into the low word of d2 (r2 then advances by r1).
293 vld1.32 {d0[0]}, [r3]
294 vld1.32 {d2[0]}, [r2,:32], r1
// Gather four bytes into lanes 4-7 of d0, stepping r3 by r1 per byte.
295 ldcol.8 d0, r3, r1, 4, hi=1
// Gather four more bytes into lanes 0-3 of d3.
297 ldcol.8 d3, r3, r1, 4
// Load eight 16-bit values from a 16-byte aligned address in r3.
303 vld1.16 {q0}, [r3,:128]
// Rounding shift right by 5, narrowing 32-bit lanes to 16 bits.
310 vrshrn.s32 d4, q2, #5
317 vadd.i16 d16, d16, d0
// Rotate the eight 16-bit lanes of q0 by seven positions.
321 vext.16 q0, q0, q0, #7
// Scale every lane of q0 by the scalar in d4[0].
324 vmul.i16 q0, q0, d4[0]
// Shift right by 5 and narrow to unsigned bytes with saturation.
333 vqshrun.s16 d0, q1, #5
// Store one eight-byte row and advance r0 by r1.
335 vst1.8 {d0}, [r0,:64], r1
// 16x16 DC prediction from the top neighbours only.
// In:  r0 = base address of the block; the row above is at r0 - FDEC_STRIDE.
// Computes the rounded mean of the sixteen bytes above the block.
// NOTE(review): the broadcast of the result, the stores and the return are
// not visible in this excerpt.
342 function x264_predict_16x16_dc_top_neon
// r2 = address of the row above the block.
343 sub r2, r0, #FDEC_STRIDE
// Load the sixteen top bytes (address asserted 16-byte aligned).
345 vld1.8 {q0}, [r2,:128]
// Sum all sixteen bytes; the total ends up in every 16-bit lane of d0.
346 add16x8 q0, d0, d1, d0, d1
// (sum + 8) >> 4, narrowed to bytes. Lanes 0-3 of d0 come from the summed
// lanes and so hold the mean.
347 vrshrn.u16 d0, q0, #4
// 16x16 DC prediction from the left neighbours only.
// NOTE(review): the loads that place the sixteen source bytes in d0/d1, the
// broadcast, the stores and the return are not visible in this excerpt.
352 function x264_predict_16x16_dc_left_neon
// Sum the sixteen bytes in d0/d1; the total ends up in every lane of d0.
357 add16x8 q0, d0, d1, d0, d1
// (sum + 8) >> 4, narrowed to bytes: rounded mean of sixteen values.
358 vrshrn.u16 d0, q0, #4
// 16x16 DC prediction.
// In:  r0 = base address of the block; the row above is at r0 - FDEC_STRIDE.
// Loads the sixteen top bytes with NEON and walks down the rows with scalar
// byte loads, then rewinds r0 and stores sixteen-byte rows.
// NOTE(review): any adjustment of r0 before the byte loads, the additions
// that accumulate the loaded bytes, the remaining byte loads, the final
// averaging, the setup of r1 and the return are not visible in this excerpt.
363 function x264_predict_16x16_dc_neon
// r3 = address of the row above; load its sixteen bytes.
364 sub r3, r0, #FDEC_STRIDE
366 vld1.64 {d0-d1}, [r3,:128]
// Each ldrb reads one byte at r0 and then steps r0 down one row. The
// scratch registers are reused, so their values are presumably summed by
// lines not shown.
367 ldrb ip, [r0], #FDEC_STRIDE
369 ldrb r1, [r0], #FDEC_STRIDE
374 ldrb r2, [r0], #FDEC_STRIDE
376 ldrb r3, [r0], #FDEC_STRIDE
378 ldrb r1, [r0], #FDEC_STRIDE
381 ldrb r2, [r0], #FDEC_STRIDE
383 ldrb r3, [r0], #FDEC_STRIDE
// Rewind r0 by sixteen rows.
386 sub r0, r0, #FDEC_STRIDE*16
// Store one sixteen-byte row and advance r0 by r1.
396 vst1.64 {d0-d1}, [r0,:128], r1
// 16x16 horizontal prediction using NEON.
// Each visible load reads one byte through r1 and broadcasts it to all
// eight lanes of a D register; each store writes a sixteen-byte row from a
// D-register pair via r0. Both pointers advance by ip per access.
// NOTE(review): the setup of r1 and ip, the instructions that fill d1 and
// d3, any repetition covering the other rows, and the return are not
// visible in this excerpt.
401 function x264_predict_16x16_h_neon
// Broadcast one source byte each into d0 and d2.
405 vld1.8 {d0[]}, [r1], ip
407 vld1.8 {d2[]}, [r1], ip
// Write two sixteen-byte rows.
409 vst1.64 {d0-d1}, [r0,:128], ip
410 vst1.64 {d2-d3}, [r0,:128], ip
// 16x16 vertical prediction using NEON.
// In:  r0 = base address of the block.
// Copies the sixteen bytes of the row above the block into the block rows.
// NOTE(review): the setup of ip, any repetition of the store for the
// remaining rows, and the return are not visible in this excerpt.
415 function x264_predict_16x16_v_neon
// Step back to the row above the block.
416 sub r0, r0, #FDEC_STRIDE
// Load that row; the post-increment by ip moves r0 on from it.
418 vld1.64 {d0-d1}, [r0,:128], ip
// Store the row and advance r0 by ip again.
420 vst1.64 {d0-d1}, [r0,:128], ip
// 16x16 plane prediction using NEON.
// In:  r0 = base address of the block; the row above is at r0 - FDEC_STRIDE.
// NOTE(review): most of this function is not visible in this excerpt (the
// embedded listing numbers skip heavily). The setup of r1 and r2, the
// reassignment of r3 before the 128-bit load, the gradient arithmetic, the
// updates of q1 between the two narrowing shifts, the row loop and the
// return are all missing. The comments below describe only what each
// visible instruction does.
425 function x264_predict_16x16_p_neon
// r3 = address of the row above the block.
426 sub r3, r0, #FDEC_STRIDE
// Load eight bytes through r2 and advance r2 by r1.
431 vld1.8 {d2}, [r2,:64], r1
// Load sixteen bytes from a 16-byte aligned address in r3.
440 vld1.8 {q0}, [r3,:128]
// Rounding shift right by 6, narrowing 32-bit lanes to 16 bits.
449 vrshrn.s32 d4, q2, #6
456 vadd.i16 d16, d16, d0
// Rotate the eight 16-bit lanes of q0 by seven positions.
460 vext.16 q0, q0, q0, #7
// Scale every lane of q0 by the scalar in d4[0].
463 vmul.i16 q0, q0, d4[0]
// Shift right by 5 and narrow to unsigned bytes with saturation: d0 gets
// the low eight bytes of the row, d1 the high eight.
472 vqshrun.s16 d0, q1, #5
474 vqshrun.s16 d1, q1, #5
// Store one sixteen-byte row and advance r0 by r1.
476 vst1.8 {q0}, [r0,:128], r1