/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
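
@ All pixel functions below share one calling convention:
@   r0 = dst (block),  r1 = src (pixels),
@   r2 = line size in bytes (stride for both src and dst),
@   r3 = h, the number of rows to process.
@
@ pixels16: straight 16x<h> copy (put) or average with the existing
@ destination contents (avg), four rows per loop iteration.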
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm
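
@ pixels16_x2: horizontal half-pel interpolation; each output byte is
@ the average of a source byte and its right-hand neighbour.  The
@ `avg` helper expands to vrhadd.u8 (rounding) or vhadd.u8 (no_rnd).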
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm
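
@ pixels16_y2: vertical half-pel interpolation; each output row is the
@ average of two consecutive source rows.  One new source row is loaded
@ per output row while the previous row stays in registers, and a short
@ epilogue produces the final two rows.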
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm
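
@ pixels16_xy2: 2D half-pel interpolation, (a + b + c + d + 2) >> 2 over
@ the four neighbouring pixels (the no_rnd variant adds 1 instead of 2).
@ Horizontal byte sums are widened to 16 bits with vaddl.u8 and reused
@ across row pairs; `shrn` narrows back with the requested rounding, and
@ the NRND-prefixed instructions are emitted only in the no_rnd variant.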
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm
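
@ pixels8: 8-pixel-wide version of pixels16; d registers replace q
@ registers, four rows copied (or averaged) per iteration.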
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        vld1.8          {d3},     [r1], r2
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm
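
@ pixels8_x2: 8-wide horizontal half-pel.  Two rows are packed into a
@ single q register (after a vswp) so one `avg` handles both rows.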
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm
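
@ pixels8_y2: 8-wide vertical half-pel with the same row-reuse scheme
@ as pixels16_y2; an epilogue after the loop emits the last two rows.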
.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm
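
@ pixels8_xy2: 8-wide 2D half-pel, (a + b + c + d + 2) >> 2 (or +1 for
@ no_rnd), reusing the widened horizontal sums of the previous row.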
.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm
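
@ pixfunc: emits one exported function around a pixel macro.  It first
@ defines the avg/shrn/NRND helpers according to \rnd (rounding or
@ truncating forms; NRND swallows its argument in the rounding case),
@ expands the body, then purges the helpers so the next instantiation
@ can redefine them.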
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm
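
@ pixfunc2: emits both the rounding and the _no_rnd variant of a
@ pixel function.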
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
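
@ The H.264 qpel16/qpel8 mc00 cases are plain copies, so each stub
@ below just sets r3 to the block height and falls through into the
@ put/avg pixels function emitted immediately after it.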
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1
function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0
function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1
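
@ For reference, the exported symbols are wired up from C at init time.
@ A minimal sketch, assuming the dsputil-era init code and tab layout
@ (table indices here are illustrative, not taken from this file):
@
@     c->put_pixels_tab[0][0]        = ff_put_pixels16_neon;
@     c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_neon;
@     c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
@     c->avg_pixels_tab[0][0]        = ff_avg_pixels16_neon;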