2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
24 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/*
 * Expands to the exported function ff_<type>_<codec>_chroma_mc8_neon.
 * Going by the prototype comment above and AAPCS64 argument order:
 *   x0 = dst, x1 = src, x2 = stride (used as the post-index step of every
 *   row load and store below).
 * The weight registers v0-v3, the bias in v22, the pointer in x8, the row
 * counter and the branches between the labelled paths are set up on lines
 * that are not visible in this view - TODO confirm against the full file.
 * Every path below produces two 8-pixel output rows per pass.
 */
25 .macro h264_chroma_mc8 type, codec=h264
26 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
/* Prefetch hint for the source row one stride ahead. */
32 prfm pldl1strm, [x1, x2]
/*
 * Four-tap path. Each row is loaded as 16 bytes (two D registers); ext #1
 * then forms the same row shifted left by one pixel, so v4/v5 hold row n
 * and row n at x+1, v6/v7 hold row n+1 and row n+1 at x+1.
 */
59 ld1 {v4.8B, v5.8B}, [x1], x2
62 ext v5.8B, v4.8B, v5.8B, #1
63 1: ld1 {v6.8B, v7.8B}, [x1], x2
/* v16 = v4*v0 + v5*v1 + v6*v2 + v7*v3 (first output row, 16-bit sums). */
64 umull v16.8H, v4.8B, v0.8B
65 umlal v16.8H, v5.8B, v1.8B
66 ext v7.8B, v6.8B, v7.8B, #1
/* Load the following row early; it is the top row of the next pass too. */
67 ld1 {v4.8B, v5.8B}, [x1], x2
68 umlal v16.8H, v6.8B, v2.8B
70 ext v5.8B, v4.8B, v5.8B, #1
71 umlal v16.8H, v7.8B, v3.8B
/* v17 = v6*v0 + v7*v1 + v4*v2 + v5*v3 (second output row). */
72 umull v17.8H, v6.8B, v0.8B
74 umlal v17.8H, v7.8B, v1.8B
75 umlal v17.8H, v4.8B, v2.8B
76 umlal v17.8H, v5.8B, v3.8B
77 prfm pldl1strm, [x1, x2]
/*
 * NOTE(review): two narrowing variants appear back to back here: a rounding
 * shift (rshrn #6), and a bias add (v22) followed by a truncating shift
 * (shrn #6). Presumably only one of them is assembled per codec through
 * conditional directives that are not visible in this view - confirm.
 */
79 rshrn v16.8B, v16.8H, #6
80 rshrn v17.8B, v17.8H, #6
82 add v16.8H, v16.8H, v22.8H
83 add v17.8H, v17.8H, v22.8H
84 shrn v16.8B, v16.8H, #6
85 shrn v17.8B, v17.8H, #6
/*
 * Rounding-average the result with two rows read through x8.
 * NOTE(review): presumably assembled only for type=avg, with x8 walking the
 * destination - neither the guard nor the x8 setup is visible here.
 */
88 ld1 {v20.8B}, [x8], x2
89 ld1 {v21.8B}, [x8], x2
90 urhadd v16.8B, v16.8B, v20.8B
91 urhadd v17.8B, v17.8B, v21.8B
/* Write the two finished rows, advancing dst by stride each time. */
93 st1 {v16.8B}, [x0], x2
94 st1 {v17.8B}, [x0], x2
/*
 * Two-tap path over consecutive rows (no shifted copy is built): each
 * output row is row[n]*v0 + row[n+1]*v1. The condition selecting this path
 * is not visible in this view.
 */
104 ld1 {v4.8B}, [x1], x2
105 3: ld1 {v6.8B}, [x1], x2
106 umull v16.8H, v4.8B, v0.8B
107 umlal v16.8H, v6.8B, v1.8B
108 ld1 {v4.8B}, [x1], x2
109 umull v17.8H, v6.8B, v0.8B
110 umlal v17.8H, v4.8B, v1.8B
/* NOTE(review): same pair of narrowing variants as in the four-tap path. */
113 rshrn v16.8B, v16.8H, #6
114 rshrn v17.8B, v17.8H, #6
116 add v16.8H, v16.8H, v22.8H
117 add v17.8H, v17.8H, v22.8H
118 shrn v16.8B, v16.8H, #6
119 shrn v17.8B, v17.8H, #6
121 prfm pldl1strm, [x1, x2]
/* Average with the rows read through x8 (see the note in the first path). */
123 ld1 {v20.8B}, [x8], x2
124 ld1 {v21.8B}, [x8], x2
125 urhadd v16.8B, v16.8B, v20.8B
126 urhadd v17.8B, v17.8B, v21.8B
129 st1 {v16.8B}, [x0], x2
130 st1 {v17.8B}, [x0], x2
/*
 * Two-tap path within a row: each output row is row*v0 + (row shifted by
 * one pixel)*v1. Two independent rows are handled per pass. The condition
 * selecting this path is not visible in this view.
 */
134 4: ld1 {v4.8B, v5.8B}, [x1], x2
135 ld1 {v6.8B, v7.8B}, [x1], x2
136 ext v5.8B, v4.8B, v5.8B, #1
137 ext v7.8B, v6.8B, v7.8B, #1
140 umull v16.8H, v4.8B, v0.8B
141 umlal v16.8H, v5.8B, v1.8B
142 umull v17.8H, v6.8B, v0.8B
143 umlal v17.8H, v7.8B, v1.8B
144 prfm pldl1strm, [x1, x2]
/* NOTE(review): same pair of narrowing variants as in the four-tap path. */
146 rshrn v16.8B, v16.8H, #6
147 rshrn v17.8B, v17.8H, #6
149 add v16.8H, v16.8H, v22.8H
150 add v17.8H, v17.8H, v22.8H
151 shrn v16.8B, v16.8H, #6
152 shrn v17.8B, v17.8H, #6
/* Average with the rows read through x8 (see the note in the first path). */
155 ld1 {v20.8B}, [x8], x2
156 ld1 {v21.8B}, [x8], x2
157 urhadd v16.8B, v16.8B, v20.8B
158 urhadd v17.8B, v17.8B, v21.8B
160 st1 {v16.8B}, [x0], x2
161 st1 {v17.8B}, [x0], x2
167 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/*
 * Expands to the exported function ff_<type>_<codec>_chroma_mc4_neon.
 * Going by the prototype comment above and AAPCS64 argument order:
 *   x0 = dst, x1 = src, x2 = stride (post-index step of every row access).
 * Rows are 4 pixels wide, so two values are packed per 64-bit register:
 * one in each 32-bit lane. Each pass writes two 4-pixel output rows.
 * The setup of v24-v27, v30/v31 (as weights), v22 and x8, the row counter
 * and the branches between the labelled paths are not visible in this
 * view - TODO confirm against the full file.
 */
168 .macro h264_chroma_mc4 type, codec=h264
169 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
/* Prefetch hint for the source row one stride ahead. */
175 prfm pldl1strm, [x1, x2]
/*
 * Four-tap path. ext #1 yields the row shifted by one pixel; trn1 .2S then
 * packs {row, shifted row} into the two 32-bit lanes of v4. The weight
 * vectors are packed the same way: v0 = {v24.s[0], v25.s[0]} and
 * v2 = {v26.s[0], v27.s[0]}.
 */
202 ld1 {v4.8B}, [x1], x2
205 ext v5.8B, v4.8B, v5.8B, #1
206 trn1 v0.2S, v24.2S, v25.2S
207 trn1 v2.2S, v26.2S, v27.2S
208 trn1 v4.2S, v4.2S, v5.2S
209 1: ld1 {v6.8B}, [x1], x2
210 ext v7.8B, v6.8B, v7.8B, #1
211 trn1 v6.2S, v6.2S, v7.2S
/* v18 = v4*v0 + v6*v2: partial sums of the first output row. */
212 umull v18.8H, v4.8B, v0.8B
213 umlal v18.8H, v6.8B, v2.8B
/* Fetch and pack the following row; it is reused by the next pass. */
214 ld1 {v4.8B}, [x1], x2
215 ext v5.8B, v4.8B, v5.8B, #1
216 trn1 v4.2S, v4.2S, v5.2S
/* v19 = v6*v0 + v4*v2: partial sums of the second output row. */
218 umull v19.8H, v6.8B, v0.8B
219 umlal v19.8H, v4.8B, v2.8B
/*
 * Gather the low 64-bit halves of v18/v19 in v30 and the high halves in
 * v31, then add: v18 ends up holding the 4 sums of the first row in its
 * low half and the 4 sums of the second row in its high half.
 */
220 trn1 v30.2D, v18.2D, v19.2D
221 trn2 v31.2D, v18.2D, v19.2D
222 add v18.8H, v30.8H, v31.8H
/*
 * NOTE(review): two narrowing variants appear back to back here: a rounding
 * shift (rshrn #6), and a bias add (v22) followed by a truncating shift
 * (shrn #6). Presumably only one of them is assembled per codec through
 * conditional directives that are not visible in this view - confirm.
 */
224 rshrn v16.8B, v18.8H, #6
226 add v18.8H, v18.8H, v22.8H
227 shrn v16.8B, v18.8H, #6
230 prfm pldl1strm, [x1, x2]
/*
 * Rounding-average with two 4-byte rows read through x8.
 * NOTE(review): presumably assembled only for type=avg, with x8 walking the
 * destination - neither the guard nor the x8 setup is visible here.
 */
232 ld1 {v20.S}[0], [x8], x2
233 ld1 {v20.S}[1], [x8], x2
234 urhadd v16.8B, v16.8B, v20.8B
/* Lane 0 is the first output row, lane 1 the second. */
236 st1 {v16.S}[0], [x0], x2
237 st1 {v16.S}[1], [x0], x2
/*
 * Weight packing for the two-tap paths: v0 = {v30.s[0], v31.s[0]},
 * v1 = {v30.s[1], v31.s[1]}; the ext #4 then rewrites v1 as
 * {v0.s[1], v1.s[0]}. Note that v30/v31 are overwritten as scratch by the
 * loops below. The setup of v30/v31 is not visible in this view.
 */
245 trn1 v0.2S, v30.2S, v31.2S
246 trn2 v1.2S, v30.2S, v31.2S
249 ext v1.8B, v0.8B, v1.8B, #4
/*
 * Two-tap path over consecutive rows: v4 carries two rows, one per 32-bit
 * lane. Lane 1 is loaded first inside the loop, then lane 0 is replaced by
 * the next row, so v1 is applied to the {newer, older} lane order.
 */
250 ld1 {v4.S}[0], [x1], x2
251 3: ld1 {v4.S}[1], [x1], x2
252 umull v18.8H, v4.8B, v0.8B
253 ld1 {v4.S}[0], [x1], x2
254 umull v19.8H, v4.8B, v1.8B
/* Sum the 64-bit halves so each half of v18 holds one output row. */
255 trn1 v30.2D, v18.2D, v19.2D
256 trn2 v31.2D, v18.2D, v19.2D
257 add v18.8H, v30.8H, v31.8H
/* NOTE(review): same pair of narrowing variants as in the four-tap path. */
260 rshrn v16.8B, v18.8H, #6
262 add v18.8H, v18.8H, v22.8H
263 shrn v16.8B, v18.8H, #6
/* Average with the rows read through x8 (see the note in the first path). */
266 ld1 {v20.S}[0], [x8], x2
267 ld1 {v20.S}[1], [x8], x2
268 urhadd v16.8B, v16.8B, v20.8B
271 prfm pldl1strm, [x1, x2]
272 st1 {v16.S}[0], [x0], x2
273 st1 {v16.S}[1], [x0], x2
/*
 * Two-tap path within a row: each of two rows is packed as
 * {row, row shifted by one pixel} and multiplied by v0; adding the two
 * 64-bit halves afterwards combines the two taps.
 */
277 4: ld1 {v4.8B}, [x1], x2
278 ld1 {v6.8B}, [x1], x2
279 ext v5.8B, v4.8B, v5.8B, #1
280 ext v7.8B, v6.8B, v7.8B, #1
281 trn1 v4.2S, v4.2S, v5.2S
282 trn1 v6.2S, v6.2S, v7.2S
283 umull v18.8H, v4.8B, v0.8B
284 umull v19.8H, v6.8B, v0.8B
286 trn1 v30.2D, v18.2D, v19.2D
287 trn2 v31.2D, v18.2D, v19.2D
288 add v18.8H, v30.8H, v31.8H
/* NOTE(review): same pair of narrowing variants as in the four-tap path. */
291 rshrn v16.8B, v18.8H, #6
293 add v18.8H, v18.8H, v22.8H
294 shrn v16.8B, v18.8H, #6
/* Average with the rows read through x8 (see the note in the first path). */
297 ld1 {v20.S}[0], [x8], x2
298 ld1 {v20.S}[1], [x8], x2
299 urhadd v16.8B, v16.8B, v20.8B
302 st1 {v16.S}[0], [x0], x2
303 st1 {v16.S}[1], [x0], x2
/*
 * Expands to the exported function ff_<type>_h264_chroma_mc2_neon. Unlike
 * the mc8/mc4 macros it takes no codec parameter.
 * Rows are read through x1 and written through x0, both stepped by x2;
 * presumably these are src, dst and stride as in the mc8/mc4 prototypes -
 * confirm against the caller.
 * Each pass writes two 2-pixel (one halfword) output rows.
 * The setup of the weights v0-v3, of v5 and v17, the row counter and the
 * branches between the two paths are not visible in this view - TODO
 * confirm against the full file.
 */
309 .macro h264_chroma_mc2 type
310 function ff_\type\()_h264_chroma_mc2_neon, export=1
/* Prefetch hint for the source row one stride ahead. */
313 prfm pldl1strm, [x1, x2]
/* Interleave the 16-bit lanes of the weight vectors pairwise (v0 with v2,
 * v1 with v3) so they line up with the pixel layout built below. */
329 trn1 v0.4H, v0.4H, v2.4H
330 trn1 v1.4H, v1.4H, v3.4H
/* Two source rows, one per 32-bit lane of v4. */
332 ld1 {v4.S}[0], [x1], x2
333 ld1 {v4.S}[1], [x1], x2
/*
 * Build one-pixel-shifted copies and interleave them with the unshifted
 * data at halfword granularity.
 * NOTE(review): v5 is read here but its definition is not visible in this
 * view; presumably it is derived from v4 on a missing line - confirm.
 */
336 ext v6.8B, v4.8B, v5.8B, #1
337 ext v7.8B, v5.8B, v4.8B, #1
338 trn1 v4.4H, v4.4H, v6.4H
339 trn1 v5.4H, v5.4H, v7.4H
/* v16 = v4*v0 + v5*v1 (16-bit partial sums). */
340 umull v16.8H, v4.8B, v0.8B
341 umlal v16.8H, v5.8B, v1.8B
/*
 * Read an existing row from dst for averaging.
 * NOTE(review): this load post-increments x0; presumably it is guarded for
 * type=avg and followed by a second lane load and an x0 rewind on lines
 * that are not visible here - confirm.
 */
343 ld1 {v18.H}[0], [x0], x2
/* NOTE(review): v17 is not defined on any visible line; presumably it holds
 * a rearranged copy of v16 so that this add combines partial sums. */
348 add v16.8H, v16.8H, v17.8H
/* Rounding narrow by 6 bits back to 8-bit pixels. */
349 rshrn v16.8B, v16.8H, #6
351 urhadd v16.8B, v16.8B, v18.8B
/* Halfword lanes 0 and 2 carry the two output rows. */
353 st1 {v16.H}[0], [x0], x2
354 st1 {v16.H}[2], [x0], x2
/*
 * Second path: no filtering is visible, two 2-pixel rows are loaded
 * straight from src. The selecting condition is not visible; presumably
 * this is the case where both fractional offsets are zero - confirm.
 */
360 ld1 {v16.H}[0], [x1], x2
361 ld1 {v16.H}[1], [x1], x2
/* NOTE(review): same dst read as in the first path, same caveats apply. */
363 ld1 {v18.H}[0], [x0], x2
366 urhadd v16.8B, v16.8B, v18.8B
368 st1 {v16.H}[0], [x0], x2
369 st1 {v16.H}[1], [x0], x2
/*
 * RV40 build: emit the put/avg variants of the 8- and 4-pixel functions
 * with codec=rv40 (no mc2 variant is instantiated here).
 * NOTE(review): the .short rows are 16-bit constants; their label, the
 * remaining rows and the closing #endif are not visible in this view.
 * Presumably they belong to the RV40 rounding-bias table - confirm.
 */
383 #if CONFIG_RV40_DECODER
386 .short 32, 28, 32, 28
388 .short 32, 28, 32, 28
391 h264_chroma_mc8 put, rv40
392 h264_chroma_mc8 avg, rv40
393 h264_chroma_mc4 put, rv40
394 h264_chroma_mc4 avg, rv40
/*
 * VC-1 build: emit the put/avg variants of the 8- and 4-pixel functions
 * with codec=vc1. The closing #endif is not visible in this view.
 */
397 #if CONFIG_VC1_DECODER
398 h264_chroma_mc8 put, vc1
399 h264_chroma_mc8 avg, vc1
400 h264_chroma_mc4 put, vc1
401 h264_chroma_mc4 avg, vc1