2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
24 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
// Emits ff_{put,avg}_{h264,rv40,vc1}_chroma_mc8_neon: 8-pixel-wide chroma
// motion compensation with bilinear interpolation selected by (x, y).
// AAPCS64, per the C prototype above: x0 = dst, x1 = src, x2 = stride,
// remaining args h/x/y arrive in w3/w4/w5.
// NOTE(review): this excerpt is gappy (original line numbers jump) -- the
// entry code that computes the weight vectors v0..v3 from x/y, loads the
// per-codec rounding bias into v22, saves dst in x8 for the "avg" variant,
// and dispatches to labels 1/3/4/5 is not visible here. Comments about
// those registers are inferred from their use below; confirm against the
// full file.
25 .macro h264_chroma_mc8 type, codec=h264
26 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
// Prefetch the next source row while the (unseen) setup code runs.
32 prfm pldl1strm, [x1, x2]
// ---- path 1: x != 0 && y != 0 -- full 2-D bilinear, two rows/iteration.
// Load 16 source bytes; v5 = v4 shifted left one byte, giving the
// horizontally adjacent pixels needed for the horizontal tap.
59 ld1 {v4.8B, v5.8B}, [x1], x2
62 ext v5.8B, v4.8B, v5.8B, #1
63 1: ld1 {v6.8B, v7.8B}, [x1], x2
// Row 0: acc = A*a + B*b + C*c + D*d with weights in v0..v3 (widening
// 8->16 bit multiply-accumulate).
64 umull v16.8H, v4.8B, v0.8B
65 umlal v16.8H, v5.8B, v1.8B
66 ext v7.8B, v6.8B, v7.8B, #1
67 ld1 {v4.8B, v5.8B}, [x1], x2
68 umlal v16.8H, v6.8B, v2.8B
70 ext v5.8B, v4.8B, v5.8B, #1
71 umlal v16.8H, v7.8B, v3.8B
// Row 1 reuses the just-loaded rows v6/v7 (top) and v4/v5 (bottom).
72 umull v17.8H, v6.8B, v0.8B
74 umlal v17.8H, v7.8B, v1.8B
75 umlal v17.8H, v4.8B, v2.8B
76 umlal v17.8H, v5.8B, v3.8B
77 prfm pldl1strm, [x1, x2]
// Two alternative rounding sequences follow; in the full file they are
// presumably selected per-codec by conditional assembly (guards missing
// from this excerpt): rounding narrow by 6 (h264), vs. add bias v22 then
// truncating narrow by 6 (rv40/vc1-style biased rounding).
79 rshrn v16.8B, v16.8H, #6
80 rshrn v17.8B, v17.8H, #6
82 add v16.8H, v16.8H, v22.8H
83 add v17.8H, v17.8H, v22.8H
84 shrn v16.8B, v16.8H, #6
85 shrn v17.8B, v17.8H, #6
// "avg" variant only (guard not visible): x8 walks the original dst;
// urhadd = unsigned rounding halving add, i.e. average with existing pixels.
88 ld1 {v20.8B}, [x8], x2
89 ld1 {v21.8B}, [x8], x2
90 urhadd v16.8B, v16.8B, v20.8B
91 urhadd v17.8B, v17.8B, v21.8B
// Store the two finished 8-byte rows; loop-back branch not visible here.
93 st1 {v16.8B}, [x0], x2
94 st1 {v17.8B}, [x0], x2
// ---- path 3: x == 0 -- vertical-only filter, two taps (v0, v1).
105 ld1 {v4.8B}, [x1], x2
106 3: ld1 {v6.8B}, [x1], x2
107 umull v16.8H, v4.8B, v0.8B
108 umlal v16.8H, v6.8B, v1.8B
109 ld1 {v4.8B}, [x1], x2
110 umull v17.8H, v6.8B, v0.8B
111 umlal v17.8H, v4.8B, v1.8B
// Same per-codec rounding alternatives as path 1.
114 rshrn v16.8B, v16.8H, #6
115 rshrn v17.8B, v17.8H, #6
117 add v16.8H, v16.8H, v22.8H
118 add v17.8H, v17.8H, v22.8H
119 shrn v16.8B, v16.8H, #6
120 shrn v17.8B, v17.8H, #6
122 prfm pldl1strm, [x1, x2]
// "avg" averaging against prior dst contents (guard not visible).
124 ld1 {v20.8B}, [x8], x2
125 ld1 {v21.8B}, [x8], x2
126 urhadd v16.8B, v16.8B, v20.8B
127 urhadd v17.8B, v17.8B, v21.8B
130 st1 {v16.8B}, [x0], x2
131 st1 {v17.8B}, [x0], x2
// ---- path 4: y == 0 -- horizontal-only filter, two taps (v0, v1).
135 4: ld1 {v4.8B, v5.8B}, [x1], x2
136 ld1 {v6.8B, v7.8B}, [x1], x2
137 ext v5.8B, v4.8B, v5.8B, #1
138 ext v7.8B, v6.8B, v7.8B, #1
141 umull v16.8H, v4.8B, v0.8B
142 umlal v16.8H, v5.8B, v1.8B
143 umull v17.8H, v6.8B, v0.8B
144 umlal v17.8H, v7.8B, v1.8B
145 prfm pldl1strm, [x1, x2]
147 rshrn v16.8B, v16.8H, #6
148 rshrn v17.8B, v17.8H, #6
150 add v16.8H, v16.8H, v22.8H
151 add v17.8H, v17.8H, v22.8H
152 shrn v16.8B, v16.8H, #6
153 shrn v17.8B, v17.8H, #6
156 ld1 {v20.8B}, [x8], x2
157 ld1 {v21.8B}, [x8], x2
158 urhadd v16.8B, v16.8B, v20.8B
159 urhadd v17.8B, v17.8B, v21.8B
161 st1 {v16.8B}, [x0], x2
162 st1 {v17.8B}, [x0], x2
// ---- path 5: x == 0 && y == 0 -- single weight v0 (no interpolation
// across pixels; still scaled/rounded so the biased-codec path works).
166 5: ld1 {v4.8B}, [x1], x2
167 ld1 {v5.8B}, [x1], x2
170 umull v16.8H, v4.8B, v0.8B
171 umull v17.8H, v5.8B, v0.8B
172 prfm pldl1strm, [x1, x2]
174 rshrn v16.8B, v16.8H, #6
175 rshrn v17.8B, v17.8H, #6
177 add v16.8H, v16.8H, v22.8H
178 add v17.8H, v17.8H, v22.8H
179 shrn v16.8B, v16.8H, #6
180 shrn v17.8B, v17.8H, #6
183 ld1 {v20.8B}, [x8], x2
184 ld1 {v21.8B}, [x8], x2
185 urhadd v16.8B, v16.8B, v20.8B
186 urhadd v17.8B, v17.8B, v21.8B
188 st1 {v16.8B}, [x0], x2
189 st1 {v17.8B}, [x0], x2
// NOTE(review): endfunc / .endm for this macro are outside this excerpt.
195 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
// Emits ff_{put,avg}_{h264,rv40,vc1}_chroma_mc4_neon: 4-pixel-wide chroma
// MC. Strategy: pack TWO 4-pixel rows (or a row and its 1-byte-shifted
// copy) into one 8-byte vector with trn1, so the same 8-lane widening MACs
// used by mc8 process a 4-wide block; trn1/trn2 on .2D halves then split
// the 16-bit accumulator back into per-row sums.
// NOTE(review): gappy excerpt -- weight setup (v24..v27 -> v0/v2), bias
// load into v22, x8 setup for "avg", dispatch branches, endfunc and .endm
// are not visible; register-role comments below are inferred from use.
196 .macro h264_chroma_mc4 type, codec=h264
197 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
198 prfm pldl1strm, [x1, x2]
// ---- path 1: 2-D bilinear. v0 = {A,B | A,B}, v2 = {C,D | C,D} packed
// from v24..v27 (computed in unseen setup); v4 = {row, row<<1 byte}.
230 ld1 {v4.8B}, [x1], x2
233 ext v5.8B, v4.8B, v5.8B, #1
234 trn1 v0.2S, v24.2S, v25.2S
235 trn1 v2.2S, v26.2S, v27.2S
236 trn1 v4.2S, v4.2S, v5.2S
237 1: ld1 {v6.8B}, [x1], x2
238 ext v7.8B, v6.8B, v7.8B, #1
239 trn1 v6.2S, v6.2S, v7.2S
// Top row taps in v0, bottom row taps in v2 (vertical pair per output row).
240 umull v18.8H, v4.8B, v0.8B
241 umlal v18.8H, v6.8B, v2.8B
242 ld1 {v4.8B}, [x1], x2
243 ext v5.8B, v4.8B, v5.8B, #1
244 trn1 v4.2S, v4.2S, v5.2S
246 umull v19.8H, v6.8B, v0.8B
247 umlal v19.8H, v4.8B, v2.8B
// Transpose the two accumulators' 64-bit halves and add, folding the
// "pixel" and "pixel+1" partial products into one 4-lane sum per row.
248 trn1 v30.2D, v18.2D, v19.2D
249 trn2 v31.2D, v18.2D, v19.2D
250 add v18.8H, v30.8H, v31.8H
// Per-codec rounding alternatives (guards missing from this excerpt):
// rounding narrow (h264) vs. bias v22 + truncating narrow (rv40/vc1).
252 rshrn v16.8B, v18.8H, #6
254 add v18.8H, v18.8H, v22.8H
255 shrn v16.8B, v18.8H, #6
258 prfm pldl1strm, [x1, x2]
// "avg" only (guard not visible): average with existing dst via x8.
260 ld1 {v20.S}[0], [x8], x2
261 ld1 {v20.S}[1], [x8], x2
262 urhadd v16.8B, v16.8B, v20.8B
// Two 4-byte rows stored from the two 32-bit lanes of v16.
264 st1 {v16.S}[0], [x0], x2
265 st1 {v16.S}[1], [x0], x2
// ---- path 3: x == 0 -- vertical-only; v0/v1 hold the two taps arranged
// so one 8-lane MAC covers two stacked 4-pixel rows.
274 trn1 v0.2S, v30.2S, v31.2S
275 trn2 v1.2S, v30.2S, v31.2S
278 ext v1.8B, v0.8B, v1.8B, #4
279 ld1 {v4.S}[0], [x1], x2
280 3: ld1 {v4.S}[1], [x1], x2
281 umull v18.8H, v4.8B, v0.8B
282 ld1 {v4.S}[0], [x1], x2
283 umull v19.8H, v4.8B, v1.8B
284 trn1 v30.2D, v18.2D, v19.2D
285 trn2 v31.2D, v18.2D, v19.2D
286 add v18.8H, v30.8H, v31.8H
289 rshrn v16.8B, v18.8H, #6
291 add v18.8H, v18.8H, v22.8H
292 shrn v16.8B, v18.8H, #6
295 ld1 {v20.S}[0], [x8], x2
296 ld1 {v20.S}[1], [x8], x2
297 urhadd v16.8B, v16.8B, v20.8B
300 prfm pldl1strm, [x1, x2]
301 st1 {v16.S}[0], [x0], x2
302 st1 {v16.S}[1], [x0], x2
// ---- path 4: y == 0 -- horizontal-only; pack pixel and pixel+1 pairs.
306 4: ld1 {v4.8B}, [x1], x2
307 ld1 {v6.8B}, [x1], x2
308 ext v5.8B, v4.8B, v5.8B, #1
309 ext v7.8B, v6.8B, v7.8B, #1
310 trn1 v4.2S, v4.2S, v5.2S
311 trn1 v6.2S, v6.2S, v7.2S
312 umull v18.8H, v4.8B, v0.8B
313 umull v19.8H, v6.8B, v0.8B
315 trn1 v30.2D, v18.2D, v19.2D
316 trn2 v31.2D, v18.2D, v19.2D
317 add v18.8H, v30.8H, v31.8H
320 rshrn v16.8B, v18.8H, #6
322 add v18.8H, v18.8H, v22.8H
323 shrn v16.8B, v18.8H, #6
326 ld1 {v20.S}[0], [x8], x2
327 ld1 {v20.S}[1], [x8], x2
328 urhadd v16.8B, v16.8B, v20.8B
331 st1 {v16.S}[0], [x0], x2
332 st1 {v16.S}[1], [x0], x2
// ---- path 5: x == 0 && y == 0 -- single weight (here in v30; presumably
// broadcast by unseen setup), two rows per iteration.
336 5: ld1 {v4.S}[0], [x1], x2
337 ld1 {v4.S}[1], [x1], x2
338 umull v18.8H, v4.8B, v30.8B
342 rshrn v16.8B, v18.8H, #6
344 add v18.8H, v18.8H, v22.8H
345 shrn v16.8B, v18.8H, #6
348 ld1 {v20.S}[0], [x8], x2
349 ld1 {v20.S}[1], [x8], x2
350 urhadd v16.8B, v16.8B, v20.8B
353 st1 {v16.S}[0], [x0], x2
354 st1 {v16.S}[1], [x0], x2
// NOTE(review): endfunc / .endm for this macro are outside this excerpt.
// Emits ff_{put,avg}_h264_chroma_mc2_neon: 2-pixel-wide chroma MC
// (H.264 only -- no codec parameter, so no per-codec bias path here).
// Four 2-pixel "positions" are interleaved into one vector so a single
// 8-lane widening MAC covers two output rows.
// NOTE(review): gappy excerpt -- weight setup for v0..v3, the pairwise
// add producing v17, x8/avg guard structure, dispatch branches, endfunc
// and .endm are not visible here; comments are inferred from use.
360 .macro h264_chroma_mc2 type
361 function ff_\type\()_h264_chroma_mc2_neon, export=1
364 prfm pldl1strm, [x1, x2]
// Interleave the bilinear weights: v0 = {A,B} pairs, v1 = {C,D} pairs
// (16-bit lanes = adjacent byte pairs from the unseen setup).
380 trn1 v0.4H, v0.4H, v2.4H
381 trn1 v1.4H, v1.4H, v3.4H
// Two source rows packed into the two 32-bit lanes of v4.
383 ld1 {v4.S}[0], [x1], x2
384 ld1 {v4.S}[1], [x1], x2
// Build the +1-pixel-shifted companions and interleave so each 16-bit
// lane pairs a pixel with its right neighbour.
387 ext v6.8B, v4.8B, v5.8B, #1
388 ext v7.8B, v5.8B, v4.8B, #1
389 trn1 v4.4H, v4.4H, v6.4H
390 trn1 v5.4H, v5.4H, v7.4H
391 umull v16.8H, v4.8B, v0.8B
392 umlal v16.8H, v5.8B, v1.8B
// "avg" only (guard not visible): preload existing dst halfword pairs.
394 ld1 {v18.H}[0], [x0], x2
// v17 presumably holds the swapped partial sums (produced by unseen
// code); adding folds them into the final 2-pixel sums, then round >> 6.
399 add v16.8H, v16.8H, v17.8H
400 rshrn v16.8B, v16.8H, #6
402 urhadd v16.8B, v16.8B, v18.8B
// Store two 2-byte rows from halfword lanes 0 and 2.
404 st1 {v16.H}[0], [x0], x2
405 st1 {v16.H}[2], [x0], x2
// ---- x == 0 && y == 0 path: plain 2-byte copy (put) / average (avg).
411 ld1 {v16.H}[0], [x1], x2
412 ld1 {v16.H}[1], [x1], x2
414 ld1 {v18.H}[0], [x0], x2
417 urhadd v16.8B, v16.8B, v18.8B
419 st1 {v16.H}[0], [x0], x2
420 st1 {v16.H}[1], [x0], x2
// NOTE(review): endfunc / .endm for this macro are outside this excerpt.
// Per-codec instantiations of the macros above, compiled only when the
// matching decoder is enabled in the build configuration.
434 #if CONFIG_RV40_DECODER
// RV40 rounding-bias table (values 32/28 alternate); presumably loaded
// into v22 by the macro entry code for the biased shrn path. The label
// and alignment directives for this table are missing from this excerpt
// -- confirm against the full file.
437 .short 32, 28, 32, 28
439 .short 32, 28, 32, 28
442 h264_chroma_mc8 put, rv40
443 h264_chroma_mc8 avg, rv40
444 h264_chroma_mc4 put, rv40
445 h264_chroma_mc4 avg, rv40
// NOTE(review): the closing #endif is not visible in this excerpt.
448 #if CONFIG_VC1_DECODER
449 h264_chroma_mc8 put, vc1
450 h264_chroma_mc8 avg, vc1
451 h264_chroma_mc4 put, vc1
452 h264_chroma_mc4 avg, vc1