 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
24 /* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
25 .macro h264_chroma_mc8 type, codec=h264
26 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
31 prfm pldl1strm, [x1, x2]
58 ld1 {v4.8B, v5.8B}, [x1], x2
61 ext v5.8B, v4.8B, v5.8B, #1
62 1: ld1 {v6.8B, v7.8B}, [x1], x2
63 umull v16.8H, v4.8B, v0.8B
64 umlal v16.8H, v5.8B, v1.8B
65 ext v7.8B, v6.8B, v7.8B, #1
66 ld1 {v4.8B, v5.8B}, [x1], x2
67 umlal v16.8H, v6.8B, v2.8B
69 ext v5.8B, v4.8B, v5.8B, #1
70 umlal v16.8H, v7.8B, v3.8B
71 umull v17.8H, v6.8B, v0.8B
73 umlal v17.8H, v7.8B, v1.8B
74 umlal v17.8H, v4.8B, v2.8B
75 umlal v17.8H, v5.8B, v3.8B
76 prfm pldl1strm, [x1, x2]
78 rshrn v16.8B, v16.8H, #6
79 rshrn v17.8B, v17.8H, #6
81 add v16.8H, v16.8H, v22.8H
82 add v17.8H, v17.8H, v22.8H
83 shrn v16.8B, v16.8H, #6
84 shrn v17.8B, v17.8H, #6
87 ld1 {v20.8B}, [x8], x2
88 ld1 {v21.8B}, [x8], x2
89 urhadd v16.8B, v16.8B, v20.8B
90 urhadd v17.8B, v17.8B, v21.8B
92 st1 {v16.8B}, [x0], x2
93 st1 {v17.8B}, [x0], x2
104 ld1 {v4.8B}, [x1], x2
105 3: ld1 {v6.8B}, [x1], x2
106 umull v16.8H, v4.8B, v0.8B
107 umlal v16.8H, v6.8B, v1.8B
108 ld1 {v4.8B}, [x1], x2
109 umull v17.8H, v6.8B, v0.8B
110 umlal v17.8H, v4.8B, v1.8B
113 rshrn v16.8B, v16.8H, #6
114 rshrn v17.8B, v17.8H, #6
116 add v16.8H, v16.8H, v22.8H
117 add v17.8H, v17.8H, v22.8H
118 shrn v16.8B, v16.8H, #6
119 shrn v17.8B, v17.8H, #6
121 prfm pldl1strm, [x1, x2]
123 ld1 {v20.8B}, [x8], x2
124 ld1 {v21.8B}, [x8], x2
125 urhadd v16.8B, v16.8B, v20.8B
126 urhadd v17.8B, v17.8B, v21.8B
129 st1 {v16.8B}, [x0], x2
130 st1 {v17.8B}, [x0], x2
134 4: ld1 {v4.8B, v5.8B}, [x1], x2
135 ld1 {v6.8B, v7.8B}, [x1], x2
136 ext v5.8B, v4.8B, v5.8B, #1
137 ext v7.8B, v6.8B, v7.8B, #1
140 umull v16.8H, v4.8B, v0.8B
141 umlal v16.8H, v5.8B, v1.8B
142 umull v17.8H, v6.8B, v0.8B
143 umlal v17.8H, v7.8B, v1.8B
144 prfm pldl1strm, [x1, x2]
146 rshrn v16.8B, v16.8H, #6
147 rshrn v17.8B, v17.8H, #6
149 add v16.8H, v16.8H, v22.8H
150 add v17.8H, v17.8H, v22.8H
151 shrn v16.8B, v16.8H, #6
152 shrn v17.8B, v17.8H, #6
155 ld1 {v20.8B}, [x8], x2
156 ld1 {v21.8B}, [x8], x2
157 urhadd v16.8B, v16.8B, v20.8B
158 urhadd v17.8B, v17.8B, v21.8B
160 st1 {v16.8B}, [x0], x2
161 st1 {v17.8B}, [x0], x2
165 5: ld1 {v4.8B}, [x1], x2
166 ld1 {v5.8B}, [x1], x2
169 umull v16.8H, v4.8B, v0.8B
170 umull v17.8H, v5.8B, v0.8B
171 prfm pldl1strm, [x1, x2]
173 rshrn v16.8B, v16.8H, #6
174 rshrn v17.8B, v17.8H, #6
176 add v16.8H, v16.8H, v22.8H
177 add v17.8H, v17.8H, v22.8H
178 shrn v16.8B, v16.8H, #6
179 shrn v17.8B, v17.8H, #6
182 ld1 {v20.8B}, [x8], x2
183 ld1 {v21.8B}, [x8], x2
184 urhadd v16.8B, v16.8B, v20.8B
185 urhadd v17.8B, v17.8B, v21.8B
187 st1 {v16.8B}, [x0], x2
188 st1 {v17.8B}, [x0], x2
194 /* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
195 .macro h264_chroma_mc4 type, codec=h264
196 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
201 prfm pldl1strm, [x1, x2]
228 ld1 {v4.8B}, [x1], x2
231 ext v5.8B, v4.8B, v5.8B, #1
232 trn1 v0.2S, v24.2S, v25.2S
233 trn1 v2.2S, v26.2S, v27.2S
234 trn1 v4.2S, v4.2S, v5.2S
235 1: ld1 {v6.8B}, [x1], x2
236 ext v7.8B, v6.8B, v7.8B, #1
237 trn1 v6.2S, v6.2S, v7.2S
238 umull v18.8H, v4.8B, v0.8B
239 umlal v18.8H, v6.8B, v2.8B
240 ld1 {v4.8B}, [x1], x2
241 ext v5.8B, v4.8B, v5.8B, #1
242 trn1 v4.2S, v4.2S, v5.2S
244 umull v19.8H, v6.8B, v0.8B
245 umlal v19.8H, v4.8B, v2.8B
246 trn1 v30.2D, v18.2D, v19.2D
247 trn2 v31.2D, v18.2D, v19.2D
248 add v18.8H, v30.8H, v31.8H
250 rshrn v16.8B, v18.8H, #6
252 add v18.8H, v18.8H, v22.8H
253 shrn v16.8B, v18.8H, #6
256 prfm pldl1strm, [x1, x2]
258 ld1 {v20.S}[0], [x8], x2
259 ld1 {v20.S}[1], [x8], x2
260 urhadd v16.8B, v16.8B, v20.8B
262 st1 {v16.S}[0], [x0], x2
263 st1 {v16.S}[1], [x0], x2
272 trn1 v0.2S, v30.2S, v31.2S
273 trn2 v1.2S, v30.2S, v31.2S
276 ext v1.8B, v0.8B, v1.8B, #4
277 ld1 {v4.S}[0], [x1], x2
278 3: ld1 {v4.S}[1], [x1], x2
279 umull v18.8H, v4.8B, v0.8B
280 ld1 {v4.S}[0], [x1], x2
281 umull v19.8H, v4.8B, v1.8B
282 trn1 v30.2D, v18.2D, v19.2D
283 trn2 v31.2D, v18.2D, v19.2D
284 add v18.8H, v30.8H, v31.8H
287 rshrn v16.8B, v18.8H, #6
289 add v18.8H, v18.8H, v22.8H
290 shrn v16.8B, v18.8H, #6
293 ld1 {v20.S}[0], [x8], x2
294 ld1 {v20.S}[1], [x8], x2
295 urhadd v16.8B, v16.8B, v20.8B
298 prfm pldl1strm, [x1, x2]
299 st1 {v16.S}[0], [x0], x2
300 st1 {v16.S}[1], [x0], x2
304 4: ld1 {v4.8B}, [x1], x2
305 ld1 {v6.8B}, [x1], x2
306 ext v5.8B, v4.8B, v5.8B, #1
307 ext v7.8B, v6.8B, v7.8B, #1
308 trn1 v4.2S, v4.2S, v5.2S
309 trn1 v6.2S, v6.2S, v7.2S
310 umull v18.8H, v4.8B, v0.8B
311 umull v19.8H, v6.8B, v0.8B
313 trn1 v30.2D, v18.2D, v19.2D
314 trn2 v31.2D, v18.2D, v19.2D
315 add v18.8H, v30.8H, v31.8H
318 rshrn v16.8B, v18.8H, #6
320 add v18.8H, v18.8H, v22.8H
321 shrn v16.8B, v18.8H, #6
324 ld1 {v20.S}[0], [x8], x2
325 ld1 {v20.S}[1], [x8], x2
326 urhadd v16.8B, v16.8B, v20.8B
329 st1 {v16.S}[0], [x0], x2
330 st1 {v16.S}[1], [x0], x2
334 5: ld1 {v4.S}[0], [x1], x2
335 ld1 {v4.S}[1], [x1], x2
336 umull v18.8H, v4.8B, v30.8B
340 rshrn v16.8B, v18.8H, #6
342 add v18.8H, v18.8H, v22.8H
343 shrn v16.8B, v18.8H, #6
346 ld1 {v20.S}[0], [x8], x2
347 ld1 {v20.S}[1], [x8], x2
348 urhadd v16.8B, v16.8B, v20.8B
351 st1 {v16.S}[0], [x0], x2
352 st1 {v16.S}[1], [x0], x2
358 .macro h264_chroma_mc2 type
359 function ff_\type\()_h264_chroma_mc2_neon, export=1
361 prfm pldl1strm, [x1, x2]
377 trn1 v0.4H, v0.4H, v2.4H
378 trn1 v1.4H, v1.4H, v3.4H
380 ld1 {v4.S}[0], [x1], x2
381 ld1 {v4.S}[1], [x1], x2
384 ext v6.8B, v4.8B, v5.8B, #1
385 ext v7.8B, v5.8B, v4.8B, #1
386 trn1 v4.4H, v4.4H, v6.4H
387 trn1 v5.4H, v5.4H, v7.4H
388 umull v16.8H, v4.8B, v0.8B
389 umlal v16.8H, v5.8B, v1.8B
391 ld1 {v18.H}[0], [x0], x2
396 add v16.8H, v16.8H, v17.8H
397 rshrn v16.8B, v16.8H, #6
399 urhadd v16.8B, v16.8B, v18.8B
401 st1 {v16.H}[0], [x0], x2
402 st1 {v16.H}[2], [x0], x2
408 ld1 {v16.H}[0], [x1], x2
409 ld1 {v16.H}[1], [x1], x2
411 ld1 {v18.H}[0], [x0], x2
414 urhadd v16.8B, v16.8B, v18.8B
416 st1 {v16.H}[0], [x0], x2
417 st1 {v16.H}[1], [x0], x2
431 #if CONFIG_RV40_DECODER
434 .short 32, 28, 32, 28
436 .short 32, 28, 32, 28
439 h264_chroma_mc8 put, rv40
440 h264_chroma_mc8 avg, rv40
441 h264_chroma_mc4 put, rv40
442 h264_chroma_mc4 avg, rv40
446 h264_chroma_mc8 put, vc1
447 h264_chroma_mc8 avg, vc1
448 h264_chroma_mc4 put, vc1
449 h264_chroma_mc4 avg, vc1