/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

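/*
 * Half-pel pixel macros for 16- and 8-pixel-wide blocks.  Arguments follow
 * the hpeldsp op_pixels_func convention: r0 = dst, r1 = src, r2 = line
 * stride, r3 = block height.  rnd=1 uses rounding averages/shifts, rnd=0
 * truncating ones; avg=1 additionally averages the result with the data
 * already at the destination.
 */

@ plain copy (or copy-and-average) of a 16-pixel-wide block, four rows per iteration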
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

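@ 16 pixels wide, interpolated halfway between x and x+1, two rows per iteration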
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

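@ 16 pixels wide, interpolated halfway between y and y+1, two rows per iteration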
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

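@ 16 pixels wide, interpolated halfway in both x and y: each output byte is
@ the average of a 2x2 source neighbourhood, (a+b+c+d+2)>>2 for rnd=1 or
@ (a+b+c+d+1)>>2 for rnd=0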
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

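@ 8-pixel-wide variants; these mirror the 16-pixel macros above, working on
@ d registers instead of q registers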
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

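@ pixfunc expands one of the macros above into an exported function.  It
@ defines avg/shrn as rounding (vrhadd/vrshrn) or truncating (vhadd/vshrn)
@ operations depending on rnd, and NRND to emit its argument only in the
@ no-rounding case.  pixfunc2 emits both the rounding and _no_rnd variants.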
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

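@ The h264 qpel mc00 (full-pel) entry points only load the block height into
@ r3 and fall through into the corresponding ff_{put,avg}_pixels{16,8}_neon
@ function emitted immediately below.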
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1