/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

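@ Zero a single 8x8 block of 16-bit coefficients (128 bytes).
@ r0 = block pointer, 16-byte aligned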
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

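@ Zero six consecutive 8x8 blocks of 16-bit coefficients (6 * 128 bytes).
@ r0 = pointer to the first block, 16-byte aligned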
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

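@ Full-pel copy of a 16-pixel-wide block, four rows per iteration.
@ All pixels16*/pixels8* macros below share this register layout:
@ r0 = dst, r1 = src, r2 = line stride in bytes, r3 = height
@ With avg=1 the result is additionally averaged with the existing dst rows.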
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

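@ Horizontal half-pel: average each pixel with its right neighbour
@ (rounding selected via the avg/shrn helper macros defined in pixfunc).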
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

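@ Vertical half-pel: average each row with the row below it.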
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

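@ 2-D half-pel: each output pixel averages a 2x2 neighbourhood,
@ (a+b+c+d+2)>>2 when rounding, (a+b+c+d+1)>>2 otherwise (NRND adds the 1).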
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

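@ 8-pixel-wide full-pel copy/average, same structure as pixels16.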
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

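@ 8-pixel-wide horizontal half-pel.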
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

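@ 8-pixel-wide vertical half-pel.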
.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

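@ 8-pixel-wide 2-D half-pel, same rounding rules as pixels16_xy2.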
.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

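@ pixfunc instantiates ff_<pfx><name><suf>_neon from one of the macros above,
@ binding avg/shrn to rounding (vrhadd/vrshrn) or non-rounding (vhadd/vshrn)
@ forms and making NRND emit its argument only in the no-round case.
@ pixfunc2 emits both the rounding and the _no_rnd variant.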
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

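@ The h264 qpel mc00 (full-pel) wrappers only load the block height into r3
@ and fall through into the ff_{put,avg}_pixels{16,8}_neon functions
@ instantiated immediately after them.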
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

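@ Clamp an 8x8 block of 16-bit coefficients to unsigned 8-bit and store it.
@ r0 = coefficients (64 x int16), r1 = dst pixels, r2 = dst stride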
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

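@ Clamp an 8x8 block of 16-bit coefficients to signed 8-bit, add 128 and store.
@ r0 = coefficients (64 x int16), r1 = dst pixels, r2 = dst stride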
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

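@ Add an 8x8 block of 16-bit coefficients to the destination pixels,
@ clamping the result to unsigned 8-bit.
@ r0 = coefficients (64 x int16), r1 = pixels (read and written), r2 = stride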
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
endfunc

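@ Dot product of two float vectors.
@ r0 = v1, r1 = v2, r2 = length (processed 4 at a time); result in s0,
@ copied to r0 when float values are returned in core registers (NOVFP).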
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

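@ Clip each float in src to the range [min, max], 8 elements per iteration.
@ r0 = dst, r1 = src; min/max arrive in s0/s1 (hard-float ABI, VFP lines) or
@ r2/r3 (soft-float ABI, NOVFP lines, with the count reloaded from the stack).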
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

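@ Apply a symmetric window to 16-bit samples, working inwards from both ends:
@ the first half of the input uses the window coefficients in order, the
@ second half uses them reversed (fixed-point multiply via vqrdmulh).
@ r0 = output, r1 = input, r2 = window, r3 = sample count (16 per iteration)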
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

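@ Clip each 32-bit integer in src to the range [min, max].
@ r0 = dst, r1 = src, r2 = min, r3 = max, element count on the stack;
@ processes 8 elements per iteration.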
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc