/*
 * VP8 ARMv6 optimisations
 *
 * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * This code was partially ported from libvpx, which uses this license:
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 * (Note that the "LICENSE", "AUTHORS" and "PATENTS" files can be
 * found in the libvpx source tree.)
 */
@ void vp8_luma_dc_wht(DCTELEM block[4][4][16], DCTELEM dc[16])
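@ A rough scalar model of the inverse Walsh-Hadamard transform computed
@ here, in illustrative C following the libvpx reference this file was
@ ported from (names are illustrative only):
@
@   for (i = 0; i < 4; i++) {                 /* columns */
@       t0 = dc[0*4 + i] + dc[3*4 + i];
@       t3 = dc[0*4 + i] - dc[3*4 + i];
@       t1 = dc[1*4 + i] + dc[2*4 + i];
@       t2 = dc[1*4 + i] - dc[2*4 + i];
@       dc[0*4 + i] = t0 + t1;  dc[1*4 + i] = t3 + t2;
@       dc[2*4 + i] = t0 - t1;  dc[3*4 + i] = t3 - t2;
@   }
@   for (i = 0; i < 4; i++) {                 /* rows, with rounding */
@       t0 = dc[i*4 + 0] + dc[i*4 + 3] + 3;
@       t3 = dc[i*4 + 0] - dc[i*4 + 3] + 3;
@       t1 = dc[i*4 + 1] + dc[i*4 + 2];
@       t2 = dc[i*4 + 1] - dc[i*4 + 2];
@       block[i][0][0] = (t0 + t1) >> 3;  block[i][1][0] = (t3 + t2) >> 3;
@       block[i][2][0] = (t0 - t1) >> 3;  block[i][3][0] = (t3 - t2) >> 3;
@   }
@
@ Each uadd16/usub16 below performs two of these 16-bit butterflies at
@ once, so one pass over the registers covers half of the 4x4 plane.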
function ff_vp8_luma_dc_wht_armv6, export=1
        @ load dc[] and zero memory
        ldr r2, [r1] @ dc0[0,1]
        ldr r3, [r1, #4] @ dc0[2,3]
        ldr r4, [r1, #8] @ dc1[0,1]
        ldr r5, [r1, #12] @ dc1[2,3]
        ldr r6, [r1, #16] @ dc2[0,1]
        ldr r7, [r1, #20] @ dc2[2,3]
        ldr r8, [r1, #24] @ dc3[0,1]
        ldr r9, [r1, #28] @ dc3[2,3]

        uadd16 r12, r2, r8 @ t0[0,1]
        uadd16 r14, r3, r9 @ t0[2,3]
        usub16 r2, r2, r8 @ t3[0,1]
        usub16 r3, r3, r9 @ t3[2,3]
        uadd16 r8, r4, r6 @ t1[0,1]
        uadd16 r9, r5, r7 @ t1[2,3]
        usub16 r4, r4, r6 @ t2[0,1]
        usub16 r5, r5, r7 @ t2[2,3]

        uadd16 r6, r12, r8 @ dc0[0,1]
        uadd16 r7, r14, r9 @ dc0[2,3]
        usub16 r12, r12, r8 @ dc2[0,1]
        usub16 r14, r14, r9 @ dc2[2,3]
        uadd16 r8, r2, r4 @ dc1[0,1]
        uadd16 r9, r3, r5 @ dc1[2,3]
        usub16 r2, r2, r4 @ dc3[0,1]
        usub16 r3, r3, r5 @ dc3[2,3]

        orr r1, r1, #0x30000 @ 3 | 3 (round)

        pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0]
        pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1]
        pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0]
        pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1]
        pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2]
        pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3]
        pkhbt r2, r14, r3, lsl #16 @ dc{2,3}[2]
        pkhtb r14, r3, r14, asr #16 @ dc{2,3}[3]

        uadd16 r9, r4, r7 @ t0[0,1]
        uadd16 r3, r5, r14 @ t0[2,3]
        usub16 r4, r4, r7 @ t3[0,1]
        usub16 r5, r5, r14 @ t3[2,3]
        uadd16 r7, r6, r8 @ t1[0,1]
        uadd16 r14, r12, r2 @ t1[2,3]
        usub16 r6, r6, r8 @ t2[0,1]
        usub16 r12, r12, r2 @ t2[2,3]

        uadd16 r8, r9, r7 @ block[0,1][0]
        uadd16 r2, r3, r14 @ block[2,3][0]
        usub16 r9, r9, r7 @ block[0,1][2]
        usub16 r3, r3, r14 @ block[2,3][2]
        uadd16 r7, r4, r6 @ block[0,1][1]
        uadd16 r14, r5, r12 @ block[2,3][1]
        usub16 r4, r4, r6 @ block[0,1][3]
        usub16 r5, r5, r12 @ block[2,3][3]

        mov r6, r8, asr #19 @ block[1][0]
        mov r12, r7, asr #19 @ block[1][1]
        mov r1, r9, asr #19 @ block[1][2]
        mov r10, r4, asr #19 @ block[1][3]

        asr r8, #3 @ block[0][0]
        asr r7, #3 @ block[0][1]
        asr r9, #3 @ block[0][2]
        asr r4, #3 @ block[0][3]

        mov r6, r2, asr #19 @ block[3][0]
        mov r12, r14, asr #19 @ block[3][1]
        mov r1, r3, asr #19 @ block[3][2]
        mov r10, r5, asr #19 @ block[3][3]

        asr r2, #3 @ block[2][0]
        asr r14, #3 @ block[2][1]
        asr r3, #3 @ block[2][2]
        asr r5, #3 @ block[2][3]

@ void vp8_luma_dc_wht_dc(DCTELEM block[4][4][16], DCTELEM dc[16])
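@ DC-only special case: with every input except dc[0] zero, the transform
@ above collapses to a single value. Roughly, in illustrative C:
@
@   int i, val = (dc[0] + 3) >> 3;
@   dc[0] = 0;
@   for (i = 0; i < 4; i++)
@       block[i][0][0] = block[i][1][0] =
@       block[i][2][0] = block[i][3][0] = val;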
function ff_vp8_luma_dc_wht_dc_armv6, export=1

@ void vp8_idct_add(uint8_t *dst, DCTELEM block[16], int stride)
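@ VP8's 4x4 IDCT is built on two Q16 fixed-point constants:
@   20091/65536 ~= sqrt(2)*cos(pi/8) - 1     (cospi8sqrt2minus1)
@   35468/65536 ~= sqrt(2)*sin(pi/8)         (sinpi8sqrt2)
@ One butterfly stage, sketched in illustrative C after the libvpx
@ reference:
@
@   #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))   /* ~ *1.30656 */
@   #define MUL_35468(a) (((a) * 35468) >> 16)           /* ~ *0.54120 */
@
@   t0 = ip[0] + ip[8];
@   t1 = ip[0] - ip[8];
@   t2 = MUL_35468(ip[4]) - MUL_20091(ip[12]);
@   t3 = MUL_20091(ip[4]) + MUL_35468(ip[12]);
@   o[0] = t0 + t3;  o[4] = t1 + t2;  o[8] = t1 - t2;  o[12] = t0 - t3;
@
@ Below, smulwb/smulwt provide the ">> 16" products and uadd16 adds back
@ the "+ (a)" term of MUL_20091, two coefficients per instruction; the
@ second pass adds 4 and shifts right by 3 before adding the result to
@ the destination pixels.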
function ff_vp8_idct_add_armv6, export=1
        mov r3, #0x00004E00 @ cos
        orr r3, r3, #0x0000007B @ cospi8sqrt2minus1 = 20091
        mov r4, #0x00008A00 @ sin
        orr r4, r4, #0x0000008C @ sinpi8sqrt2 = 35468

        ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block1[0]
        ldr r12, [r1, #24] @ i13 | i12 = block3[1] | block3[0]
        ldr r14, [r1, #16] @ i9 | i8 = block2[1] | block2[0]

        smulwt r9, r3, r6 @ (ip[5] * cospi8sqrt2minus1) >> 16
        smulwb r7, r3, r6 @ (ip[4] * cospi8sqrt2minus1) >> 16
        smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16
        smulwb r8, r4, r6 @ (ip[4] * sinpi8sqrt2) >> 16
        pkhbt r7, r7, r9, lsl #16 @ 5c | 4c
        smulwt r11, r3, r12 @ (ip[13] * cospi8sqrt2minus1) >> 16
        pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first half
        uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first half
        smulwt r7, r4, r12 @ (ip[13] * sinpi8sqrt2) >> 16
        smulwb r9, r3, r12 @ (ip[12] * cospi8sqrt2minus1) >> 16
        smulwb r10, r4, r12 @ (ip[12] * sinpi8sqrt2) >> 16

        subs r5, r5, #1 @ i--
        pkhbt r9, r9, r11, lsl #16 @ 13c | 12c
        ldr r11, [r1] @ i1 | i0
        pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second half
        uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 second half
        usub16 r7, r8, r7 @ c = t2
        uadd16 r6, r6, r10 @ d = t3
        uadd16 r10, r11, r14 @ a = t0
        usub16 r8, r11, r14 @ b = t1
        uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0]
        usub16 r10, r10, r6 @ a-d = tmp{0,1}[3]
        uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1]
        usub16 r7, r8, r7 @ b-c = tmp{0,1}[2]

        str r6, [sp, #8] @ o5 | o4
        str r7, [sp, #16] @ o9 | o8
        str r10, [sp, #24] @ o13 | o12
        str r9, [sp], #4 @ o1 | o0

        ldr r6, [sp, #8] @ i5 | i4 = tmp{0,1}[1]
        ldr r14, [sp, #4] @ i3 | i2 = tmp{2,3}[0]
        ldr r12, [sp, #12] @ i7 | i6 = tmp{2,3}[1]
        ldr r1, [sp], #16 @ i1 | i0 = tmp{0,1}[0]
        smulwt r9, r3, r6 @ (ip[5] * cospi8sqrt2minus1) >> 16
        smulwt r7, r3, r1 @ (ip[1] * cospi8sqrt2minus1) >> 16
        smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16
        smulwt r8, r4, r1 @ (ip[1] * sinpi8sqrt2) >> 16
        pkhbt r11, r1, r6, lsl #16 @ i4 | i0 = t0/t1 first half
        pkhbt r7, r7, r9, lsl #16 @ 5c | 1c
        pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = temp1 = t2 first half
        pkhtb r1, r6, r1, asr #16 @ i5 | i1
        uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = temp2 (d) = t3 first half
        pkhbt r9, r14, r12, lsl #16 @ i6 | i2 = t0/t1 second half
        uadd16 r10, r11, r9 @ a = t0
        usub16 r9, r11, r9 @ b = t1
        pkhtb r6, r12, r14, asr #16 @ i7 | i3
        subs r5, r5, #0x1 @ i--
        smulwt r7, r3, r6 @ (ip[7] * cospi8sqrt2minus1) >> 16
        smulwt r11, r4, r6 @ (ip[7] * sinpi8sqrt2) >> 16
        smulwb r12, r3, r6 @ (ip[3] * cospi8sqrt2minus1) >> 16
        smulwb r14, r4, r6 @ (ip[3] * sinpi8sqrt2) >> 16

        pkhbt r7, r12, r7, lsl #16 @ 7c | 3c
        pkhbt r11, r14, r11, lsl #16 @ 7s | 3s = temp1 (d) = t3 second half
        mov r14, #0x4 @ set up 4's
        orr r14, r14, #0x40000 @ 4|4
        uadd16 r6, r7, r6 @ 7c+7 | 3c+3 = temp2 (c) = t2 second half
        usub16 r12, r8, r6 @ c (o5 | o1) = t2
        uadd16 r6, r11, r1 @ d (o7 | o3) = t3
        uadd16 r10, r10, r14 @ t0 + 4
        uadd16 r9, r9, r14 @ t1 + 4
        uadd16 r7, r10, r6 @ a+d = dst{0,1}[0]
        usub16 r6, r10, r6 @ a-d = dst{0,1}[3]
        uadd16 r10, r9, r12 @ b+c = dst{0,1}[1]
        usub16 r1, r9, r12 @ b-c = dst{0,1}[2]

        mov r9, r6, asr #3 @ o[1][3]
        mov r12, r1, asr #3 @ o[1][2]
        pkhtb r8, r12, r7, asr #19 @ o[1][0,2]
        pkhtb r11, r9, r10, asr #19 @ o[1][1,3]

        asr r10, #3 @ o[0][1]
        pkhbt r7, r7, r1, lsl #13 @ o[0][0,2]
        pkhbt r10, r10, r6, lsl #13 @ o[0][1,3]

        uxtab16 r10, r10, r12, ror #8
        uxtab16 r11, r11, r9, ror #8

        orr r7, r7, r10, lsl #8
        orr r8, r8, r11, lsl #8

        str_post r7, r0, r2, lsl #1

@ void vp8_idct_dc_add(uint8_t *dst, DCTELEM block[16], int stride)
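@ DC-only IDCT: when only block[0] is set, the transform reduces to adding
@ one constant to all 16 pixels. Roughly, in illustrative C:
@
@   int dc = (block[0] + 4) >> 3;
@   block[0] = 0;
@   for (y = 0; y < 4; y++, dst += stride)
@       for (x = 0; x < 4; x++)
@           dst[x] = clip_uint8(dst[x] + dc);
@
@ The code splats dc into both halfword lanes of r3 so that uxtab16 can
@ add it to the unpacked pixel bytes four at a time.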
function ff_vp8_idct_dc_add_armv6, export=1
        ldr_post r5, r0, r2, lsl #1
        pkhbt r3, r3, r3, lsl #16

        uxtab16 lr, r3, r5 @ a1+2 | a1+0
        uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1
        uxtab16 r4, r3, r4, ror #8

        orr lr, lr, r5, lsl #8
        orr r12, r12, r4, lsl #8

        sub r0, r0, r2, lsl #1

        str_post lr, r0, r2, lsl #1

        uxtab16 r5, r3, r5, ror #8
        uxtab16 r4, r3, r4, ror #8

        orr lr, lr, r5, lsl #8
        orr r12, r12, r4, lsl #8

        str_post lr, r0, r2, lsl #1

@ void vp8_idct_dc_add4uv(uint8_t *dst, DCTELEM block[4][16], int stride)
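@ The four DC-only chroma sub-blocks cover an 8x8 area in a 2x2 layout,
@ so this is roughly equivalent to:
@
@   vp8_idct_dc_add(dst,                  block[0], stride);
@   vp8_idct_dc_add(dst + 4,              block[1], stride);
@   vp8_idct_dc_add(dst + stride * 4,     block[2], stride);
@   vp8_idct_dc_add(dst + stride * 4 + 4, block[3], stride);
@
@ (the luma variant below instead walks four sub-blocks left to right).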
function ff_vp8_idct_dc_add4uv_armv6, export=1
        bl ff_vp8_idct_dc_add_armv6
        sub r0, r0, r2, lsl #2
        bl ff_vp8_idct_dc_add_armv6
        bl ff_vp8_idct_dc_add_armv6
        sub r0, r0, r2, lsl #2
        bl ff_vp8_idct_dc_add_armv6

@ void vp8_idct_dc_add4y(uint8_t *dst, DCTELEM block[4][16], int stride)
function ff_vp8_idct_dc_add4y_armv6, export=1
        bl ff_vp8_idct_dc_add_armv6
        sub r0, r0, r2, lsl #2
        bl ff_vp8_idct_dc_add_armv6
        sub r0, r0, r2, lsl #2
        bl ff_vp8_idct_dc_add_armv6
        sub r0, r0, r2, lsl #2
        bl ff_vp8_idct_dc_add_armv6

@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
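@ Simple (edge-only) filter. The reference filters a pixel column of the
@ edge where 2*|p0-q0| + |p1-q1|/2 <= flim, then nudges the two pixels on
@ either side of it; roughly, in illustrative C:
@
@   w  = clip_int8(p1 - q1);
@   f  = clip_int8(w + 3 * (q0 - p0)) & mask;
@   f1 = clip_int8(f + 4) >> 3;
@   f2 = clip_int8(f + 3) >> 3;
@   q0 = q0 - f1;                /* pixels are XORed with 0x80 first so */
@   p0 = p0 + f2;                /* signed qadd8/qsub8 clamp correctly  */
@
@ Since ARMv6 has no per-byte arithmetic shift, the ">> 3" is built from
@ shadd8 (signed halving add) steps against a zero register.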
function ff_vp8_v_loop_filter16_simple_armv6, export=1
        ldr_dpren r3, r0, r1, lsl #1 @ p1
        ldr_dpren r4, r0, r1 @ p0
        ldr r6, [r0, r1] @ q1
        orr r2, r2, r2, lsl #16

        mov lr, #0 @ need 0 in a couple of places
        orr r12, r2, r2, lsl #8 @ splat int -> byte

        @ vp8_simple_filter_mask()
        uqsub8 r7, r3, r6 @ p1 - q1
        uqsub8 r8, r6, r3 @ q1 - p1
        uqsub8 r10, r4, r5 @ p0 - q0
        uqsub8 r11, r5, r4 @ q0 - p0
        orr r8, r8, r7 @ abs(p1 - q1)
        orr r10, r10, r11 @ abs(p0 - q0)
        uqadd8 r10, r10, r10 @ abs(p0 - q0) * 2
        uhadd8 r8, r8, lr @ abs(p1 - q1) >> 1
        uqadd8 r10, r10, r8 @ abs(p0 - q0)*2 + abs(p1 - q1)/2

        usub8 r10, r12, r10 @ compare to flimit. usub8 sets GE flags
        sel r10, r8, lr @ filter mask: F or 0
        beq 2f @ skip filtering if all masks are 0x00

        @ vp8_simple_filter()
        eor r3, r3, r2 @ p1 offset to convert to a signed value
        eor r6, r6, r2 @ q1 offset to convert to a signed value
        eor r4, r4, r2 @ p0 offset to convert to a signed value
        eor r5, r5, r2 @ q0 offset to convert to a signed value

        qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
        qsub8 r6, r5, r4 @ q0 - p0
        qadd8 r3, r3, r6 @ += q0 - p0
        qadd8 r3, r3, r6 @ += q0 - p0
        qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0)

        and r3, r3, r10 @ vp8_filter &= mask

        qadd8 r7, r3, r7 @ Filter1 = vp8_filter + 4
        qadd8 r8, r3, r8 @ Filter2 = vp8_filter + 3

        shadd8 r7, r7, lr @ Filter1 >>= 3
        shadd8 r8, r8, lr @ Filter2 >>= 3

        qsub8 r5, r5, r7 @ u = q0 - Filter1
        qadd8 r4, r4, r8 @ u = p0 + Filter2
        eor r5, r5, r2 @ *oq0 = u^0x80
        eor r4, r4, r2 @ *op0 = u^0x80

        str r5, [r0] @ store oq0 result
A       str r4, [r0, -r1] @ store op0 result

        subs r9, r9, #1 @ counter--
        add r0, r0, #4 @ next row

A       ldrne r3, [r0, -r1, lsl #1] @ p1
T       subne r3, r0, r1, lsl #1
T       ldrne r3, [r3] @ p1
A       ldrne r4, [r0, -r1] @ p0
T       ldrne r4, [r4] @ p0
        ldrne r6, [r0, r1] @ q1

c0x01010101: .long 0x01010101
c0x03030303: .long 0x03030303
c0x04040404: .long 0x04040404
c0x7F7F7F7F: .long 0x7F7F7F7F
c0x80808080: .long 0x80808080

@ void vp8_v_loop_filter16_inner(uint8_t *dst, int stride,
@                                int fE, int fI, int hev_thresh)
@ void vp8_v_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
@                                 int fE, int fI, int hev_thresh)
@ void vp8_v_loop_filter_inner(uint8_t *dst, int stride,
@                              int fE, int fI, int hev_thresh, int count)
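@ Inner-edge filter. The reference's normal_limit() admits a pixel column
@ when 2*|p0-q0| + |p1-q1|/2 <= flimE and every neighbouring pair along
@ p3..q3 differs by at most flimI; hev() flags columns with
@ |p1-p0| > thresh or |q1-q0| > thresh. The filter itself, roughly, in
@ illustrative C:
@
@   w  = clip_int8(p1 - q1) on hev columns, else 0;
@   f  = clip_int8(w + 3 * (q0 - p0)) & mask;
@   f1 = clip_int8(f + 4) >> 3;    q0 -= f1;
@   f2 = clip_int8(f + 3) >> 3;    p0 += f2;
@   a  = (f1 + 1) >> 1 on non-hev columns;   p1 += a;   q1 -= a;
@
@ All comparisons run bytewise on four columns at once via uqsub8 (whose
@ two saturated differences OR together into |x-y|) and usub8/sel masks.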
function ff_vp8_v_loop_filter_inner_armv6, export=1
        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
        ldr r5, [sp, #40] @ counter
        ldr r6, [sp, #36] @ load thresh address
        sub sp, sp, #16 @ create temp buffer

        ldr r10, [r0, r1] @ p2
        ldr_post r9, r0, r1, lsl #1 @ p3
        ldr r12, [r0, r1] @ p0
        ldr_post r11, r0, r1, lsl #1 @ p1

        orr r2, r2, r2, lsl #16
        orr r3, r3, r3, lsl #16
        orr r6, r6, r6, lsl #16
        orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
        orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
        orr r3, r6, r6, lsl #8 @ thresh splat int -> byte

        @ vp8_filter_mask() function
        @ calculate breakout conditions
        uqsub8 r6, r9, r10 @ p3 - p2
        uqsub8 r7, r10, r9 @ p2 - p3
        uqsub8 r8, r10, r11 @ p2 - p1
        uqsub8 r10, r11, r10 @ p1 - p2

        orr r6, r6, r7 @ abs (p3-p2)
        orr r8, r8, r10 @ abs (p2-p1)
        uqsub8 lr, r6, r2 @ compare to limit. lr: vp8_filter_mask
        uqsub8 r8, r8, r2 @ compare to limit
        uqsub8 r6, r11, r12 @ p1 - p0

        uqsub8 r7, r12, r11 @ p0 - p1
        ldr r10, [r0, r1] @ q1
        ldr_post r9, r0, r1, lsl #1 @ q0
        orr r6, r6, r7 @ abs (p1-p0)
        uqsub8 r7, r6, r2 @ compare to limit
        uqsub8 r8, r6, r3 @ compare to thresh -- save r8 for later

        uqsub8 r6, r11, r10 @ p1 - q1
        uqsub8 r7, r10, r11 @ q1 - p1
        uqsub8 r11, r12, r9 @ p0 - q0
        uqsub8 r12, r9, r12 @ q0 - p0
        orr r6, r6, r7 @ abs (p1-q1)

        orr r12, r11, r12 @ abs (p0-q0)
        ldr_post r11, r0, r1 @ q2
        uqadd8 r12, r12, r12 @ abs (p0-q0) * 2
        and r6, r7, r6, lsr #1 @ abs (p1-q1) / 2
        uqsub8 r7, r9, r10 @ q0 - q1
        uqadd8 r12, r12, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
        uqsub8 r6, r10, r9 @ q1 - q0
        uqsub8 r12, r12, r4 @ compare to flimit
        uqsub8 r9, r11, r10 @ q2 - q1

        ldr_post r12, r0, r1 @ q3
        uqsub8 r10, r10, r11 @ q1 - q2
        orr r6, r7, r6 @ abs (q1-q0)
        orr r10, r9, r10 @ abs (q2-q1)
        uqsub8 r7, r6, r2 @ compare to limit
        uqsub8 r10, r10, r2 @ compare to limit
        uqsub8 r6, r6, r3 @ compare to thresh -- save r6 for later

        uqsub8 r10, r12, r11 @ q3 - q2
        uqsub8 r9, r11, r12 @ q2 - q3

        mvn r11, #0 @ r11 == -1

        orr r10, r10, r9 @ abs (q3-q2)
        uqsub8 r10, r10, r2 @ compare to limit

        sub r0, r0, r1, lsl #2

        usub8 lr, r12, lr @ use usub8 instead of ssub8
        sel lr, r11, r12 @ filter mask: lr

        beq 2f @ skip filtering

        sub r0, r0, r1, lsl #1 @ move r0 pointer down by 6 lines

        @ vp8_hevmask() function
        @ calculate high edge variance
        orr r10, r6, r8 @ calculate vp8_hevmask

        usub8 r10, r12, r10 @ use usub8 instead of ssub8
        sel r6, r12, r11 @ obtain vp8_hevmask: r6

        @ vp8_filter() function
        ldr r8, [r0, r1] @ p0
        ldr_post r7, r0, r1, lsl #1 @ p1

        ldr r10, [r0, r1] @ q1
        ldr_post r9, r0, r1, lsl #1 @ q0

        eor r7, r7, r12 @ p1 offset to convert to a signed value
        eor r8, r8, r12 @ p0 offset to convert to a signed value
        eor r9, r9, r12 @ q0 offset to convert to a signed value
        eor r10, r10, r12 @ q1 offset to convert to a signed value

        str r9, [sp] @ store qs0 temporarily
        str r8, [sp, #4] @ store ps0 temporarily
        str r10, [sp, #8] @ store qs1 temporarily
        str r7, [sp, #12] @ store ps1 temporarily

        qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
        qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))

        and r7, r7, r6 @ vp8_filter (r7) &= hev

        ldr r9, c0x03030303 @ r9 = 3 --modified for vp8

        and r7, r7, lr @ vp8_filter &= mask

        qadd8 r8, r7, r9 @ Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
        qadd8 r7, r7, r10 @ vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

        shadd8 r8, r8, r9 @ Filter2 >>= 3
        shadd8 r7, r7, r9 @ vp8_filter >>= 3

        shadd8 lr, r8, r9 @ lr: Filter2
        shadd8 r7, r7, r9 @ r7: filter

        ldr r8, [sp] @ load qs0
        ldr r9, [sp, #4] @ load ps0

        qsub8 r8, r8, r7 @ u = vp8_signed_char_clamp(qs0 - vp8_filter)
        qadd8 r9, r9, lr @ u = vp8_signed_char_clamp(ps0 + Filter2)

        sadd8 r7, r7, r10 @ vp8_filter += 1
        shadd8 r7, r7, lr @ vp8_filter >>= 1

        ldr r11, [sp, #12] @ load ps1
        ldr r10, [sp, #8] @ load qs1

        bic r7, r7, r6 @ vp8_filter &= ~hev
        sub r0, r0, r1, lsl #2

        qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
        qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter)

        eor r11, r11, r12 @ *op1 = u^0x80
        eor r9, r9, r12 @ *op0 = u^0x80
        eor r8, r8, r12 @ *oq0 = u^0x80
        eor r10, r10, r12 @ *oq1 = u^0x80
        str r9, [r0, r1] @ store op0 result
        str_post r11, r0, r1, lsl #1 @ store op1
        str r10, [r0, r1] @ store oq1
        str_post r8, r0, r1, lsl #1 @ store oq0 result

        sub r0, r0, r1, lsl #1

        sub r0, r0, r1, lsl #2

        ldrne r10, [r0, r1] @ p2
A       ldrne r9, [r0], r1, lsl #1 @ p3
T       ldrne r9, [r0] @ p3
T       addne r0, r0, r1, lsl #1

        ldrne r12, [r0, r1] @ p0
A       ldrne r11, [r0], r1, lsl #1 @ p1
T       ldrne r11, [r0] @ p1
T       addne r0, r0, r1, lsl #1

@ void vp8_v_loop_filter16(uint8_t *dst, int stride,
@                          int fE, int fI, int hev_thresh)
@ void vp8_v_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride,
@                           int fE, int fI, int hev_thresh)
@ void vp8_v_loop_filter(uint8_t *dst, int stride,
@                        int fE, int fI, int hev_thresh, int count)
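@ Macroblock-edge filter. hev columns take the +-f1/+-f2 step from the
@ inner filter above; the rest spread a correction w across three pixels
@ on each side of the edge with tapering weights. Roughly, in
@ illustrative C:
@
@   w  = clip_int8(clip_int8(p1 - q1) + 3 * (q0 - p0)) & mask & ~hev;
@   a0 = clip_int8((27 * w + 63) >> 7);   p0 += a0;   q0 -= a0;
@   a1 = clip_int8((18 * w + 63) >> 7);   p1 += a1;   q1 -= a1;
@   a2 = clip_int8(( 9 * w + 63) >> 7);   p2 += a2;   q2 -= a2;
@
@ The sxtb16/smlabb/smlatb/ssat runs below evaluate those products on
@ sign-extended 16-bit lanes, two columns per multiply.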
function ff_vp8_v_loop_filter_armv6, export=1
        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
        ldr r5, [sp, #40] @ counter
        ldr r6, [sp, #36] @ load thresh address
        sub sp, sp, #16 @ create temp buffer

        ldr r10, [r0, r1] @ p2
        ldr_post r9, r0, r1, lsl #1 @ p3
        ldr r12, [r0, r1] @ p0
        ldr_post r11, r0, r1, lsl #1 @ p1

        orr r2, r2, r2, lsl #16
        orr r3, r3, r3, lsl #16
        orr r6, r6, r6, lsl #16
        orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
        orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
        orr r3, r6, r6, lsl #8 @ thresh splat int -> byte

        @ vp8_filter_mask() function
        @ calculate breakout conditions
        uqsub8 r6, r9, r10 @ p3 - p2
        uqsub8 r7, r10, r9 @ p2 - p3
        uqsub8 r8, r10, r11 @ p2 - p1
        uqsub8 r10, r11, r10 @ p1 - p2

        orr r6, r6, r7 @ abs (p3-p2)
        orr r8, r8, r10 @ abs (p2-p1)
        uqsub8 lr, r6, r2 @ compare to limit. lr: vp8_filter_mask
        uqsub8 r8, r8, r2 @ compare to limit

        uqsub8 r6, r11, r12 @ p1 - p0

        uqsub8 r7, r12, r11 @ p0 - p1
        ldr r10, [r0, r1] @ q1
        ldr_post r9, r0, r1, lsl #1 @ q0
        orr r6, r6, r7 @ abs (p1-p0)
        uqsub8 r7, r6, r2 @ compare to limit
        uqsub8 r8, r6, r3 @ compare to thresh -- save r8 for later

        uqsub8 r6, r11, r10 @ p1 - q1
        uqsub8 r7, r10, r11 @ q1 - p1
        uqsub8 r11, r12, r9 @ p0 - q0
        uqsub8 r12, r9, r12 @ q0 - p0
        orr r6, r6, r7 @ abs (p1-q1)

        orr r12, r11, r12 @ abs (p0-q0)
        ldr_post r11, r0, r1 @ q2
        uqadd8 r12, r12, r12 @ abs (p0-q0) * 2
        and r6, r7, r6, lsr #1 @ abs (p1-q1) / 2
        uqsub8 r7, r9, r10 @ q0 - q1
        uqadd8 r12, r12, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
        uqsub8 r6, r10, r9 @ q1 - q0
        uqsub8 r12, r12, r4 @ compare to flimit
        uqsub8 r9, r11, r10 @ q2 - q1

        ldr_post r12, r0, r1 @ q3

        uqsub8 r10, r10, r11 @ q1 - q2
        orr r6, r7, r6 @ abs (q1-q0)
        orr r10, r9, r10 @ abs (q2-q1)
        uqsub8 r7, r6, r2 @ compare to limit
        uqsub8 r10, r10, r2 @ compare to limit
        uqsub8 r6, r6, r3 @ compare to thresh -- save r6 for later

        uqsub8 r10, r12, r11 @ q3 - q2
        uqsub8 r9, r11, r12 @ q2 - q3

        mvn r11, #0 @ r11 == -1

        orr r10, r10, r9 @ abs (q3-q2)
        uqsub8 r10, r10, r2 @ compare to limit

        usub8 lr, r12, lr @ use usub8 instead of ssub8
        sel lr, r11, r12 @ filter mask: lr

        beq 2f @ skip filtering

        @ vp8_hevmask() function
        @ calculate high edge variance
        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 6 lines
        sub r0, r0, r1, lsl #1

        sel r6, r12, r11 @ hev mask: r6

        @ vp8_mbfilter() function
        @ p2, q2 are only needed at the end; no need to load them in now.
        ldr r8, [r0, r1] @ p0
        ldr_post r7, r0, r1, lsl #1 @ p1

        ldr_post r9, r0, r1 @ q0

        eor r7, r7, r12 @ ps1
        eor r8, r8, r12 @ ps0
        eor r9, r9, r12 @ qs0
        eor r10, r10, r12 @ qs1

        qsub8 r12, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
        str r7, [sp, #12] @ store ps1 temporarily
        qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
        str r10, [sp, #8] @ store qs1 temporarily

        str r9, [sp] @ store qs0 temporarily

        str r8, [sp, #4] @ store ps0 temporarily
        qadd8 r7, r7, r12 @ vp8_filter: r7

        ldr r10, c0x03030303 @ r10 = 3 --modified for vp8

        and r7, r7, lr @ vp8_filter &= mask (lr is free)

        mov r12, r7 @ Filter2: r12
        and r12, r12, r6 @ Filter2 &= hev

        @ save bottom 3 bits so that we round one side +4 and the other +3
        qadd8 r8, r12, r9 @ Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
        qadd8 r12, r12, r10 @ Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

        shadd8 r8, r8, r10 @ Filter1 >>= 3
        shadd8 r12, r12, r10 @ Filter2 >>= 3

        shadd8 r8, r8, r10 @ r8: Filter1
        shadd8 r12, r12, r10 @ r12: Filter2

        ldr r9, [sp] @ load qs0
        ldr r11, [sp, #4] @ load ps0

        qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(qs0 - Filter1)
        qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(ps0 + Filter2)

        bic r12, r7, r6 @ vp8_filter &= ~hev (r6 is free)

        @ roughly 3/7th difference across boundary
        sxtb16 r10, r12, ror #8
        smlabb r8, r6, lr, r7
        smlatb r6, r6, lr, r7
        smlabb r7, r10, lr, r7

        ssat r8, #8, r8, asr #7
        ssat r6, #8, r6, asr #7

        ssat r7, #8, r7, asr #7
        ssat r10, #8, r10, asr #7

        pkhbt r6, r8, r6, lsl #16
        pkhbt r10, r7, r10, lsl #16

        orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
        qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
        eor r8, r8, lr @ *oq0 = s^0x80
        str r8, [r0] @ store *oq0

        eor r10, r10, lr @ *op0 = s^0x80
        str r10, [r0] @ store *op0

        @ roughly 2/7th difference across boundary
        sxtb16 r10, r12, ror #8
        smlabb r8, r6, lr, r7
        smlatb r6, r6, lr, r7
        smlabb r9, r10, lr, r7
        smlatb r10, r10, lr, r7
        ssat r8, #8, r8, asr #7
        ssat r6, #8, r6, asr #7
        ssat r9, #8, r9, asr #7
        ssat r10, #8, r10, asr #7

        pkhbt r6, r8, r6, lsl #16
        pkhbt r10, r9, r10, lsl #16

        ldr r9, [sp, #8] @ load qs1
        ldr r11, [sp, #12] @ load ps1

        orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        qadd8 r11, r11, r10 @ s = vp8_signed_char_clamp(ps1 + u)
        qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1 - u)
        eor r11, r11, lr @ *op1 = s^0x80
        str_post r11, r0, r1 @ store *op1
        eor r8, r8, lr @ *oq1 = s^0x80
        add r0, r0, r1, lsl #1

        str_post r8, r0, r1 @ store *oq1

        @ roughly 1/7th difference across boundary

        ldr r9, [r0] @ load q2

        sxtb16 r10, r12, ror #8
        smlabb r8, r6, lr, r7
        smlatb r6, r6, lr, r7
        smlabb r12, r10, lr, r7
        smlatb r10, r10, lr, r7
        ssat r8, #8, r8, asr #7
        ssat r6, #8, r6, asr #7
        ssat r12, #8, r12, asr #7
        ssat r10, #8, r10, asr #7

        sub r0, r0, r1, lsl #2

        pkhbt r6, r8, r6, lsl #16
        pkhbt r10, r12, r10, lsl #16

        ldr r11, [r0] @ load p2

        orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

        qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2 + u)
        qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2 - u)
        eor r8, r8, lr @ *op2 = s^0x80
        str_post r8, r0, r1, lsl #2 @ store *op2

        eor r10, r10, lr @ *oq2 = s^0x80
        str_post r10, r0, r1, lsl #1 @ store *oq2

        sub r0, r0, r1, lsl #3

        ldrne r10, [r0, r1] @ p2
A       ldrne r9, [r0], r1, lsl #1 @ p3
T       ldrne r9, [r0] @ p3
T       addne r0, r0, r1, lsl #1

        ldrne r12, [r0, r1] @ p0
A       ldrne r11, [r0], r1, lsl #1 @ p1
T       ldrne r11, [r0] @ p1
T       addne r0, r0, r1, lsl #1

.macro TRANSPOSE_MATRIX i0, i1, i2, i3, o3, o2, o1, o0
        @ input:  i0, i1, i2, i3
        @ output: o3, o2, o1, o0
        uxtb16 \o1, \i1 @ xx 12 xx 10
        uxtb16 \o0, \i0 @ xx 02 xx 00
        uxtb16 \o3, \i3 @ xx 32 xx 30
        uxtb16 \o2, \i2 @ xx 22 xx 20
        orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00
        orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20

        uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11
        uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31
        uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01
        uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21
        orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01
        orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21

        pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02 -- p1
        pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00 -- p3

        pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03 -- p0
        pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01 -- p2
.endm
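@ The macro treats its four input words as the rows of a 4x4 byte matrix
@ and writes the transposed matrix to the four output words, so four
@ vertically adjacent pixels of one image column end up packed in a
@ single register. The horizontal loop filters below rely on this to
@ reuse the byte-parallel logic of the vertical filters on vertical
@ edges.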
@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
function ff_vp8_h_loop_filter16_simple_armv6, export=1
        orr r12, r2, r2, lsl #16

        orr r12, r12, r12, lsl #8

        @ load source data to r7, r8, r9, r10
        ldr_post r7, r0, r1, lsl #1

        ldr_post r9, r0, r1, lsl #1

        mov r11, #4 @ count (r11) for 4-in-parallel

        @ transpose r7, r8, r9, r10 to r3, r4, r5, r6
        TRANSPOSE_MATRIX r7, r8, r9, r10, r6, r5, r4, r3

        @ vp8_simple_filter_mask() function
        uqsub8 r7, r3, r6 @ p1 - q1
        uqsub8 r8, r6, r3 @ q1 - p1
        uqsub8 r9, r4, r5 @ p0 - q0
        uqsub8 r10, r5, r4 @ q0 - p0
        orr r7, r7, r8 @ abs(p1 - q1)
        orr r9, r9, r10 @ abs(p0 - q0)

        uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2
        uhadd8 r7, r7, r8 @ abs(p1 - q1) / 2
        uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1 - q1)/2
        mvn r10, #0 @ r10 == -1

        usub8 r7, r12, r7 @ compare to flimit
        sel lr, r10, r8 @ filter mask

        beq 2f @ skip filtering

        @ vp8_simple_filter() function
        eor r3, r3, r2 @ p1 offset to convert to a signed value
        eor r6, r6, r2 @ q1 offset to convert to a signed value
        eor r4, r4, r2 @ p0 offset to convert to a signed value
        eor r5, r5, r2 @ q0 offset to convert to a signed value

        qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
        qsub8 r6, r5, r4 @ q0 - p0

        qadd8 r3, r3, r6 @ vp8_filter += q0 - p0
        ldr r9, c0x03030303 @ r9 = 3

        qadd8 r3, r3, r6 @ vp8_filter += q0 - p0

        qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0)

        and r3, r3, lr @ vp8_filter &= mask

        qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3
        qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4

        shadd8 r9, r9, r8 @ Filter2 >>= 3
        shadd8 r3, r3, r8 @ Filter1 >>= 3

        sub r0, r0, r1, lsl #2

        qadd8 r4, r4, r9 @ u = p0 + Filter2
        qsub8 r5, r5, r3 @ u = q0 - Filter1
        eor r4, r4, r2 @ *op0 = u^0x80
        eor r5, r5, r2 @ *oq0 = u^0x80

        strb r4, [r0, #-1] @ store the result

        strb_post r5, r0, r1

        strb_post r5, r0, r1

        strb_post r5, r0, r1

        strb_post r5, r0, r1

        @ load source data to r7, r8, r9, r10
A       ldrne r7, [r0], r1, lsl #1
T       addne r0, r0, r1, lsl #1

A       ldrne r9, [r0], r1, lsl #1
T       addne r0, r0, r1, lsl #1

@ void vp8_h_loop_filter16_inner(uint8_t *dst, int stride,
@                                int fE, int fI, int hev_thresh)
@ void vp8_h_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
@                                 int fE, int fI, int hev_thresh)
@ void vp8_h_loop_filter_inner(uint8_t *dst, int stride,
@                              int fE, int fI, int hev_thresh, int count)
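@ Horizontal (vertical-edge) inner filter: 4x4 tiles of source pixels are
@ loaded, run through TRANSPOSE_MATRIX so p3..q3 again line up four per
@ register, filtered exactly as in the vertical version above, then
@ transposed back and written out with halfword stores.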
function ff_vp8_h_loop_filter_inner_armv6, export=1
        sub r0, r0, #4 @ move r0 pointer down by 4
        ldr r5, [sp, #40] @ counter
        ldr r9, [sp, #36] @ load thresh address
        sub sp, sp, #16 @ create temp buffer

        ldr r7, [r0, r1] @ transpose will make it into p3-p0
        ldr_post r6, r0, r1, lsl #1 @ load source data

        ldr_post r8, r0, r1, lsl #1

        orr r2, r2, r2, lsl #16
        orr r3, r3, r3, lsl #16
        orr r9, r9, r9, lsl #16
        orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
        orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
        orr r3, r9, r9, lsl #8 @ thresh splat int -> byte

        @ vp8_filter_mask() function
        @ calculate breakout conditions
        @ transpose the source data for 4-in-parallel operation
        TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9

        uqsub8 r7, r9, r10 @ p3 - p2
        uqsub8 r8, r10, r9 @ p2 - p3
        uqsub8 r9, r10, r11 @ p2 - p1
        uqsub8 r10, r11, r10 @ p1 - p2
        orr r7, r7, r8 @ abs (p3-p2)
        orr r10, r9, r10 @ abs (p2-p1)
        uqsub8 lr, r7, r2 @ compare to limit. lr: vp8_filter_mask
        uqsub8 r10, r10, r2 @ compare to limit

        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines

        uqsub8 r6, r11, r12 @ p1 - p0
        uqsub8 r7, r12, r11 @ p0 - p1
        add r0, r0, #4 @ move r0 pointer up by 4
        orr r6, r6, r7 @ abs (p1-p0)
        str r11, [sp, #12] @ save p1
        uqsub8 r10, r6, r2 @ compare to limit
        uqsub8 r11, r6, r3 @ compare to thresh

        @ transpose uses 8 regs (r6 - r12 and lr); need to save reg values now
        @ transpose the source data for 4-in-parallel operation
        str r11, [sp] @ push r11 to stack

        ldr_post r6, r0, r1, lsl #1 @ load source data
        str r12, [sp, #4] @ save current reg before load q0 - q3 data

        ldr_post r8, r0, r1, lsl #1

        TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9

        ldr lr, [sp, #8] @ load back (f)limit accumulator

        uqsub8 r6, r12, r11 @ q3 - q2
        uqsub8 r7, r11, r12 @ q2 - q3
        uqsub8 r12, r11, r10 @ q2 - q1
        uqsub8 r11, r10, r11 @ q1 - q2
        orr r6, r6, r7 @ abs (q3-q2)
        orr r7, r12, r11 @ abs (q2-q1)
        uqsub8 r6, r6, r2 @ compare to limit
        uqsub8 r7, r7, r2 @ compare to limit
        ldr r11, [sp, #4] @ load back p0
        ldr r12, [sp, #12] @ load back p1

        uqsub8 r6, r11, r9 @ p0 - q0
        uqsub8 r7, r9, r11 @ q0 - p0
        uqsub8 r8, r12, r10 @ p1 - q1
        uqsub8 r11, r10, r12 @ q1 - p1
        orr r6, r6, r7 @ abs (p0-q0)

        orr r8, r8, r11 @ abs (p1-q1)
        uqadd8 r6, r6, r6 @ abs (p0-q0) * 2
        and r8, r7, r8, lsr #1 @ abs (p1-q1) / 2
        uqsub8 r11, r10, r9 @ q1 - q0
        uqadd8 r6, r8, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
        uqsub8 r12, r9, r10 @ q0 - q1
        uqsub8 r6, r6, r4 @ compare to flimit

        orr r9, r11, r12 @ abs (q1-q0)
        uqsub8 r8, r9, r2 @ compare to limit
        uqsub8 r10, r9, r3 @ compare to thresh

        mvn r11, #0 @ r11 == -1

        ldr r9, [sp] @ load the compared result
        sel lr, r11, r12 @ filter mask: lr

        beq 2f @ skip filtering

        @ vp8_hevmask() function
        @ calculate high edge variance
        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines

        ldrh_post r8, r0, r1

        sel r6, r12, r11 @ hev mask: r6

        @ vp8_filter() function
        @ load source data to r6, r11, r12, lr
        ldrh_post r10, r0, r1

        pkhbt r12, r7, r8, lsl #16

        ldrh_post r8, r0, r1

        pkhbt r11, r9, r10, lsl #16

        ldrh_post r10, r0, r1

        @ Transpose needs 8 regs (r6 - r12 and lr); save r6 and lr first
        pkhbt r6, r7, r8, lsl #16
        pkhbt lr, r9, r10, lsl #16

        @ transpose r12, r11, r6, lr to r7, r8, r9, r10
        TRANSPOSE_MATRIX r12, r11, r6, lr, r10, r9, r8, r7

        @ load back hev_mask r6 and filter_mask lr
        ldr r12, c0x80808080

        eor r7, r7, r12 @ p1 offset to convert to a signed value
        eor r8, r8, r12 @ p0 offset to convert to a signed value
        eor r9, r9, r12 @ q0 offset to convert to a signed value
        eor r10, r10, r12 @ q1 offset to convert to a signed value

        str r9, [sp] @ store qs0 temporarily
        str r8, [sp, #4] @ store ps0 temporarily
        str r10, [sp, #8] @ store qs1 temporarily
        str r7, [sp, #12] @ store ps1 temporarily

        qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
        qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))

        and r7, r7, r6 @ vp8_filter (r7) &= hev

        ldr r9, c0x03030303 @ r9 = 3 --modified for vp8

        ldr r10, c0x04040404

        and r7, r7, lr @ vp8_filter &= mask

        qadd8 r8, r7, r9 @ Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
        qadd8 r7, r7, r10 @ vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

        shadd8 r8, r8, r9 @ Filter2 >>= 3
        shadd8 r7, r7, r9 @ vp8_filter >>= 3

        shadd8 lr, r8, r9 @ lr: Filter2
        shadd8 r7, r7, r9 @ r7: filter

        ldr r8, [sp] @ load qs0
        ldr r9, [sp, #4] @ load ps0

        ldr r10, c0x01010101

        qsub8 r8, r8, r7 @ u = vp8_signed_char_clamp(qs0 - vp8_filter)
        qadd8 r9, r9, lr @ u = vp8_signed_char_clamp(ps0 + Filter2)

        ldr r10, [sp, #8] @ load qs1
        ldr r11, [sp, #12] @ load ps1

        bic r7, r7, r6 @ r7: vp8_filter

        qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
        qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter)

        sub r0, r0, r1, lsl #2

        @ we can use the TRANSPOSE_MATRIX macro to transpose the output - input: q1, q0, p0, p1
        TRANSPOSE_MATRIX r11, r9, r8, r10, lr, r12, r7, r6

        strh r6, [r0, #-2] @ store the result

        strh_post r6, r0, r1

        strh_post r7, r0, r1

        mov r12, r12, lsr #16
        strh_post r12, r0, r1

        strh_post lr, r0, r1

A       ldrne r6, [r0], r1, lsl #1 @ load source data
T       ldrne r6, [r0] @ load source data
T       addne r0, r0, r1, lsl #1

A       ldrne r8, [r0], r1, lsl #1
T       addne r0, r0, r1, lsl #1

@ void vp8_h_loop_filter16(uint8_t *dst, int stride,
@                          int fE, int fI, int hev_thresh)
@ void vp8_h_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride,
@                           int fE, int fI, int hev_thresh)
@ void vp8_h_loop_filter(uint8_t *dst, int stride,
@                        int fE, int fI, int hev_thresh, int count)
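@ Horizontal (vertical-edge) macroblock filter: the transposed-load
@ counterpart of ff_vp8_v_loop_filter_armv6 above, writing its six output
@ pixels per column back with byte stores around the edge.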
function ff_vp8_h_loop_filter_armv6, export=1
        sub r0, r0, #4 @ move r0 pointer down by 4
        ldr r5, [sp, #40] @ counter
        ldr r9, [sp, #36] @ load thresh address
        sub sp, sp, #16 @ create temp buffer

        ldr r7, [r0, r1] @ transpose will make it into p3-p0
        ldr_post r6, r0, r1, lsl #1 @ load source data

        ldr_post r8, r0, r1, lsl #1

        orr r2, r2, r2, lsl #16
        orr r3, r3, r3, lsl #16
        orr r9, r9, r9, lsl #16
        orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
        orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
        orr r3, r9, r9, lsl #8 @ thresh splat int -> byte

        @ vp8_filter_mask() function
        @ calculate breakout conditions
        @ transpose the source data for 4-in-parallel operation
        TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9

        uqsub8 r7, r9, r10 @ p3 - p2
        uqsub8 r8, r10, r9 @ p2 - p3
        uqsub8 r9, r10, r11 @ p2 - p1
        uqsub8 r10, r11, r10 @ p1 - p2
        orr r7, r7, r8 @ abs (p3-p2)
        orr r10, r9, r10 @ abs (p2-p1)
        uqsub8 lr, r7, r2 @ compare to limit. lr: vp8_filter_mask
        uqsub8 r10, r10, r2 @ compare to limit

        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines

        uqsub8 r6, r11, r12 @ p1 - p0
        uqsub8 r7, r12, r11 @ p0 - p1
        add r0, r0, #4 @ move r0 pointer up by 4
        orr r6, r6, r7 @ abs (p1-p0)
        str r11, [sp, #12] @ save p1
        uqsub8 r10, r6, r2 @ compare to limit
        uqsub8 r11, r6, r3 @ compare to thresh

        @ transpose uses 8 regs (r6 - r12 and lr); need to save reg values now
        @ transpose the source data for 4-in-parallel operation
        str r11, [sp] @ push r11 to stack

        ldr_post r6, r0, r1, lsl #1 @ load source data
        str r12, [sp, #4] @ save current reg before load q0 - q3 data

        ldr_post r8, r0, r1, lsl #1

        TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9

        ldr lr, [sp, #8] @ load back (f)limit accumulator

        uqsub8 r6, r12, r11 @ q3 - q2
        uqsub8 r7, r11, r12 @ q2 - q3
        uqsub8 r12, r11, r10 @ q2 - q1
        uqsub8 r11, r10, r11 @ q1 - q2
        orr r6, r6, r7 @ abs (q3-q2)
        orr r7, r12, r11 @ abs (q2-q1)
        uqsub8 r6, r6, r2 @ compare to limit
        uqsub8 r7, r7, r2 @ compare to limit
        ldr r11, [sp, #4] @ load back p0
        ldr r12, [sp, #12] @ load back p1

        uqsub8 r6, r11, r9 @ p0 - q0
        uqsub8 r7, r9, r11 @ q0 - p0
        uqsub8 r8, r12, r10 @ p1 - q1
        uqsub8 r11, r10, r12 @ q1 - p1
        orr r6, r6, r7 @ abs (p0-q0)

        orr r8, r8, r11 @ abs (p1-q1)
        uqadd8 r6, r6, r6 @ abs (p0-q0) * 2
        and r8, r7, r8, lsr #1 @ abs (p1-q1) / 2
        uqsub8 r11, r10, r9 @ q1 - q0
        uqadd8 r6, r8, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
        uqsub8 r12, r9, r10 @ q0 - q1
        uqsub8 r6, r6, r4 @ compare to flimit

        orr r9, r11, r12 @ abs (q1-q0)
        uqsub8 r8, r9, r2 @ compare to limit
        uqsub8 r10, r9, r3 @ compare to thresh

        mvn r11, #0 @ r11 == -1

        ldr r9, [sp] @ load the compared result
        sel lr, r11, r12 @ filter mask: lr

        beq 2f @ skip filtering

        @ vp8_hevmask() function
        @ calculate high edge variance
        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines

        ldrh_post r8, r0, r1

        sel r6, r12, r11 @ hev mask: r6

        @ vp8_mbfilter() function
        @ p2, q2 are only needed at the end; no need to load them in now.
        @ Transpose needs 8 regs (r6 - r12 and lr); save r6 and lr first
        @ load source data to r6, r11, r12, lr
        ldrh_post r10, r0, r1

        pkhbt r12, r7, r8, lsl #16

        ldrh_post r8, r0, r1

        pkhbt r11, r9, r10, lsl #16

        ldrh_post r10, r0, r1

        str r6, [sp] @ save r6
        str lr, [sp, #4] @ save lr

        pkhbt r6, r7, r8, lsl #16
        pkhbt lr, r9, r10, lsl #16

        @ transpose r12, r11, r6, lr to p1, p0, q0, q1
        TRANSPOSE_MATRIX r12, r11, r6, lr, r10, r9, r8, r7

        @ load back hev_mask r6 and filter_mask lr
        ldr r12, c0x80808080

        eor r7, r7, r12 @ ps1
        eor r8, r8, r12 @ ps0
        eor r9, r9, r12 @ qs0
        eor r10, r10, r12 @ qs1

        qsub8 r12, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
        str r7, [sp, #12] @ store ps1 temporarily
        qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
        str r10, [sp, #8] @ store qs1 temporarily

        str r9, [sp] @ store qs0 temporarily

        str r8, [sp, #4] @ store ps0 temporarily
        qadd8 r7, r7, r12 @ vp8_filter: r7

        ldr r10, c0x03030303 @ r10 = 3 --modified for vp8

        and r7, r7, lr @ vp8_filter &= mask (lr is free)

        mov r12, r7 @ Filter2: r12
        and r12, r12, r6 @ Filter2 &= hev

        @ save bottom 3 bits so that we round one side +4 and the other +3
        qadd8 r8, r12, r9 @ Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
        qadd8 r12, r12, r10 @ Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

        shadd8 r8, r8, r10 @ Filter1 >>= 3
        shadd8 r12, r12, r10 @ Filter2 >>= 3

        shadd8 r12, r12, r10
        shadd8 r8, r8, r10 @ r8: Filter1
        shadd8 r12, r12, r10 @ r12: Filter2

        ldr r9, [sp] @ load qs0
        ldr r11, [sp, #4] @ load ps0

        qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(qs0 - Filter1)
        qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(ps0 + Filter2)

        bic r12, r7, r6 @ vp8_filter &= ~hev (r6 is free)

        @ roughly 3/7th difference across boundary
        sxtb16 r10, r12, ror #8
        smlabb r8, r6, lr, r7
        smlatb r6, r6, lr, r7
        smlabb r7, r10, lr, r7

        ssat r8, #8, r8, asr #7
        ssat r6, #8, r6, asr #7

        ssat r7, #8, r7, asr #7
        ssat r10, #8, r10, asr #7

        pkhbt r6, r8, r6, lsl #16
        pkhbt r10, r7, r10, lsl #16

        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines

        orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
        qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
        eor r8, r8, lr @ *oq0 = s^0x80
        eor r10, r10, lr @ *op0 = s^0x80

        strb r10, [r0, #-1] @ store op0 result
        strb_post r8, r0, r1 @ store oq0 result
        mov r10, r10, lsr #8

        strb_post r8, r0, r1
        mov r10, r10, lsr #8

        strb_post r8, r0, r1
        mov r10, r10, lsr #8

        strb_post r8, r0, r1

        @ roughly 2/7th difference across boundary
        sxtb16 r10, r12, ror #8
        smlabb r8, r6, lr, r7
        smlatb r6, r6, lr, r7
        smlabb r9, r10, lr, r7
        smlatb r10, r10, lr, r7
        ssat r8, #8, r8, asr #7
        ssat r6, #8, r6, asr #7
        ssat r9, #8, r9, asr #7
        ssat r10, #8, r10, asr #7

        sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines

        pkhbt r6, r8, r6, lsl #16
        pkhbt r10, r9, r10, lsl #16

        ldr r9, [sp, #8] @ load qs1
        ldr r11, [sp, #12] @ load ps1

        orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1 - u)
        qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps1 + u)
        eor r8, r8, lr @ *oq1 = s^0x80
        eor r10, r10, lr @ *op1 = s^0x80

        ldrb r11, [r0, #-5] @ load p2 for 1/7th difference across boundary
        strb r10, [r0, #-4] @ store op1
        strb r8, [r0, #-1] @ store oq1
        ldrb_post r9, r0, r1 @ load q2 for 1/7th difference across boundary

        mov r10, r10, lsr #8

        ldrb_post r7, r0, r1

        mov r10, r10, lsr #8

        orr r11, r11, r6, lsl #8
        orr r9, r9, r7, lsl #8

        ldrb_post r7, r0, r1

        mov r10, r10, lsr #8

        orr r11, r11, r6, lsl #16
        orr r9, r9, r7, lsl #16

        ldrb_post r7, r0, r1
        orr r11, r11, r6, lsl #24
        orr r9, r9, r7, lsl #24

        @ roughly 1/7th difference across boundary
        sxtb16 r10, r12, ror #8
        smlabb r8, r6, lr, r7
        smlatb r6, r6, lr, r7
        smlabb r12, r10, lr, r7
        smlatb r10, r10, lr, r7
        ssat r8, #8, r8, asr #7
        ssat r6, #8, r6, asr #7
        ssat r12, #8, r12, asr #7
        ssat r10, #8, r10, asr #7

        sub r0, r0, r1, lsl #2

        pkhbt r6, r8, r6, lsl #16
        pkhbt r10, r12, r10, lsl #16

        orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

        qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2 + u)
        qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2 - u)
        eor r8, r8, lr @ *op2 = s^0x80
        eor r10, r10, lr @ *oq2 = s^0x80

        strb r8, [r0, #-5] @ store *op2
        strb_post r10, r0, r1 @ store *oq2

        mov r10, r10, lsr #8

        strb_post r10, r0, r1

        mov r10, r10, lsr #8

        strb_post r10, r0, r1

        mov r10, r10, lsr #8

        strb_post r10, r0, r1

        @ adjust r0 pointer for next loop
A       ldrne r6, [r0], r1, lsl #1 @ load source data
T       addne r0, r0, r1, lsl #1

A       ldrne r8, [r0], r1, lsl #1
T       addne r0, r0, r1, lsl #1

@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src,
@                       int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels16_armv6, export=1
        ldr r12, [sp, #32] @ h

        strd r6, r7, [r0, #8]
        strd_post r4, r5, r0, r1
        strd r10, r11, [r0, #8]
        strd_post r8, r9, r0, r1

@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels8_armv6, export=1
        ldr r12, [sp, #32] @ h

        ldr_post r10, r2, r3
        strd_post r4, r5, r0, r1
        strd_post r6, r7, r0, r1
        strd_post r8, r9, r0, r1
        strd_post r10, r11, r0, r1

@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels4_armv6, export=1
        ldr r12, [sp, #0] @ h

        ldr_post r4, r2, r3, lsl #1

        ldr_post r6, r2, r3, lsl #1

        str_post r4, r0, r1, lsl #1

        str_post r6, r0, r1, lsl #1

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16-bit
@ arithmetic can be used to apply the filters
const sixtap_filters_13245600, align=4
        .short 2, 108, -11, 36, -8, 1, 0, 0
        .short 3, 77, -16, 77, -16, 3, 0, 0
        .short 1, 36, -8, 108, -11, 2, 0, 0

const fourtap_filters_1324, align=4
        .short -6, 12, 123, -1
        .short -9, 50, 93, -6
        .short -6, 93, 50, -9
        .short -1, 123, 12, -6
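@ The tap values are stored pre-shuffled for smuad/smlad, which multiply
@ and accumulate two 16-bit lanes at once: the 6-tap rows hold the pairs
@ (f0,f2), (f1,f3), (f4,f5) - hence "13245600" (1-based order, padded
@ with zeros) - matching the even/odd pixel pairs that uxtb16 produces;
@ the 4-tap rows hold (f0,f2), (f1,f3) ("1324"). What one output pixel
@ computes, roughly, in illustrative C (signs already folded into the
@ stored coefficients):
@
@   /* 6-tap, taps f[0..5] applied at src[x-2] .. src[x+3] */
@   dst[x] = clip_uint8((f[0] * src[x-2] + f[1] * src[x-1] +
@                        f[2] * src[x]   + f[3] * src[x+1] +
@                        f[4] * src[x+2] + f[5] * src[x+3] + 64) >> 7);
@
@ The vertical filters compute the same with src[x +- k*stride].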
@ void put_vp8_epel_h6(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int w, int h, int mx)
function ff_put_vp8_epel_h6_armv6, export=1
        movrel lr, sixtap_filters_13245600 - 16
        ldr r12, [sp, #44] @ vp8_filter index
        ldr r4, [sp, #36] @ width
        add lr, lr, r12, lsl #3
        sub r3, r3, r4 @ src_stride - block_width
        sub r1, r1, r4 @ dst_stride - block_width

        str r4, [sp, #36] @ "4-in-parallel" loop counter @40
        str r3, [sp, #44] @ src_stride - block_width @48
        push {r1} @ dst_stride - block_width @0

        ldr r1, [lr], #4 @ coefficients

        @ 3 loads, 10 shuffles and then mul/acc/add/shr
        @ o0: i0/i1/i2/i3/i4/i5 -> i0/i2 (ld1) | i1/i3 (ld1) | i4/i5 (ld2)
        @ o1: i1/i2/i3/i4/i5/i6 -> i1/i3 (ld1) | i2/i4 (ld2) | i5/i6 (ld2/3)
        @ o2: i2/i3/i4/i5/i6/i7 -> i2/i4 (ld2) | i3/i5 (ld2) | i6/i7 (ld3)
        @ o3: i3/i4/i5/i6/i7/i8 -> i3/i5 (ld2) | i4/i6 (ld2/3) | i7/i8 (ld3)
        ldr r7, [r2, #5] @ ld3 -> src[5-8]
        ldr r6, [r2, #2] @ ld2 -> src[2-5]
        ldr r5, [r2], #4 @ ld1 -> src[0-3]

        pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6]
        uxtb16 r9, r6, ror #8 @ src[5] | src[3]
        uxtb16 r6, r6 @ src[4] | src[2]
        uxtb16 r8, r5, ror #8 @ src[3] | src[1]
        uxtb16 r11, r7, ror #8 @ src[8] | src[7]
        uxtb16 r7, r7 @ src[7] | src[6]
        pkhtb r10, r9, r6, asr #16 @ src[5] | src[4]
        uxtb16 r5, r5 @ src[2] | src[0]

        smuad r11, r11, lr @ filter[3][2] -> r11

        pkhbt r12, r10, r7, lsl #16 @ src[6] | src[4]
        smuad r7, r7, lr @ filter[2][2] -> r7
        smuad r5, r5, r1 @ filter[0][0] -> r5
        smlad r11, r9, r1, r11 @ filter[3][0] -> r11
        smlad r7, r9, r3, r7 @ filter[2][1] -> r7
        smuad r9, r8, r1 @ filter[1][0] -> r9
        smlad r5, r8, r3, r5 @ filter[0][1] -> r5
        pkhtb r8, r12, r10, asr #16 @ src[6] | src[5]
        smlad r11, r12, r3, r11 @ filter[3][1] -> r11
        smlad r9, r6, r3, r9 @ filter[1][1] -> r9
        smlad r5, r10, lr, r5 @ filter[0][2] -> r5
        smlad r7, r6, r1, r7 @ filter[2][0] -> r7
        smlad r9, r8, lr, r9 @ filter[1][2] -> r9

        add r5, r5, #0x40 @ round_shift_and_clamp[0]
        add r9, r9, #0x40 @ round_shift_and_clamp[1]
        add r7, r7, #0x40 @ round_shift_and_clamp[2]
        add r11, r11, #0x40 @ round_shift_and_clamp[3]

        usat r5, #8, r5, asr #7
        usat r9, #8, r9, asr #7
        usat r7, #8, r7, asr #7
        usat r11, #8, r11, asr #7

        strb r5, [r0], #1 @ store res[0]
        strb r9, [r0], #1 @ store res[1]
        strb r7, [r0], #1 @ store res[2]
        strb r11, [r0], #1 @ store res[3]

        ldr r12, [sp, #44] @ height = outer-loop counter

        ldrne r4, [sp, #40] @ 4-in-parallel loop counter

        add r2, r2, r5 @ move to next input/output lines

        add sp, sp, #4 @ restore stack after push {r1} above

@ void put_vp8_epel_v6(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int w, int h, int my)
function ff_put_vp8_epel_v6_armv6, export=1
        movrel lr, sixtap_filters_13245600 - 16
        ldr r12, [sp, #44] @ vp8_filter index
        ldr r4, [sp, #36] @ width
        add lr, lr, r12, lsl #3
        sub r1, r1, r4 @ dst_stride - block_width

        str r4, [sp, #36] @ "4-in-parallel" loop counter @40
        str r3, [sp, #44] @ src_stride @48
        push {r1} @ dst_stride - block_width @0

        add r1, r3, r3, lsl #1 @ stride * 3
        ldr_dpren r5, r2, r3 @ src[0,1,2,3 + stride * 1]
        ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3]
        ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4]
        ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5]

        @ byte -> word and "transpose"
        uxtb16 r9, r5, ror #8 @ src[3 + stride*1] | src[1 + stride*1]
        uxtb16 r10, r6, ror #8 @ src[3 + stride*3] | src[1 + stride*3]
        uxtb16 r11, r7, ror #8 @ src[3 + stride*4] | src[1 + stride*4]
        uxtb16 r12, r8, ror #8 @ src[3 + stride*5] | src[1 + stride*5]
        uxtb16 r5, r5 @ src[2 + stride*1] | src[0 + stride*1]
        uxtb16 r6, r6 @ src[2 + stride*3] | src[0 + stride*3]
        uxtb16 r7, r7 @ src[2 + stride*4] | src[0 + stride*4]
        uxtb16 r8, r8 @ src[2 + stride*5] | src[0 + stride*5]
        pkhbt r1, r9, r10, lsl #16 @ src[1 + stride*3] | src[1 + stride*1]
        pkhtb r9, r10, r9, asr #16 @ src[3 + stride*3] | src[3 + stride*1]
        pkhbt r10, r11, r12, lsl #16 @ src[1 + stride*5] | src[1 + stride*4]
        pkhtb r11, r12, r11, asr #16 @ src[3 + stride*5] | src[3 + stride*4]
        pkhbt r12, r5, r6, lsl #16 @ src[0 + stride*3] | src[0 + stride*1]
        pkhtb r5, r6, r5, asr #16 @ src[2 + stride*3] | src[2 + stride*1]
        pkhbt r6, r7, r8, lsl #16 @ src[0 + stride*5] | src[0 + stride*4]
        pkhtb r7, r8, r7, asr #16 @ src[2 + stride*5] | src[2 + stride*4]

        ldr r8, [lr, #4] @ stall - if only I had more registers...
        smuad r12, r12, r8 @ filter[0][1]
        smuad r1, r1, r8 @ filter[1][1]
        smuad r5, r5, r8 @ filter[2][1]
        smuad r9, r9, r8 @ filter[3][1]
        ldr r8, [lr, #8] @ stall - if only I had more registers...
        smlad r12, r6, r8, r12 @ filter[0][2]
        smlad r1, r10, r8, r1 @ filter[1][2]
        ldr_dpren r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0]
        ldr r10, [r2], #4 @ src[0,1,2,3 + stride * 2]
        smlad r5, r7, r8, r5 @ filter[2][2]
        smlad r9, r11, r8, r9 @ filter[3][2]

        uxtb16 r7, r6, ror #8 @ src[3 + stride*0] | src[1 + stride*0]
        uxtb16 r11, r10, ror #8 @ src[3 + stride*2] | src[1 + stride*2]
        uxtb16 r6, r6 @ src[2 + stride*0] | src[0 + stride*0]
        uxtb16 r10, r10 @ src[2 + stride*2] | src[0 + stride*2]

        pkhbt r8, r7, r11, lsl #16 @ src[1 + stride*2] | src[1 + stride*0]
        pkhtb r7, r11, r7, asr #16 @ src[3 + stride*2] | src[3 + stride*0]
        pkhbt r11, r6, r10, lsl #16 @ src[0 + stride*2] | src[0 + stride*0]
        pkhtb r6, r10, r6, asr #16 @ src[2 + stride*2] | src[2 + stride*0]

        ldr r10, [lr] @ stall - if only I had more registers...
        subs r4, r4, #1 @ counter--
        smlad r12, r11, r10, r12 @ filter[0][0]
        smlad r1, r8, r10, r1 @ filter[1][0]
        smlad r5, r6, r10, r5 @ filter[2][0]
        smlad r9, r7, r10, r9 @ filter[3][0]

        add r12, r12, #0x40 @ round_shift_and_clamp[0]
        add r1, r1, #0x40 @ round_shift_and_clamp[1]
        add r5, r5, #0x40 @ round_shift_and_clamp[2]
        add r9, r9, #0x40 @ round_shift_and_clamp[3]

        usat r12, #8, r12, asr #7
        usat r1, #8, r1, asr #7
        usat r5, #8, r5, asr #7
        usat r9, #8, r9, asr #7

        strb r12, [r0], #1 @ store res[0]
        strb r1, [r0], #1 @ store res[1]
        strb r5, [r0], #1 @ store res[2]
        strb r9, [r0], #1 @ store res[3]

        ldr r12, [sp, #44] @ height = outer-loop counter

        ldrne r4, [sp, #40] @ 4-in-parallel loop counter

        subne r2, r2, r4, lsl #2

        add r2, r2, r3 @ move to next input/output lines

        add sp, sp, #4 @ restore stack after push {r1} above

@ void put_vp8_epel_h4(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int w, int h, int mx)
function ff_put_vp8_epel_h4_armv6, export=1
        movrel lr, fourtap_filters_1324 - 4
        ldr r4, [sp, #36] @ width
        ldr r12, [sp, #44] @ vp8_filter index
        add lr, lr, r12, lsl #2
        sub r3, r3, r4 @ src_stride - block_width
        sub r1, r1, r4 @ dst_stride - block_width

        ldr lr, [sp, #40] @ height = outer-loop counter
        str r4, [sp, #36] @ "4-in-parallel" inner loop counter

        @ 3 loads, 5 uxtb16s and then mul/acc/add/shr
        @ o0: i0/i1/i2/i3 -> i0/i2(ld1) + i1/i3(ld1)
        @ o1: i1/i2/i3/i4 -> i1/i3(ld1) + i2/i4(ld2)
        @ o2: i2/i3/i4/i5 -> i2/i4(ld2) + i3/i5(ld2)
        @ o3: i3/i4/i5/i6 -> i3/i5(ld2) + i4/i6(ld3)
        ldr r9, [r2, #3] @ load source data

        uxtb16 r9, r9, ror #8 @ src[6] | src[4]
        uxtb16 r10, r8, ror #8 @ src[5] | src[3]
        uxtb16 r8, r8 @ src[4] | src[2]
        uxtb16 r11, r7, ror #8 @ src[3] | src[1]
        uxtb16 r7, r7 @ src[2] | src[0]

        smuad r9, r9, r6 @ filter[3][1] -> r9
        smuad r12, r10, r6 @ filter[2][1] -> r12
        smuad r7, r7, r5 @ filter[0][0] -> r7
        smlad r9, r10, r5, r9 @ filter[3][0] -> r9
        smuad r10, r11, r5 @ filter[1][0] -> r10
        smlad r12, r8, r5, r12 @ filter[2][0] -> r12
        smlad r7, r11, r6, r7 @ filter[0][1] -> r7
        smlad r10, r8, r6, r10 @ filter[1][1] -> r10

        subs r4, r4, #1 @ counter--

        add r7, r7, #0x40 @ round_shift_and_clamp[0]
        add r10, r10, #0x40 @ round_shift_and_clamp[1]
        add r12, r12, #0x40 @ round_shift_and_clamp[2]
        add r9, r9, #0x40 @ round_shift_and_clamp[3]

        usat r7, #8, r7, asr #7
        usat r10, #8, r10, asr #7
        usat r12, #8, r12, asr #7
        usat r9, #8, r9, asr #7

        strb r7, [r0], #1 @ store res[0]
        strb r10, [r0], #1 @ store res[1]
        strb r12, [r0], #1 @ store res[2]
        strb r9, [r0], #1 @ store res[3]

        ldrne r4, [sp, #36] @ 4-in-parallel loop counter
        add r2, r2, r3 @ move to next input/output lines

@ void put_vp8_epel_v4(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int w, int h, int my)
function ff_put_vp8_epel_v4_armv6, export=1
        movrel lr, fourtap_filters_1324 - 4
        ldr r12, [sp, #44] @ vp8_filter index
        ldr r4, [sp, #36] @ width
        add lr, lr, r12, lsl #2
        sub r1, r1, r4 @ dst_stride - block_width

        str r4, [sp, #36] @ "4-in-parallel" loop counter @40
        str r3, [sp, #44] @ src_stride @48
        push {r1} @ dst_stride - block_width @36

        ldr lr, [r2, r3, lsl #1] @ load source pixels

        ldr_dpren r7, r2, r3

        @ byte -> word and "transpose"
        uxtb16 r8, lr, ror #8 @ src[3 + stride*3] | src[1 + stride*3]
        uxtb16 r9, r12, ror #8 @ src[3 + stride*2] | src[1 + stride*2]
        uxtb16 r3, r7, ror #8 @ src[3 + stride*0] | src[1 + stride*0]
        uxtb16 r1, r11, ror #8 @ src[3 + stride*1] | src[1 + stride*1]
        uxtb16 lr, lr @ src[2 + stride*3] | src[0 + stride*3]
        uxtb16 r12, r12 @ src[2 + stride*2] | src[0 + stride*2]
        uxtb16 r7, r7 @ src[2 + stride*0] | src[0 + stride*0]
        uxtb16 r11, r11 @ src[2 + stride*1] | src[0 + stride*1]
        pkhbt r10, r1, r8, lsl #16 @ src[1 + stride*3] | src[1 + stride*1]
        pkhtb r1, r8, r1, asr #16 @ src[3 + stride*3] | src[3 + stride*1]
        pkhbt r8, r3, r9, lsl #16 @ src[1 + stride*2] | src[1 + stride*0]
        pkhtb r3, r9, r3, asr #16 @ src[3 + stride*2] | src[3 + stride*0]
        pkhbt r9, r11, lr, lsl #16 @ src[0 + stride*3] | src[0 + stride*1]
        pkhtb r11, lr, r11, asr #16 @ src[2 + stride*3] | src[2 + stride*1]
        pkhbt lr, r7, r12, lsl #16 @ src[0 + stride*2] | src[0 + stride*0]
        pkhtb r7, r12, r7, asr #16 @ src[2 + stride*2] | src[2 + stride*0]

        smuad r9, r9, r6 @ filter[0][1]
        smuad r10, r10, r6 @ filter[1][1]
        smuad r11, r11, r6 @ filter[2][1]
        smuad r1, r1, r6 @ filter[3][1]
        smlad r9, lr, r5, r9 @ filter[0][0]
        smlad r10, r8, r5, r10 @ filter[1][0]
        smlad r11, r7, r5, r11 @ filter[2][0]
        smlad r1, r3, r5, r1 @ filter[3][0]

        subs r4, r4, #1 @ counter--
        ldr r3, [sp, #48] @ FIXME prevent clobber of r3 above?

        add r9, r9, #0x40 @ round_shift_and_clamp[0]
        add r10, r10, #0x40 @ round_shift_and_clamp[1]
        add r11, r11, #0x40 @ round_shift_and_clamp[2]
        add r1, r1, #0x40 @ round_shift_and_clamp[3]

        usat r9, #8, r9, asr #7
        usat r10, #8, r10, asr #7
        usat r11, #8, r11, asr #7
        usat r1, #8, r1, asr #7

        strb r9, [r0], #1 @ store result

        ldr r12, [sp, #44] @ height = outer-loop counter

        ldrne r4, [sp, #40] @ 4-in-parallel loop counter

        sub r2, r2, r4, lsl #2

        add r2, r2, r3 @ move to next input/output lines

        add sp, sp, #4 @ restore stack after push {r1} above

@ void put_vp8_bilin_h(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int w, int h, int mx)
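@ Bilinear interpolation: each output pixel is a two-tap blend,
@
@   dst[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3
@
@ The setup below packs both taps into one register as (mx << 16) | (8 - mx)
@ so a single smuad evaluates the blend on an (src[x+1] | src[x])
@ halfword pair.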
function ff_put_vp8_bilin_h_armv6, export=1
        ldr r8, [sp, #36] @ vp8_filter index
        ldr r12, [sp, #32] @ height = outer-loop counter
        ldr r4, [sp, #28] @ width
        lsl r5, r8, #16 @ mx << 16
        sub r3, r3, r4 @ src_stride - block_width
        sub r1, r1, r4 @ dst_stride - block_width

        sub r5, r5, r8 @ (mx << 16) | (-mx)
        str r4, [sp, #28] @ "4-in-parallel" loop counter
        add r5, r5, #8 @ (8 - mx) | (mx << 16) = filter coefficients

        ldrb r6, [r2], #1 @ load source data

        pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0]
        pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1]
        pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2]
        pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3]

        smuad r6, r6, r5 @ apply the filter

        subs r4, r4, #1 @ counter--

        add r6, r6, #0x4 @ round_shift_and_clamp

        pkhbt r6, r6, r8, lsl #13
        pkhbt r7, r7, r9, lsl #13
        orr r6, r6, r7, lsl #8
        str r6, [r0], #4 @ store result

        ldr r4, [sp, #28] @ 4-in-parallel loop counter

        add r2, r2, r3 @ move to next input/output lines

@ void put_vp8_bilin_v(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int w, int h, int my)
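@ Vertical counterpart of the above: the same two-tap blend between rows,
@
@   dst[x] = (src[x] * (8 - my) + src[x + stride] * my + 4) >> 3
@
@ with (my << 16) | (8 - my) as the packed coefficient pair.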
function ff_put_vp8_bilin_v_armv6, export=1
        ldr r11, [sp, #44] @ vp8_filter index
        ldr r4, [sp, #36] @ width
        mov r5, r11, lsl #16 @ my << 16
        ldr r12, [sp, #40] @ height = outer-loop counter

        sub r5, r5, r11 @ (my << 16) | (-my)

        add r5, r5, #8 @ (8 - my) | (my << 16) = filter coefficients
        str r4, [sp, #36] @ "4-in-parallel" loop counter

        ldrb r10, [r2, r3] @ load the data

        pkhbt r6, r6, r10, lsl #16

        pkhbt r7, r7, r11, lsl #16
        pkhbt r8, r8, lr, lsl #16
        pkhbt r9, r10, r9, lsl #16

        smuad r6, r6, r5 @ apply the filter

        subs r4, r4, #1 @ counter--

        add r6, r6, #0x4 @ round_shift_and_clamp

        pkhbt r6, r6, r8, lsl #13
        pkhbt r7, r7, r9, lsl #13
        orr r6, r6, r7, lsl #8
        str r6, [r0], #4 @ store result

        ldr r4, [sp, #36] @ 4-in-parallel loop counter

        add r2, r2, r3 @ move to next input/output lines

        sub r2, r2, r4, lsl #2