- qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(qs0 - Filter1)
- qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-
- bic r12, r7, r6 @ vp8_filter &= ~hev ( r6 is free)
-
- @roughly 3/7th difference across boundary
- mov lr, #0x1b @ 27
- mov r7, #0x3f @ 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r7, r10, lr, r7
- smultb r10, r10, lr
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- add r10, r10, #63
- ssat r7, #8, r7, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r7, r10, lsl #16
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub r0, r0, r1
-
- orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
- qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
- eor r8, r8, lr @ *oq0 = s^0x80
- str r8, [r0] @ store *oq0
- sub r0, r0, r1
- eor r10, r10, lr @ *op0 = s^0x80
- str r10,[r0] @ store *op0
-
- @roughly 2/7th difference across boundary
- mov lr, #0x12 @ 18
- mov r7, #0x3f @ 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r9, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r9, #8, r9, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r9, r10, lsl #16
-
- ldr r9, [sp, #8] @ load qs1
- ldr r11, [sp, #12] @ load ps1
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub r0, r0, r1
-
- orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
-
- qadd8 r11, r11, r10 @ s = vp8_signed_char_clamp(ps1 + u)
- qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1 - u)
- eor r11, r11, lr @ *op1 = s^0x80
- str_post r11, r0, r1 @ store *op1
- eor r8, r8, lr @ *oq1 = s^0x80
- add r0, r0, r1, lsl #1
-
- mov r7, #0x3f @ 63
-
- str_post r8, r0, r1 @ store *oq1
-
- @roughly 1/7th difference across boundary
- mov lr, #0x9 @ 9
- ldr r9, [r0] @ load q2
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r12, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r12, #8, r12, asr #7
- ssat r10, #8, r10, asr #7
-
- sub r0, r0, r1, lsl #2
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r12, r10, lsl #16
-
- sub r0, r0, r1
- ldr lr, c0x80808080
-
- ldr r11, [r0] @ load p2
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- eor r9, r9, lr
- eor r11, r11, lr
-
- orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
-
- qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2 + u)
- qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2 - u)
- eor r8, r8, lr @ *op2 = s^0x80
- str_post r8, r0, r1, lsl #2 @ store *op2
- add r0, r0, r1
- eor r10, r10, lr @ *oq2 = s^0x80
- str_post r10, r0, r1, lsl #1 @ store *oq2
-
-2:
- add r0, r0, #4
- sub r0, r0, r1, lsl #3
- subs r5, r5, #1
-
-T ittt ne
- ldrne r10,[r0, r1] @ p2
-A ldrne r9, [r0], r1, lsl #1 @ p3
-T ldrne r9, [r0] @ p3
-T addne r0, r0, r1, lsl #1
-T ittt ne
- ldrne r12,[r0, r1] @ p0
-A ldrne r11,[r0], r1, lsl #1 @ p1
-T ldrne r11,[r0] @ p3
-T addne r0, r0, r1, lsl #1
-
- bne 1b
-
- add sp, sp, #16
- pop {r4 - r11, pc}
-endfunc
-
-.macro TRANSPOSE_MATRIX i0, i1, i2, i3, o3, o2, o1, o0
- @ transpose a 4x4 matrix of bytes held packed in four registers
- @ input: \i0, \i1, \i2, \i3 (all four are clobbered)
- @ output: \o3, \o2, \o1, \o0
- @ i0: 03 02 01 00
- @ i1: 13 12 11 10
- @ i2: 23 22 21 20
- @ i3: 33 32 31 30
- @ o3 o2 o1 o0
-
- uxtb16 \o1, \i1 @ xx 12 xx 10
- uxtb16 \o0, \i0 @ xx 02 xx 00
- uxtb16 \o3, \i3 @ xx 32 xx 30
- uxtb16 \o2, \i2 @ xx 22 xx 20
- orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00
- orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20
-
- uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11
- uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31
- uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01
- uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21
- orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01
- orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21
-
- pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02 -- p1
- pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00 -- p3
-
- pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03 -- p0
- pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01 -- p2
-.endm
-
-@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
-@ in: r0 = dst, r1 = stride, r2 = flim; filters 16 rows, 4 rows per pass
-function ff_vp8_h_loop_filter16_simple_armv6, export=1
- push {r4 - r11, lr}
- orr r12, r2, r2, lsl #16 @ replicate flim into both halfwords ...
- ldr r2, c0x80808080
- orr r12, r12, r12, lsl #8 @ ... and then into all 4 bytes of r12
-
- @ load source data to r7, r8, r9, r10
- sub r0, r0, #2 @ step back to p1 (2 bytes left of edge)
- ldr r8, [r0, r1]
- ldr_post r7, r0, r1, lsl #1
- ldr r10,[r0, r1]
- ldr_post r9, r0, r1, lsl #1
- add r0, r0, #2
-
- mov r11, #4 @ count (r11) for 4-in-parallel
-1:
- @transpose r7, r8, r9, r10 to r3, r4, r5, r6
- TRANSPOSE_MATRIX r7, r8, r9, r10, r6, r5, r4, r3
-
- @ vp8_simple_filter_mask() function
- uqsub8 r7, r3, r6 @ p1 - q1
- uqsub8 r8, r6, r3 @ q1 - p1
- uqsub8 r9, r4, r5 @ p0 - q0
- uqsub8 r10, r5, r4 @ q0 - p0
- orr r7, r7, r8 @ abs(p1 - q1)
- orr r9, r9, r10 @ abs(p0 - q0)
- mov r8, #0
- uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2
- uhadd8 r7, r7, r8 @ abs(p1 - q1) / 2
- uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1 - q1)/2
- mvn r10, #0 @ r10 == -1
-
- usub8 r7, r12, r7 @ compare to flimit
- sel lr, r10, r8 @ filter mask
-
- cmp lr, #0
- beq 2f @ skip filtering
-
- @vp8_simple_filter() function
- eor r3, r3, r2 @ p1 offset to convert to a signed value
- eor r6, r6, r2 @ q1 offset to convert to a signed value
- eor r4, r4, r2 @ p0 offset to convert to a signed value
- eor r5, r5, r2 @ q0 offset to convert to a signed value
-
- qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
- qsub8 r6, r5, r4 @ q0 - p0
-
- qadd8 r3, r3, r6 @ vp8_filter += q0 - p0
- ldr r9, c0x03030303 @ r9 = 3
-
- qadd8 r3, r3, r6 @ vp8_filter += q0 - p0
- ldr r7, c0x04040404
-
- qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0))
- @STALL
- and r3, r3, lr @ vp8_filter &= mask
-
- qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3
- qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4
-
- shadd8 r9, r9, r8 @ three signed halving adds ...
- shadd8 r3, r3, r8 @ ... perform a per-byte
- shadd8 r9, r9, r8 @ ... arithmetic shift by 3
- shadd8 r3, r3, r8
- shadd8 r9, r9, r8 @ Filter2 >>= 3
- shadd8 r3, r3, r8 @ Filter1 >>= 3
-
- @calculate output
- sub r0, r0, r1, lsl #2 @ rewind r0 by the 4 rows just loaded
-
- qadd8 r4, r4, r9 @ u = p0 + Filter2
- qsub8 r5, r5, r3 @ u = q0 - Filter1
- eor r4, r4, r2 @ *op0 = u^0x80
- eor r5, r5, r2 @ *oq0 = u^0x80
-
- strb r4, [r0, #-1] @ store the result
- mov r4, r4, lsr #8
- strb_post r5, r0, r1
- mov r5, r5, lsr #8
-
- strb r4, [r0, #-1]
- mov r4, r4, lsr #8
- strb_post r5, r0, r1
- mov r5, r5, lsr #8
-
- strb r4, [r0, #-1]
- mov r4, r4, lsr #8
- strb_post r5, r0, r1
- mov r5, r5, lsr #8
-
- strb r4, [r0, #-1]
- strb_post r5, r0, r1
-
-2:
- subs r11, r11, #1
-
- @ load source data to r7, r8, r9, r10 (conditional on rows remaining)
- sub r0, r0, #2
-T ittt ne
- ldrne r8, [r0, r1]
-A ldrne r7, [r0], r1, lsl #1
-T ldrne r7, [r0]
-T addne r0, r0, r1, lsl #1
-T ittt ne
- ldrne r10,[r0, r1]
-A ldrne r9, [r0], r1, lsl #1
-T ldrne r9, [r0]
-T addne r0, r0, r1, lsl #1
- add r0, r0, #2
-
- bne 1b
-
- pop {r4 - r11, pc}
-endfunc
-
-@ void vp8_h_loop_filter16_inner(uint8_t *dst, int stride,
-@ int fE, int fI, int hev_thresh)
-@ and
-@ void vp8_h_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
-@ int fE, int fI, int hev_thresh)
-@ call:
-@ void vp8_h_loop_filter_inner(uint8_t *dst, int stride,
-@ int fE, int fI, int hev_thresh, int count)
-function ff_vp8_h_loop_filter_inner_armv6, export=1
- push {r4 - r11, lr}
-
- sub r0, r0, #4 @ move r0 pointer down by 4
- ldr r5, [sp, #40] @ counter
- ldr r9, [sp, #36] @ load thresh address
- sub sp, sp, #16 @ create temp buffer
-
- ldr r7, [r0, r1] @ transpose will make it into p3-p0
- ldr_post r6, r0, r1, lsl #1 @ load source data
- ldr lr, [r0, r1]
- ldr_post r8, r0, r1, lsl #1
-
- orr r2, r2, r2, lsl #16