#include "asm.S"
- .fpu neon
-
@ Transpose an 8x8 element block held in eight NEON registers (in place).
@ NOTE(review): this is a diff hunk — the interior vtrn.16/vtrn.8 steps of
@ the full transpose appear elided from this view; do not read the three
@ visible instructions as the complete algorithm.
.macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
vtrn.32 \r0, \r4
vtrn.32 \r1, \r5
vtrn.8 \r6, \r7
.endm
+@ Transpose a 4x4 byte block held in the low/high halves of four registers,
+@ in place: a 16-bit interleave of (r0,r2) and (r1,r3) followed by an 8-bit
+@ interleave of the adjacent pairs completes the transpose.
+ .macro transpose_4x4 r0 r1 r2 r3
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ .endm
+
.macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
vswp \r0, \r4
vswp \r1, \r5
bgt 5b
pop {r4-r7, pc}
- .endfunc
+endfunc
.endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
bgt 5b
pop {r4-r7, pc}
- .endfunc
+endfunc
.endm
+@ 2-pixel-wide H.264 chroma motion compensation (put or avg variant).
+@ C signature: \type\()_h264_chroma_mc2(uint8_t *dst, uint8_t *src,
+@                                       int stride, int h, int x, int y)
+@ In: r0 = dst, r1 = src, r2 = stride, r3 = h,
+@     [sp,#16] = x, [sp,#20] = y  (after the 16-byte push below).
+@ Bilinear weights (1/64 units): A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy.
+ .macro h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+ push {r4-r6, lr}
+ ldr r4, [sp, #16] @ x
+ ldr lr, [sp, #20] @ y
+ pld [r1]
+ pld [r1, r2]
+ orrs r5, r4, lr @ x == 0 && y == 0 -> plain copy/average path
+ beq 2f
+
+ mul r5, r4, lr @ r5  = x*y              (D)
+ rsb r6, r5, lr, lsl #3 @ r6  = 8*y - x*y  = (8-x)*y  (C)
+ rsb r12, r5, r4, lsl #3 @ r12 = 8*x - x*y  = x*(8-y)  (B)
+ sub r4, r5, r4, lsl #3
+ sub r4, r4, lr, lsl #3
+ add r4, r4, #64 @ r4  = 64 - 8x - 8y + xy = (8-x)*(8-y) (A)
+ vdup.8 d0, r4
+ vdup.8 d2, r12
+ vdup.8 d1, r6
+ vdup.8 d3, r5
+ vtrn.16 q0, q1 @ interleave weights to match the vtrn'd pixel layout
+1:
+@ Load two output rows plus the row below; build the shifted-by-one copies
+@ so one pair of multiply-accumulates covers all four bilinear taps.
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1], r2
+ vrev64.32 d5, d4
+ vld1.32 {d5[1]}, [r1]
+ vext.8 q3, q2, q2, #1
+ vtrn.16 q2, q3
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+.ifc \type,avg
+ vld1.16 {d18[0]}, [r0,:16], r2 @ existing dst pixels for averaging
+ vld1.16 {d18[1]}, [r0,:16]
+ sub r0, r0, r2
+.endif
+ vtrn.32 d16, d17
+ vadd.i16 d16, d16, d17
+ vrshrn.u16 d16, q8, #6 @ (sum + 32) >> 6, narrow back to bytes
+.ifc \type,avg
+ vrhadd.u8 d16, d16, d18 @ rounding average with dst
+.endif
+ vst1.16 {d16[0]}, [r0,:16], r2
+ vst1.16 {d16[1]}, [r0,:16], r2
+ subs r3, r3, #2 @ two rows per iteration
+ bgt 1b
+ pop {r4-r6, pc}
+@ Fast path: no sub-pel offset, straight 2-byte copy (put) or average (avg).
+2:
+.ifc \type,put
+ ldrh r5, [r1], r2
+ strh r5, [r0], r2
+ ldrh r6, [r1], r2
+ strh r6, [r0], r2
+.else
+ vld1.16 {d16[0]}, [r1], r2
+ vld1.16 {d16[1]}, [r1], r2
+ vld1.16 {d18[0]}, [r0,:16], r2
+ vld1.16 {d18[1]}, [r0,:16]
+ sub r0, r0, r2
+ vrhadd.u8 d16, d16, d18
+ vst1.16 {d16[0]}, [r0,:16], r2
+ vst1.16 {d16[1]}, [r0,:16], r2
+.endif
+ subs r3, r3, #2
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+.endm
+
@ Instantiate the chroma MC macros for each block width / operation.
.text
.align
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
+ h264_chroma_mc2 put
+ h264_chroma_mc2 avg
/* H.264 loop filter */
align_pop_regs
bx lr
- .endfunc
+endfunc
@ Horizontal (left-edge) luma deblocking filter: load columns, transpose to
@ rows, run the shared h264_loop_filter_luma filter, then write back.
@ The diff replaces the old full 8x8 re-transpose + 16 d-register stores
@ with a 4x4 re-transpose of only the four filtered middle pixel rows
@ (q4, q8, q0, q5) and 32-bit lane stores at column offset +2 — the outer
@ pixels are unchanged by the filter so they need not be rewritten.
function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
align_push_regs
- sub sp, sp, #16
- vst1.64 {d4, d5}, [sp,:128]
- sub sp, sp, #16
- vst1.64 {d20,d21}, [sp,:128]
h264_loop_filter_luma
- vld1.64 {d20,d21}, [sp,:128]!
- vld1.64 {d4, d5}, [sp,:128]!
-
- transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
+ transpose_4x4 q4, q8, q0, q5
sub r0, r0, r1, lsl #4 @ rewind r0 to the first of the 16 rows
- vst1.64 {d6}, [r0], r1
- vst1.64 {d20}, [r0], r1
- vst1.64 {d8}, [r0], r1
- vst1.64 {d16}, [r0], r1
- vst1.64 {d0}, [r0], r1
- vst1.64 {d10}, [r0], r1
- vst1.64 {d4}, [r0], r1
- vst1.64 {d26}, [r0], r1
- vst1.64 {d7}, [r0], r1
- vst1.64 {d21}, [r0], r1
- vst1.64 {d9}, [r0], r1
- vst1.64 {d17}, [r0], r1
- vst1.64 {d1}, [r0], r1
- vst1.64 {d11}, [r0], r1
- vst1.64 {d5}, [r0], r1
- vst1.64 {d27}, [r0], r1
+ add r0, r0, #2 @ only the 4 middle columns changed
+ vst1.32 {d8[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d10[0]}, [r0], r1
+ vst1.32 {d8[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d10[1]}, [r0], r1
+ vst1.32 {d9[0]}, [r0], r1
+ vst1.32 {d17[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ vst1.32 {d11[0]}, [r0], r1
+ vst1.32 {d9[1]}, [r0], r1
+ vst1.32 {d17[1]}, [r0], r1
+ vst1.32 {d1[1]}, [r0], r1
+ vst1.32 {d11[1]}, [r0], r1
align_pop_regs
bx lr
- .endfunc
+endfunc
.macro h264_loop_filter_chroma
vdup.8 d22, r2 @ alpha
vst1.64 {d0}, [r0,:64], r1
bx lr
- .endfunc
+endfunc
function ff_h264_h_loop_filter_chroma_neon, export=1
h264_loop_filter_start
vst1.32 {d2[1]}, [r0], r1
bx lr
- .endfunc
+endfunc
/* H.264 qpel MC */
mov ip, #16
mov lr, r4
b put_h264_qpel8_h_lowpass_neon
- .endfunc
+endfunc
@ Horizontal 6-tap qpel lowpass, parameterized on \type (put/avg).
@ The diff converts the former put-only functions into a macro; the avg
@ variant additionally loads the existing destination rows and does a
@ rounding average (vrhadd.u8) before storing.
@ NOTE(review): diff hunk — parts of the 16-wide wrapper and the lowpass_8
@ loop body are elided from this view.
-function put_h264_qpel16_h_lowpass_neon
+ .macro h264_qpel_h_lowpass type
+function \type\()_h264_qpel16_h_lowpass_neon
push {lr}
mov ip, #16
- bl put_h264_qpel8_h_lowpass_neon
+ bl \type\()_h264_qpel8_h_lowpass_neon
sub r0, r0, r3, lsl #4
sub r1, r1, r2, lsl #4
add r0, r0, #8 @ move to the right 8-pixel half
add r1, r1, #8
mov ip, #16
pop {lr}
- .endfunc
+endfunc
-function put_h264_qpel8_h_lowpass_neon
+function \type\()_h264_qpel8_h_lowpass_neon
1: vld1.64 {d0, d1}, [r1], r2
vld1.64 {d16,d17}, [r1], r2
subs ip, ip, #2 @ two rows per iteration
lowpass_8 d0, d1, d16, d17, d0, d16
+.ifc \type,avg
+ vld1.8 {d2}, [r0,:64], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 d16, d16, d3
+ sub r0, r0, r3
+.endif
vst1.64 {d0}, [r0,:64], r3
vst1.64 {d16}, [r0,:64], r3
bne 1b
bx lr
- .endfunc
+endfunc
+ .endm
-function put_h264_qpel16_h_lowpass_l2_neon
+ h264_qpel_h_lowpass put
+ h264_qpel_h_lowpass avg
+@ Horizontal 6-tap qpel lowpass averaged with a second source row (r3),
+@ parameterized on \type (put/avg). Used for quarter-pel positions where
+@ the H.264 spec averages the half-pel filter output with a full-pel copy.
+ .macro h264_qpel_h_lowpass_l2 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon
push {lr}
mov ip, #16
- bl put_h264_qpel8_h_lowpass_l2_neon
+ bl \type\()_h264_qpel8_h_lowpass_l2_neon
sub r0, r0, r2, lsl #4
sub r1, r1, r2, lsl #4
sub r3, r3, r2, lsl #4
add r3, r3, #8
mov ip, #16
pop {lr}
- .endfunc
+endfunc
-function put_h264_qpel8_h_lowpass_l2_neon
+function \type\()_h264_qpel8_h_lowpass_l2_neon
1: vld1.64 {d0, d1}, [r1], r2
vld1.64 {d16,d17}, [r1], r2
vld1.64 {d28}, [r3], r2 @ second (full-pel) source
subs ip, ip, #2
lowpass_8 d0, d1, d16, d17, d0, d1
vrhadd.u8 q0, q0, q14 @ average filter output with l2 source
+.ifc \type,avg
+ vld1.8 {d2}, [r0,:64], r2
+ vrhadd.u8 d0, d0, d2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 d1, d1, d3
+ sub r0, r0, r2
+.endif
vst1.64 {d0}, [r0,:64], r2
vst1.64 {d1}, [r0,:64], r2
bne 1b
bx lr
- .endfunc
+endfunc
+ .endm
+
+ h264_qpel_h_lowpass_l2 put
+ h264_qpel_h_lowpass_l2 avg
function put_h264_qpel16_v_lowpass_neon_packed
mov r4, lr
sub r1, r1, r3, lsl #2
mov lr, r4
b put_h264_qpel8_v_lowpass_neon
- .endfunc
+endfunc
@ Vertical 6-tap qpel lowpass, parameterized on \type (put/avg). The 16-wide
@ wrapper processes the block as four 8x8 calls. The avg variant averages
@ the eight result rows with the existing destination before the stores.
@ NOTE(review): diff hunk — the 8-wide function's load/filter body is
@ partially elided from this view.
-function put_h264_qpel16_v_lowpass_neon
+ .macro h264_qpel_v_lowpass type
+function \type\()_h264_qpel16_v_lowpass_neon
mov r4, lr
- bl put_h264_qpel8_v_lowpass_neon
+ bl \type\()_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
- bl put_h264_qpel8_v_lowpass_neon
+ bl \type\()_h264_qpel8_v_lowpass_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
- bl put_h264_qpel8_v_lowpass_neon
+ bl \type\()_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r4
- .endfunc
+endfunc
-function put_h264_qpel8_v_lowpass_neon
+function \type\()_h264_qpel8_v_lowpass_neon
vld1.64 {d8}, [r1], r3
vld1.64 {d10}, [r1], r3
vld1.64 {d12}, [r1], r3
lowpass_8 d26, d27, d28, d29, d26, d28
transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
+.ifc \type,avg
+ vld1.8 {d9}, [r0,:64], r2
+ vrhadd.u8 d8, d8, d9
+ vld1.8 {d11}, [r0,:64], r2
+ vrhadd.u8 d10, d10, d11
+ vld1.8 {d13}, [r0,:64], r2
+ vrhadd.u8 d12, d12, d13
+ vld1.8 {d15}, [r0,:64], r2
+ vrhadd.u8 d14, d14, d15
+ vld1.8 {d23}, [r0,:64], r2
+ vrhadd.u8 d22, d22, d23
+ vld1.8 {d25}, [r0,:64], r2
+ vrhadd.u8 d24, d24, d25
+ vld1.8 {d27}, [r0,:64], r2
+ vrhadd.u8 d26, d26, d27
+ vld1.8 {d29}, [r0,:64], r2
+ vrhadd.u8 d28, d28, d29
+ sub r0, r0, r2, lsl #3 @ rewind the 8 rows just read
+.endif
+
vst1.64 {d8}, [r0,:64], r2
vst1.64 {d10}, [r0,:64], r2
vst1.64 {d12}, [r0,:64], r2
vst1.64 {d28}, [r0,:64], r2
bx lr
- .endfunc
+endfunc
+ .endm
-function put_h264_qpel16_v_lowpass_l2_neon
+ h264_qpel_v_lowpass put
+ h264_qpel_v_lowpass avg
+
+@ Vertical 6-tap qpel lowpass averaged with a second source (pointer in ip),
+@ parameterized on \type (put/avg). The diff hoists the q5/q13 average
+@ above the new avg-specific dst read-modify block so the stores can run
+@ as one uninterrupted sequence.
+@ NOTE(review): diff hunk — part of the load/filter body is elided.
+ .macro h264_qpel_v_lowpass_l2 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon
mov r4, lr
- bl put_h264_qpel8_v_lowpass_l2_neon
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r1, r1, r3, lsl #2
- bl put_h264_qpel8_v_lowpass_l2_neon
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r0, r0, r3, lsl #4
sub ip, ip, r2, lsl #4
add r0, r0, #8
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
- bl put_h264_qpel8_v_lowpass_l2_neon
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r1, r1, r3, lsl #2
mov lr, r4
- .endfunc
+endfunc
-function put_h264_qpel8_v_lowpass_l2_neon
+function \type\()_h264_qpel8_v_lowpass_l2_neon
vld1.64 {d8}, [r1], r3
vld1.64 {d10}, [r1], r3
vld1.64 {d12}, [r1], r3
vld1.64 {d10}, [ip], r2
vrhadd.u8 q2, q2, q11
vld1.64 {d11}, [ip], r2
+ vrhadd.u8 q5, q5, q13
+
+.ifc \type,avg
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d0, d0, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d1, d1, d17
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d2, d2, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d3, d3, d17
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d4, d4, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d5, d5, d17
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d10, d10, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d11, d11, d17
+ sub r0, r0, r3, lsl #3 @ rewind the 8 rows just read
+.endif
vst1.64 {d0}, [r0,:64], r3
vst1.64 {d1}, [r0,:64], r3
- vrhadd.u8 q5, q5, q13
vst1.64 {d2}, [r0,:64], r3
vst1.64 {d3}, [r0,:64], r3
vst1.64 {d4}, [r0,:64], r3
vst1.64 {d11}, [r0,:64], r3
bx lr
- .endfunc
+endfunc
+ .endm
+
+ h264_qpel_v_lowpass_l2 put
+ h264_qpel_v_lowpass_l2 avg
function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const ip
transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
bx lr
- .endfunc
+endfunc
@ Combined horizontal+vertical (center half-pel) 8x8 lowpass, parameterized
@ on \type. The heavy filtering is shared: both variants call the common
@ put_..._neon_top helper, and only the writeback differs (avg reads dst
@ rows and vrhadd's before storing).
@ NOTE(review): diff hunk — some of the store sequence is elided.
-function put_h264_qpel8_hv_lowpass_neon
+ .macro h264_qpel8_hv_lowpass type
+function \type\()_h264_qpel8_hv_lowpass_neon
mov r10, lr
bl put_h264_qpel8_hv_lowpass_neon_top @ shared filter core (always "put")
+.ifc \type,avg
+ vld1.8 {d0}, [r0,:64], r2
+ vrhadd.u8 d12, d12, d0
+ vld1.8 {d1}, [r0,:64], r2
+ vrhadd.u8 d13, d13, d1
+ vld1.8 {d2}, [r0,:64], r2
+ vrhadd.u8 d14, d14, d2
+ vld1.8 {d3}, [r0,:64], r2
+ vrhadd.u8 d15, d15, d3
+ vld1.8 {d4}, [r0,:64], r2
+ vrhadd.u8 d8, d8, d4
+ vld1.8 {d5}, [r0,:64], r2
+ vrhadd.u8 d9, d9, d5
+ vld1.8 {d6}, [r0,:64], r2
+ vrhadd.u8 d10, d10, d6
+ vld1.8 {d7}, [r0,:64], r2
+ vrhadd.u8 d11, d11, d7
+ sub r0, r0, r2, lsl #3 @ rewind the 8 rows just read
+.endif
vst1.64 {d12}, [r0,:64], r2
vst1.64 {d13}, [r0,:64], r2
vst1.64 {d14}, [r0,:64], r2
mov lr, r10
bx lr
- .endfunc
+endfunc
+ .endm
-function put_h264_qpel8_hv_lowpass_l2_neon
+ h264_qpel8_hv_lowpass put
+ h264_qpel8_hv_lowpass avg
+
+@ HV lowpass averaged with a second source read from the r2 buffer,
+@ parameterized on \type. The diff moves the first store (d0) below the
+@ new avg read-modify block so all eight stores form one run.
+@ NOTE(review): diff hunk — the l2 averaging preamble is partially elided.
+ .macro h264_qpel8_hv_lowpass_l2 type
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov r10, lr
bl put_h264_qpel8_hv_lowpass_neon_top @ shared filter core (always "put")
vrhadd.u8 q1, q1, q7
vld1.64 {d6, d7}, [r2,:128]!
vrhadd.u8 q2, q2, q4
-
- vst1.64 {d0}, [r0,:64], r3
vrhadd.u8 q3, q3, q5
+.ifc \type,avg
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d0, d0, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d1, d1, d17
+ vld1.8 {d18}, [r0,:64], r3
+ vrhadd.u8 d2, d2, d18
+ vld1.8 {d19}, [r0,:64], r3
+ vrhadd.u8 d3, d3, d19
+ vld1.8 {d20}, [r0,:64], r3
+ vrhadd.u8 d4, d4, d20
+ vld1.8 {d21}, [r0,:64], r3
+ vrhadd.u8 d5, d5, d21
+ vld1.8 {d22}, [r0,:64], r3
+ vrhadd.u8 d6, d6, d22
+ vld1.8 {d23}, [r0,:64], r3
+ vrhadd.u8 d7, d7, d23
+ sub r0, r0, r3, lsl #3 @ rewind the 8 rows just read
+.endif
+ vst1.64 {d0}, [r0,:64], r3
vst1.64 {d1}, [r0,:64], r3
vst1.64 {d2}, [r0,:64], r3
vst1.64 {d3}, [r0,:64], r3
mov lr, r10
bx lr
- .endfunc
+endfunc
+ .endm
+
+ h264_qpel8_hv_lowpass_l2 put
+ h264_qpel8_hv_lowpass_l2 avg
@ 16x16 HV lowpass wrappers: four 8x8 quadrant calls (the final one is a
@ tail call via b). Parameterized on \type (put/avg).
-function put_h264_qpel16_hv_lowpass_neon
+ .macro h264_qpel16_hv type
+function \type\()_h264_qpel16_hv_lowpass_neon
mov r9, lr
- bl put_h264_qpel8_hv_lowpass_neon
+ bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #2
- bl put_h264_qpel8_hv_lowpass_neon
+ bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8 @ right half of the source
sub r0, r0, r2, lsl #4
add r0, r0, #8 @ right half of the destination
- bl put_h264_qpel8_hv_lowpass_neon
+ bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r9
- b put_h264_qpel8_hv_lowpass_neon
- .endfunc
+ b \type\()_h264_qpel8_hv_lowpass_neon
+endfunc
-function put_h264_qpel16_hv_lowpass_l2_neon
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
mov r9, lr
sub r2, r4, #256 @ l2 source buffer sits 256 bytes below r4
- bl put_h264_qpel8_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #2
- bl put_h264_qpel8_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
sub r0, r0, r3, lsl #4
add r0, r0, #8
- bl put_h264_qpel8_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #2
mov lr, r9
- b put_h264_qpel8_hv_lowpass_l2_neon
- .endfunc
+ b \type\()_h264_qpel8_hv_lowpass_l2_neon
+endfunc
+ .endm
-function ff_put_h264_qpel8_mc10_neon, export=1
+ h264_qpel16_hv put
+ h264_qpel16_hv avg
+
+@ Exported 8x8 qpel MC entry points ff_\type\()_h264_qpel8_mcXY_neon for
+@ every quarter-pel position XY, parameterized on \type (put/avg).
+@ The diff also fixes the stack handling of the mc11-family: they now save
+@ sp in r11, 16-byte-align sp before carving the temp buffer (required for
+@ the :128-aligned NEON stores in the helpers), and restore via r11.
+ .macro h264_qpel8 type
+@ mc10/mc30: horizontal quarter-pel = average of h-lowpass and full-pel.
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
lowpass_const r3
mov r3, r1
sub r1, r1, #2
mov ip, #8
- b put_h264_qpel8_h_lowpass_l2_neon
- .endfunc
+ b \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
-function ff_put_h264_qpel8_mc20_neon, export=1
+@ mc20: horizontal half-pel, plain h-lowpass.
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
lowpass_const r3
sub r1, r1, #2
mov r3, r2
mov ip, #8
- b put_h264_qpel8_h_lowpass_neon
- .endfunc
+ b \type\()_h264_qpel8_h_lowpass_neon
+endfunc
-function ff_put_h264_qpel8_mc30_neon, export=1
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
lowpass_const r3
add r3, r1, #1
sub r1, r1, #2
mov ip, #8
- b put_h264_qpel8_h_lowpass_l2_neon
- .endfunc
+ b \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
-function ff_put_h264_qpel8_mc01_neon, export=1
+@ mc01/mc03: vertical quarter-pel (v-lowpass averaged with full-pel row).
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
push {lr}
mov ip, r1
-put_h264_qpel8_mc01:
+\type\()_h264_qpel8_mc01:
lowpass_const r3
mov r3, r2
sub r1, r1, r2, lsl #1
vpush {d8-d15}
- bl put_h264_qpel8_v_lowpass_l2_neon
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
pop {pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel8_mc11_neon, export=1
- push {r0, r1, r2, lr}
-put_h264_qpel8_mc11:
+@ mc11 family: h-lowpass into an aligned stack temp ("put" on purpose —
+@ the intermediate always overwrites the temp), then v-lowpass-l2 of
+@ source and temp into the real destination.
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
+ push {r0, r1, r11, lr}
+\type\()_h264_qpel8_mc11:
lowpass_const r3
+ mov r11, sp @ save sp; restore point after alignment
+ bic sp, sp, #15 @ 16-byte align for :128 temp stores
sub sp, sp, #64
mov r0, sp
sub r1, r1, #2
mov ip, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
- ldrd r0, [sp, #128]
+ ldrd r0, [r11] @ reload saved r0/r1 args
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #8
- bl put_h264_qpel8_v_lowpass_l2_neon
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
- add sp, sp, #76
- pop {pc}
- .endfunc
+ add sp, r11, #8 @ drop temp + saved r0/r1
+ pop {r11, pc}
+endfunc
-function ff_put_h264_qpel8_mc21_neon, export=1
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
push {r0, r1, r4, r10, r11, lr}
-put_h264_qpel8_mc21:
+\type\()_h264_qpel8_mc21:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
sub r1, r1, #2
mov r3, r2
sub r2, r4, #64
- bl put_h264_qpel8_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
pop {r4, r10, r11, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel8_mc31_neon, export=1
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
add r1, r1, #1
- push {r0, r1, r2, lr}
+ push {r0, r1, r11, lr} @ must match the mc11 frame it jumps into
sub r1, r1, #1
- b put_h264_qpel8_mc11
- .endfunc
+ b \type\()_h264_qpel8_mc11
+endfunc
-function ff_put_h264_qpel8_mc02_neon, export=1
+@ mc02: vertical half-pel, plain v-lowpass.
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
push {lr}
lowpass_const r3
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
- bl put_h264_qpel8_v_lowpass_neon
+ bl \type\()_h264_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel8_mc12_neon, export=1
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
push {r0, r1, r4, r10, r11, lr}
-put_h264_qpel8_mc12:
+\type\()_h264_qpel8_mc12:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
sub r1, r1, r3, lsl #1
sub r1, r1, #2
sub r2, r4, #64
- bl put_h264_qpel8_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
pop {r4, r10, r11, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel8_mc22_neon, export=1
+@ mc22: center half-pel, plain HV lowpass with a 16x12 stack temp.
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
push {r4, r10, r11, lr}
mov r11, sp
bic sp, sp, #15
sub sp, sp, #(16*12)
mov r4, sp
vpush {d8-d15}
- bl put_h264_qpel8_hv_lowpass_neon
+ bl \type\()_h264_qpel8_hv_lowpass_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel8_mc32_neon, export=1
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
push {r0, r1, r4, r10, r11, lr}
add r1, r1, #1
- b put_h264_qpel8_mc12
- .endfunc
+ b \type\()_h264_qpel8_mc12
+endfunc
-function ff_put_h264_qpel8_mc03_neon, export=1
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
push {lr}
add ip, r1, r2
- b put_h264_qpel8_mc01
- .endfunc
+ b \type\()_h264_qpel8_mc01
+endfunc
-function ff_put_h264_qpel8_mc13_neon, export=1
- push {r0, r1, r2, lr}
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
+ push {r0, r1, r11, lr}
add r1, r1, r2
- b put_h264_qpel8_mc11
- .endfunc
+ b \type\()_h264_qpel8_mc11
+endfunc
-function ff_put_h264_qpel8_mc23_neon, export=1
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
push {r0, r1, r4, r10, r11, lr}
add r1, r1, r2
- b put_h264_qpel8_mc21
- .endfunc
+ b \type\()_h264_qpel8_mc21
+endfunc
-function ff_put_h264_qpel8_mc33_neon, export=1
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
add r1, r1, #1
- push {r0, r1, r2, lr}
+ push {r0, r1, r11, lr}
add r1, r1, r2
sub r1, r1, #1
- b put_h264_qpel8_mc11
- .endfunc
+ b \type\()_h264_qpel8_mc11
+endfunc
+ .endm
-function ff_put_h264_qpel16_mc10_neon, export=1
+ h264_qpel8 put
+ h264_qpel8 avg
+
+@ Exported 16x16 qpel MC entry points ff_\type\()_h264_qpel16_mcXY_neon,
+@ parameterized on \type (put/avg). Mirrors the 8x8 macro above, including
+@ the mc11-family fix: sp saved in r11 and 16-byte-aligned before carving
+@ the 256-byte intermediate buffer.
+ .macro h264_qpel16 type
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
lowpass_const r3
mov r3, r1
sub r1, r1, #2
- b put_h264_qpel16_h_lowpass_l2_neon
- .endfunc
+ b \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
-function ff_put_h264_qpel16_mc20_neon, export=1
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
lowpass_const r3
sub r1, r1, #2
mov r3, r2
- b put_h264_qpel16_h_lowpass_neon
- .endfunc
+ b \type\()_h264_qpel16_h_lowpass_neon
+endfunc
-function ff_put_h264_qpel16_mc30_neon, export=1
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
lowpass_const r3
add r3, r1, #1
sub r1, r1, #2
- b put_h264_qpel16_h_lowpass_l2_neon
- .endfunc
+ b \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
-function ff_put_h264_qpel16_mc01_neon, export=1
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
push {r4, lr}
mov ip, r1
-put_h264_qpel16_mc01:
+\type\()_h264_qpel16_mc01:
lowpass_const r3
mov r3, r2
sub r1, r1, r2, lsl #1
vpush {d8-d15}
- bl put_h264_qpel16_v_lowpass_l2_neon
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
pop {r4, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel16_mc11_neon, export=1
- push {r0, r1, r4, lr}
-put_h264_qpel16_mc11:
+@ mc11 family: h-lowpass into a 256-byte aligned stack temp (always "put" —
+@ intermediate buffer), then v-lowpass-l2 into the real destination.
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
+ push {r0, r1, r4, r11, lr}
+\type\()_h264_qpel16_mc11:
lowpass_const r3
+ mov r11, sp @ save sp; restore point after alignment
+ bic sp, sp, #15 @ 16-byte align for :128 temp stores
sub sp, sp, #256
mov r0, sp
sub r1, r1, #2
mov r3, #16
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon
- add r0, sp, #256
- ldrd r0, [r0, #64]
+ ldrd r0, [r11] @ reload saved r0/r1 args
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #16
- bl put_h264_qpel16_v_lowpass_l2_neon
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
- add sp, sp, #(256+8)
- pop {r4, pc}
- .endfunc
+ add sp, r11, #8 @ drop temp + saved r0/r1
+ pop {r4, r11, pc}
+endfunc
-function ff_put_h264_qpel16_mc21_neon, export=1
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
-put_h264_qpel16_mc21:
+\type\()_h264_qpel16_mc21:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
- bl put_h264_qpel16_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
pop {r4-r5, r9-r11, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel16_mc31_neon, export=1
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
add r1, r1, #1
- push {r0, r1, r4, lr}
+ push {r0, r1, r4, r11, lr} @ must match the mc11 frame it jumps into
sub r1, r1, #1
- b put_h264_qpel16_mc11
- .endfunc
+ b \type\()_h264_qpel16_mc11
+endfunc
-function ff_put_h264_qpel16_mc02_neon, export=1
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
push {r4, lr}
lowpass_const r3
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
- bl put_h264_qpel16_v_lowpass_neon
+ bl \type\()_h264_qpel16_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel16_mc12_neon, export=1
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
-put_h264_qpel16_mc12:
+\type\()_h264_qpel16_mc12:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
sub r1, r1, r3, lsl #1
sub r1, r1, #2
mov r2, r3
- bl put_h264_qpel16_hv_lowpass_l2_neon
+ bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
pop {r4-r5, r9-r11, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel16_mc22_neon, export=1
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
push {r4, r9-r11, lr}
lowpass_const r3
mov r11, sp
sub sp, sp, #(16*12)
mov r4, sp
vpush {d8-d15}
- bl put_h264_qpel16_hv_lowpass_neon
+ bl \type\()_h264_qpel16_hv_lowpass_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r9-r11, pc}
- .endfunc
+endfunc
-function ff_put_h264_qpel16_mc32_neon, export=1
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
add r1, r1, #1
- b put_h264_qpel16_mc12
- .endfunc
+ b \type\()_h264_qpel16_mc12
+endfunc
-function ff_put_h264_qpel16_mc03_neon, export=1
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
push {r4, lr}
add ip, r1, r2
- b put_h264_qpel16_mc01
- .endfunc
+ b \type\()_h264_qpel16_mc01
+endfunc
-function ff_put_h264_qpel16_mc13_neon, export=1
- push {r0, r1, r4, lr}
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
+ push {r0, r1, r4, r11, lr}
add r1, r1, r2
- b put_h264_qpel16_mc11
- .endfunc
+ b \type\()_h264_qpel16_mc11
+endfunc
-function ff_put_h264_qpel16_mc23_neon, export=1
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
add r1, r1, r2
- b put_h264_qpel16_mc21
- .endfunc
+ b \type\()_h264_qpel16_mc21
+endfunc
-function ff_put_h264_qpel16_mc33_neon, export=1
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
add r1, r1, #1
- push {r0, r1, r4, lr}
+ push {r0, r1, r4, r11, lr}
add r1, r1, r2
sub r1, r1, #1
- b put_h264_qpel16_mc11
- .endfunc
+ b \type\()_h264_qpel16_mc11
+endfunc
+ .endm
+
+ h264_qpel16 put
+ h264_qpel16 avg
@ Biweighted prediction
biweight_\w vmlsl.u8, vmlsl.u8
40: rsb r5, r5, #0
biweight_\w vmlsl.u8, vmlal.u8
- .endfunc
+endfunc
.endm
.macro biweight_entry w, h, b=1
.if \b
b biweight_h264_pixels_\w\()_neon
.endif
- .endfunc
+endfunc
.endm
biweight_entry 16, 8
biweight_entry 4, 2
biweight_entry 4, 4, b=0
biweight_func 4
+
+@ Weighted prediction
+
+@ Weighted-prediction inner loop, 16 pixels wide, two rows per iteration.
+@ Reads from r0, writes the weighted result to r4 (set to the same block
+@ by weight_func, so this is an in-place update). \add is the offset/shift
+@ combining instruction chosen by weight_func (vadd/vsub/vhadd/vhsub.s16).
+@ Regs: d0 = weight (u8 splat), q8 = offset, q9 = shift for vrshl,
+@        ip = remaining row count, r1 = stride.
+ .macro weight_16 add
+ vdup.8 d0, r3
+1: subs ip, ip, #2
+ vld1.8 {d20-d21},[r0,:128], r1
+ vmull.u8 q2, d0, d20 @ widen: pixel * weight -> s16
+ pld [r0]
+ vmull.u8 q3, d0, d21
+ vld1.8 {d28-d29},[r0,:128], r1
+ vmull.u8 q12, d0, d28
+ pld [r0]
+ vmull.u8 q13, d0, d29
+ \add q2, q8, q2 @ apply offset
+ vrshl.s16 q2, q2, q9 @ rounding shift by -log2_denom
+ \add q3, q8, q3
+ vrshl.s16 q3, q3, q9
+ vqmovun.s16 d4, q2 @ saturate back to u8
+ vqmovun.s16 d5, q3
+ \add q12, q8, q12
+ vrshl.s16 q12, q12, q9
+ \add q13, q8, q13
+ vrshl.s16 q13, q13, q9
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vst1.8 {d4- d5}, [r4,:128], r1
+ vst1.8 {d24-d25},[r4,:128], r1
+ bne 1b
+ pop {r4, pc}
+ .endm
+
+@ Weighted-prediction inner loop, 8 pixels wide, two rows per iteration.
+@ Same register contract as weight_16 (see above): in-place r0 -> r4,
+@ \add applies the offset, q9 holds the rounding-shift amount.
+ .macro weight_8 add
+ vdup.8 d0, r3
+1: subs ip, ip, #2
+ vld1.8 {d4},[r0,:64], r1
+ vmull.u8 q1, d0, d4 @ pixel * weight -> s16
+ pld [r0]
+ vld1.8 {d6},[r0,:64], r1
+ vmull.u8 q10, d0, d6
+ \add q1, q8, q1
+ pld [r0]
+ vrshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1 @ saturate back to u8
+ \add q10, q8, q10
+ vrshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vst1.8 {d2},[r4,:64], r1
+ vst1.8 {d4},[r4,:64], r1
+ bne 1b
+ pop {r4, pc}
+ .endm
+
+@ Weighted-prediction inner loop, 4 pixels wide, four rows per iteration
+@ with a two-row tail at label 2 (taken when fewer than 4 rows remain).
+@ q1/q10 are pre-seeded with q8 so both are defined on every path.
+@ Same register contract as weight_16.
+ .macro weight_4 add
+ vdup.8 d0, r3
+ vmov q1, q8
+ vmov q10, q8
+1: subs ip, ip, #4
+ vld1.32 {d4[0]},[r0,:32], r1
+ vld1.32 {d4[1]},[r0,:32], r1
+ vmull.u8 q1, d0, d4
+ pld [r0]
+ blt 2f @ < 4 rows left: finish just the first pair
+ vld1.32 {d6[0]},[r0,:32], r1
+ vld1.32 {d6[1]},[r0,:32], r1
+ vmull.u8 q10, d0, d6
+ pld [r0]
+ \add q1, q8, q1
+ vrshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ \add q10, q8, q10
+ vrshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vmov q10, q8
+ vst1.32 {d2[0]},[r4,:32], r1
+ vst1.32 {d2[1]},[r4,:32], r1
+ vmov q1, q8
+ vst1.32 {d4[0]},[r4,:32], r1
+ vst1.32 {d4[1]},[r4,:32], r1
+ bne 1b
+ pop {r4, pc}
+2: \add q1, q8, q1 @ two-row tail
+ vrshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vst1.32 {d2[0]},[r4,:32], r1
+ vst1.32 {d2[1]},[r4,:32], r1
+ pop {r4, pc}
+ .endm
+
+@ Weighted-prediction dispatcher for width \w.
+@ In: r0 = block (read and written in place via r4 = r0), r1 = stride,
+@     r2 = log2_denom, r3 = weight, [sp,#8] = offset (5th arg, after the
+@     push below); ip = row count (set by the ff_ entry stub).
+@ Selects one of four inner-loop variants: for log2_denom > 1 the halving
+@ vhadd/vhsub with shift (1 - log2_denom) is used, otherwise plain
+@ vadd/vsub with shift -log2_denom; negative weights take the sub path
+@ with the weight negated.
+ .macro weight_func w
+function weight_h264_pixels_\w\()_neon
+ push {r4, lr}
+ ldr r4, [sp, #8] @ offset argument
+ cmp r2, #1
+ lsl r4, r4, r2 @ offset <<= log2_denom
+ vdup.16 q8, r4
+ mov r4, r0 @ r4 = dst cursor (in-place)
+ ble 20f @ log2_denom <= 1 -> plain add/sub path
+ rsb lr, r2, #1 @ shift = 1 - log2_denom (halving ops absorb one bit)
+ vdup.16 q9, lr
+ cmp r3, #0
+ blt 10f
+ weight_\w vhadd.s16
+10: rsb r3, r3, #0 @ negate weight for the subtract variant
+ weight_\w vhsub.s16
+20: rsb lr, r2, #0 @ shift = -log2_denom
+ vdup.16 q9, lr
+ cmp r3, #0
+ blt 10f
+ weight_\w vadd.s16
+10: rsb r3, r3, #0
+ weight_\w vsub.s16
+endfunc
+ .endm
+
+@ Exported WxH entry stub: loads the row count and falls through (b=0) or
+@ branches (b=1) into the shared width-\w worker directly above it.
+ .macro weight_entry w, h, b=1
+function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
+ mov ip, #\h
+.if \b
+ b weight_h264_pixels_\w\()_neon
+.endif
+endfunc
+ .endm
+
+@ Instantiations: each width's b=0 stub must immediately precede its
+@ weight_func so execution falls through into the worker.
+ weight_entry 16, 8
+ weight_entry 16, 16, b=0
+ weight_func 16
+
+ weight_entry 8, 16
+ weight_entry 8, 4
+ weight_entry 8, 8, b=0
+ weight_func 8
+
+ weight_entry 4, 8
+ weight_entry 4, 2
+ weight_entry 4, 4, b=0
+ weight_func 4