else
#endif
#if defined(__arm__)
- if( chroma->pixel_size == 1 && vlc_CPU_ARM_NEON() )
- p_sys->pf_merge = merge8_arm_neon;
+ if( vlc_CPU_ARM_NEON() )
+ p_sys->pf_merge =
+ (chroma->pixel_size == 1) ? merge8_arm_neon : merge16_arm_neon;
else
- if( chroma->pixel_size == 1 && vlc_CPU_ARMv6() )
- p_sys->pf_merge = merge8_armv6;
+ if( vlc_CPU_ARMv6() )
+ p_sys->pf_merge =
+ (chroma->pixel_size == 1) ? merge8_armv6 : merge16_armv6;
else
#endif
{
* ARM NEON routine to blend pixels from two picture lines.
*/
void merge8_arm_neon (void *, const void *, const void *, size_t);
+void merge16_arm_neon (void *, const void *, const void *, size_t);
/**
* ARMv6 SIMD routine to blend pixels from two picture lines.
*/
void merge8_armv6 (void *, const void *, const void *, size_t);
+void merge16_armv6 (void *, const void *, const void *, size_t);
#endif
/*****************************************************************************
vst1.u8 {q0}, [DEST,:128]!
bx lr
+ .align 2
+ .global merge16_arm_neon
+ .type merge16_arm_neon, %function
+ @ void merge16_arm_neon (void *, const void *, const void *, size_t);
+ @
+ @ Averages two buffers lane by lane: every unsigned 16-bit lane written
+ @ to DEST is (lane from SRC1 + lane from SRC2) >> 1, as computed by
+ @ vhadd.u16 (halving add, result truncated).
+ @
+ @ DEST, SRC1, SRC2 and SIZE are defined outside this hunk; presumably they
+ @ alias the four argument registers in prototype order - TODO confirm.
+ @ SIZE is a byte count: each step below subtracts exactly the number of
+ @ bytes it loads from each source.
+ @
+ @ Every load and store carries the :128 qualifier, so all three pointers
+ @ must be 16-byte aligned on entry.
+ @ A tail of fewer than 16 bytes is left unprocessed.
+ @
+ @ Registers written: q0-q3, q8-q11, the condition flags, and the four
+ @ pointer/size registers. The stack is not used.
+merge16_arm_neon:
+ @ Fewer than 64 bytes: skip the unrolled main loop.
+ cmp SIZE, #64
+ blo 2f
+1:
+ @ Main loop: 64 bytes (32 lanes) per iteration, as two 32-byte halves.
+ @ The pld hints prefetch 64 bytes ahead of each source pointer.
+ pld [SRC1, #64]
+ vld1.u16 {q0-q1}, [SRC1,:128]!
+ pld [SRC2, #64]
+ vld1.u16 {q8-q9}, [SRC2,:128]!
+ vhadd.u16 q0, q0, q8
+ sub SIZE, SIZE, #64
+ vld1.u16 {q2-q3}, [SRC1,:128]!
+ vhadd.u16 q1, q1, q9
+ vld1.u16 {q10-q11}, [SRC2,:128]!
+ vhadd.u16 q2, q2, q10
+ @ Flags set here are consumed by the bhs below; the NEON instructions
+ @ in between do not modify them.
+ cmp SIZE, #64
+ vhadd.u16 q3, q3, q11
+ vst1.u16 {q0-q1}, [DEST,:128]!
+ vst1.u16 {q2-q3}, [DEST,:128]!
+ @ Keep looping while at least 64 bytes remain.
+ bhs 1b
+2:
+ @ Here SIZE < 64. Handle one 32-byte chunk if present.
+ cmp SIZE, #32
+ blo 3f
+ vld1.u16 {q0-q1}, [SRC1,:128]!
+ sub SIZE, SIZE, #32
+ vld1.u16 {q8-q9}, [SRC2,:128]!
+ vhadd.u16 q0, q0, q8
+ vhadd.u16 q1, q1, q9
+ vst1.u16 {q0-q1}, [DEST,:128]!
+3:
+ @ Here SIZE < 32. Return if less than one 16-byte chunk is left,
+ @ otherwise handle that final chunk and return.
+ cmp SIZE, #16
+ bxlo lr
+ vld1.u16 {q0}, [SRC1,:128]!
+ sub SIZE, SIZE, #16
+ vld1.u16 {q8}, [SRC2,:128]!
+ vhadd.u16 q0, q0, q8
+ vst1.u16 {q0}, [DEST,:128]!
+ bx lr
+
.align 2
.global merge8_armv6
.type merge8_armv6, %function
stm DEST!, {r6-r7}
popeq {r4-r9,pc}
b 1b
+
+ .align 2
+ .global merge16_armv6
+ .type merge16_armv6, %function
+ @ void merge16_armv6 (void *, const void *, const void *, size_t);
+ @
+ @ Averages two buffers using integer registers only: every unsigned
+ @ 16-bit halfword written to DEST is (halfword from SRC1 + halfword from
+ @ SRC2) >> 1, as computed by uhadd16 on two halfwords per register.
+ @
+ @ DEST, SRC1, SRC2 and SIZE are defined outside this hunk; presumably they
+ @ alias the four argument registers in prototype order - TODO confirm.
+ @ SIZE is a byte count: each iteration moves 16 bytes and subtracts 16.
+ @
+ @ NOTE(review): the loop body runs before SIZE is tested, and the only
+ @ exit is SIZE reaching exactly zero. A SIZE of 0 or one that is not a
+ @ multiple of 16 would therefore not stop at the end of the buffer.
+ @ This looks like it relies on the caller always passing a non-zero
+ @ multiple of 16 - confirm against the call site.
+ @
+ @ r4-r9 and lr are saved on entry and restored on exit; ip is used as
+ @ scratch without being saved.
+merge16_armv6:
+ @ lr is pushed because it is reused as a data register below; the
+ @ function returns by popping the saved value straight into pc.
+ push {r4-r9,lr}
+1:
+ @ First 8 bytes of each source. The pld hints prefetch 64 bytes ahead.
+ pld [SRC1, #64]
+ ldm SRC1!, {r4-r5}
+ pld [SRC2, #64]
+ ldm SRC2!, {r8-r9}
+ @ Update the remaining byte count. The Z flag set here is consumed by
+ @ the popeq at the bottom of the loop; the ldm, uhadd16 and stm
+ @ instructions in between do not update the condition flags.
+ subs SIZE, SIZE, #16
+ uhadd16 r4, r4, r8
+ @ Second 8 bytes of each source.
+ ldm SRC1!, {r6-r7}
+ uhadd16 r5, r5, r9
+ ldm SRC2!, {ip,lr}
+ uhadd16 r6, r6, ip
+ stm DEST!, {r4-r5}
+ uhadd16 r7, r7, lr
+ stm DEST!, {r6-r7}
+ @ SIZE hit zero: restore the saved registers and return.
+ popeq {r4-r9,pc}
+ b 1b