]> git.sesse.net Git - vlc/commitdiff
deinterlace: ARM optimizations for 16-bits merge
author: Rémi Denis-Courmont <remi@remlab.net>
Sat, 4 Aug 2012 16:09:16 +0000 (19:09 +0300)
committer: Rémi Denis-Courmont <remi@remlab.net>
Sat, 4 Aug 2012 16:09:16 +0000 (19:09 +0300)
modules/video_filter/deinterlace/deinterlace.c
modules/video_filter/deinterlace/merge.h
modules/video_filter/deinterlace/merge_arm.S

index 9d52f995b49f9926fb6189a283910b1cc6b544db..4cfb86a8771cb8ed9e8c6b2274267ccf66656c59 100644 (file)
@@ -657,11 +657,13 @@ int Open( vlc_object_t *p_this )
     else
 #endif
 #if defined(__arm__)
-    if( chroma->pixel_size == 1 && vlc_CPU_ARM_NEON() )
-        p_sys->pf_merge = merge8_arm_neon;
+    if( vlc_CPU_ARM_NEON() )
+        p_sys->pf_merge =
+            (chroma->pixel_size == 1) ? merge8_arm_neon : merge16_arm_neon;
     else
-    if( chroma->pixel_size == 1 && vlc_CPU_ARMv6() )
-        p_sys->pf_merge = merge8_armv6;
+    if( vlc_CPU_ARMv6() )
+        p_sys->pf_merge =
+            (chroma->pixel_size == 1) ? merge8_armv6 : merge16_armv6;
     else
 #endif
     {
index 04634dbb1880e113c08ed48ae56ed8dbab484291..ca0c61c828e2ad3393cb63fbb9e82f010878ea80 100644 (file)
@@ -163,11 +163,13 @@ void Merge16BitSSE2( void *, const void *, const void *, size_t );
  * ARM NEON routine to blend pixels from two picture lines.
  */
 void merge8_arm_neon (void *, const void *, const void *, size_t);
+void merge16_arm_neon (void *, const void *, const void *, size_t);
 
 /**
  * ARMv6 SIMD routine to blend pixels from two picture lines.
  */
 void merge8_armv6 (void *, const void *, const void *, size_t);
+void merge16_armv6 (void *, const void *, const void *, size_t);
 #endif
 
 /*****************************************************************************
index 80c652bc99953e9b368949e6d27b9b8b3eee7003..04dc068921051785ddc59baf98e3519ea7fb53bd 100644 (file)
@@ -71,6 +71,47 @@ merge8_arm_neon:
        vst1.u8         {q0},           [DEST,:128]!
        bx              lr
 
+       .align 2
+       .global merge16_arm_neon
+       .type   merge16_arm_neon, %function
+merge16_arm_neon:      /* NEON blend of two lines: each unsigned 16-bit lane of DEST = (SRC1 + SRC2) >> 1. DEST/SRC1/SRC2/SIZE are defined outside this hunk; presumably the four C arguments in order (TODO confirm). SIZE is counted in bytes. */
+       cmp             SIZE,   #64     /* fewer than 64 bytes left: skip the unrolled loop */
+       blo             2f
+1:     /* main loop: 64 bytes from each source per iteration */
+       pld             [SRC1, #64]     /* preload hint for the data 64 bytes ahead */
+       vld1.u16        {q0-q1},        [SRC1,:128]!    /* :128 asserts a 128-bit aligned address; NOTE(review): callers must pass 16-byte aligned pointers - confirm */
+       pld             [SRC2, #64]
+       vld1.u16        {q8-q9},        [SRC2,:128]!
+       vhadd.u16       q0,     q0,     q8      /* halving add: (a + b) >> 1 per lane, no rounding */
+       sub             SIZE,   SIZE,   #64     /* plain sub (no flags); the loop test is the cmp below */
+       vld1.u16        {q2-q3},        [SRC1,:128]!    /* second 32 bytes; loads are interleaved with arithmetic, presumably to hide load latency */
+       vhadd.u16       q1,     q1,     q9
+       vld1.u16        {q10-q11},      [SRC2,:128]!
+       vhadd.u16       q2,     q2,     q10
+       cmp             SIZE,   #64     /* sets the flags consumed by bhs; the NEON ops in between leave them untouched */
+       vhadd.u16       q3,     q3,     q11
+       vst1.u16        {q0-q1},        [DEST,:128]!
+       vst1.u16        {q2-q3},        [DEST,:128]!
+       bhs             1b      /* another full 64-byte chunk remains */
+2:     /* here SIZE < 64: handle one optional 32-byte chunk */
+       cmp             SIZE,   #32
+       blo             3f
+       vld1.u16        {q0-q1},        [SRC1,:128]!
+       sub             SIZE,   SIZE,   #32
+       vld1.u16        {q8-q9},        [SRC2,:128]!
+       vhadd.u16       q0,     q0,     q8
+       vhadd.u16       q1,     q1,     q9
+       vst1.u16        {q0-q1},        [DEST,:128]!
+3:     /* here SIZE < 32: handle one optional 16-byte chunk */
+       cmp             SIZE,   #16
+       bxlo            lr      /* return early; a tail shorter than 16 bytes is left unprocessed */
+       vld1.u16        {q0},           [SRC1,:128]!
+       sub             SIZE,   SIZE,   #16
+       vld1.u16        {q8},           [SRC2,:128]!
+       vhadd.u16       q0,     q0,     q8
+       vst1.u16        {q0},           [DEST,:128]!
+       bx              lr      /* unconditional return: any remaining SIZE (< 16) is ignored */
+
        .align 2
        .global merge8_armv6
        .type   merge8_armv6, %function
@@ -92,3 +133,25 @@ merge8_armv6:
        stm             DEST!,  {r6-r7}
        popeq           {r4-r9,pc}
        b               1b
+
+       .align 2
+       .global merge16_armv6
+       .type   merge16_armv6, %function
+merge16_armv6:         /* ARMv6 SIMD blend of two lines: each unsigned 16-bit sample of DEST = (SRC1 + SRC2) >> 1, 16 bytes per iteration. DEST/SRC1/SRC2/SIZE are defined outside this hunk; presumably the four C arguments in order (TODO confirm). */
+       push            {r4-r9,lr}      /* save the registers used as data registers below; lr is included because it is reused as scratch */
+1:     /* loop: 16 bytes (8 samples) from each source per iteration */
+       pld             [SRC1, #64]     /* preload hint for the data 64 bytes ahead */
+       ldm             SRC1!,  {r4-r5}         /* first 8 bytes of SRC1, pointer post-incremented */
+       pld             [SRC2, #64]
+       ldm             SRC2!,  {r8-r9}         /* first 8 bytes of SRC2 */
+       subs            SIZE,   SIZE,   #16     /* Z is set when this is the last chunk; nothing below changes the flags before popeq */
+       uhadd16         r4,     r4,     r8      /* unsigned halving add on both 16-bit halves: (a + b) >> 1 */
+       ldm             SRC1!,  {r6-r7}         /* second 8 bytes of SRC1 */
+       uhadd16         r5,     r5,     r9
+       ldm             SRC2!,  {ip,lr}         /* second 8 bytes of SRC2, using ip and lr as scratch */
+       uhadd16         r6,     r6,     ip
+       stm             DEST!,  {r4-r5}
+       uhadd16         r7,     r7,     lr
+       stm             DEST!,  {r6-r7}
+       popeq           {r4-r9,pc}      /* SIZE reached exactly 0: restore registers and return via the saved lr */
+       b               1b      /* NOTE(review): the only exit is SIZE == 0, so SIZE must be a non-zero multiple of 16 or the loop runs past the buffers - confirm callers guarantee this */