]> git.sesse.net Git - vlc/commitdiff
deinterlace: preliminary merge ARM NEON optimization
authorRémi Denis-Courmont <remi@remlab.net>
Sat, 12 Sep 2009 16:00:49 +0000 (19:00 +0300)
committerRémi Denis-Courmont <remi@remlab.net>
Sat, 12 Sep 2009 16:03:25 +0000 (19:03 +0300)
(It wouldn't hurt to make VLC align pixel lines more strongly, but I am
 not sure if this is feasible)

modules/video_filter/deinterlace.c

index 7a92d939689648856494256205310c6e469b4ac7..1de201e3991593c2636b1f41db58ec396deaf593 100644 (file)
@@ -93,6 +93,9 @@ static void EndMMX       ( void );
 #if defined(CAN_COMPILE_3DNOW)
 static void End3DNow     ( void );
 #endif
+#if defined __ARM_NEON__
+static void MergeNEON (void *, const void *, const void *, size_t);
+#endif
 
 static void SetFilterMethod( vout_thread_t *p_vout, const char *psz_method );
 static vout_thread_t *SpawnRealVout( vout_thread_t *p_vout );
@@ -244,6 +247,14 @@ static int Create( vlc_object_t *p_this )
         p_sys->pf_end_merge = End3DNow;
     }
     else
+#endif
+#if defined __ARM_NEON__
+    if( vlc_CPU() & CPU_CAPABILITY_NEON )
+    {
+        p_sys->pf_merge = MergeNEON;
+        p_sys->pf_end_merge = NULL;
+    }
+    else
 #endif
     {
         p_sys->pf_merge = MergeGeneric;
@@ -1119,6 +1130,64 @@ static void MergeAltivec( void *_p_dest, const void *_p_s1,
 }
 #endif
 
+#ifdef __ARM_NEON__
+static void MergeNEON (void *restrict out, const void *in1,
+                       const void *in2, size_t n)
+{
+    uint8_t *outp = out;
+    const uint8_t *in1p = in1;
+    const uint8_t *in2p = in2;
+    size_t mis = ((uintptr_t)outp) & 15;
+
+    if (mis)
+    {
+        MergeGeneric (outp, in1p, in2p, mis);
+        outp += mis;
+        in1p += mis;
+        in2p += mis;
+        n -= mis;
+    }
+
+    uint8_t *end = outp + (n & ~15);
+
+    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
+        while (outp < end)
+            asm volatile (
+                "vld1.u8  {q0-q1}, [%[in1]]!\n"
+                "vld1.u8  {q2-q3}, [%[in2]]!\n"
+                "vhadd.u8 q4, q0, q2\n"
+                "vld1.u8  {q6-q7}, [%[in1]]!\n"
+                "vhadd.u8 q5, q1, q3\n"
+                "vld1.u8  {q8-q9}, [%[in2]]!\n"
+                "vhadd.u8 q10, q6, q8\n"
+                "vhadd.u8 q11, q7, q9\n"
+                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
+                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
+                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
+                :
+                : "q0", "q1", "q2", "memory");
+    else
+         while (outp < end)
+            asm volatile (
+                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
+                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
+                "vhadd.u8 q4, q0, q2\n"
+                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
+                "vhadd.u8 q5, q1, q3\n"
+                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
+                "vhadd.u8 q10, q6, q8\n"
+                "vhadd.u8 q11, q7, q9\n"
+                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
+                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
+                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
+                :
+                : "q0", "q1", "q2", "memory");
+    n &= 15;
+    if (n)
+        MergeGeneric (outp, in1p, in2p, n);
+}
+#endif
+
 /*****************************************************************************
  * RenderX: This algo works on a 8x8 block basic, it copies the top field
  * and apply a process to recreate the bottom field :