From: Damien Fouilleul
Date: Wed, 15 Aug 2007 16:15:45 +0000 (+0000)
Subject: video chromas: finalize SSE2 improvements
X-Git-Tag: 0.9.0-test0~6521
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=df3b5eec76004d2dcfbabc36ac65f64755bdbc33;p=vlc

video chromas: finalize SSE2 improvements
---

diff --git a/AUTHORS b/AUTHORS
index cf8e02eef7..d9595ecdb4 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -190,6 +190,9 @@ E: Damien.Fouilleul@laposte.net
 C: Quovodis
 D: ActiveX control
 D: Safari/Firefox plugin for MacOS X
+D: Direct3D Video output
+D: SSE2 chroma converters
+D: improved MMX chroma converters
 S: Ireland
 
 N: Alexis Guillard
diff --git a/NEWS b/NEWS
index d0cf308cd4..a12068786d 100644
--- a/NEWS
+++ b/NEWS
@@ -81,6 +81,8 @@ Video output and filters:
    was previously part of the mosaic module.
  * Fix random characters problem in RSS filter.
  * Add rotate-deciangle for more precision on rotate filter
+ * Support for Intel SSE2 instruction set in chroma converters
+ * Improved use of Intel MMX instruction set in chroma converters
 
 Audio output
  * Replay gain support.
diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h
index 1f35a3061f..a630157dc2 100644
--- a/modules/video_chroma/i420_yuy2.h
+++ b/modules/video_chroma/i420_yuy2.h
@@ -366,8 +366,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
 #define SSE2_YUV420_YUYV_UNALIGNED \
     xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
     xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
     _mm_prefetch(p_line1, _MM_HINT_NTA); \
     _mm_prefetch(p_line2, _MM_HINT_NTA); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
@@ -402,8 +402,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
 #define SSE2_YUV420_YVYU_UNALIGNED \
     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
     xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
     _mm_prefetch(p_line1, _MM_HINT_NTA); \
     _mm_prefetch(p_line2, _MM_HINT_NTA); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
@@ -439,8 +439,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
 #define SSE2_YUV420_UYVY_UNALIGNED \
     xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
     xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
     _mm_prefetch(p_line1, _MM_HINT_NTA); \
     _mm_prefetch(p_line2, _MM_HINT_NTA); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
diff --git a/modules/video_chroma/i422_yuy2.c b/modules/video_chroma/i422_yuy2.c
index c255079f31..84eaf90aac 100644
--- a/modules/video_chroma/i422_yuy2.c
+++ b/modules/video_chroma/i422_yuy2.c
@@ -442,6 +442,61 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 
     int i_x, i_y;
 
+    const int i_source_margin = p_source->p[0].i_pitch
+                     - p_source->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_source->p[1].i_pitch
+                     - p_source->p[1].i_visible_pitch;
+    const int i_dest_margin = p_dest->p->i_pitch
+                     - p_dest->p->i_visible_pitch;
+
+#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+                    ((int)p_line|(int)p_y))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            p_line -= 2 * p_dest->p->i_pitch;
+
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            p_line -= 2 * p_dest->p->i_pitch;
+
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    SSE2_END;
+
+#else
+
     for( i_y = p_vout->render.i_height ; i_y-- ; )
     {
         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
@@ -457,12 +512,18 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV422_UYVY );
 #endif
         }
+        p_y += i_source_margin;
+        p_u += i_source_margin_c;
+        p_v += i_source_margin_c;
+        p_line += i_dest_margin;
     }
 #if defined (MODULE_NAME_IS_i422_yuy2_mmx)
     MMX_END;
 #elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
     SSE2_END;
 #endif
+
+#endif
 }
 
 /*****************************************************************************
diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h
index 1b0405bb27..68057c7c04 100644
--- a/modules/video_chroma/i422_yuy2.h
+++ b/modules/video_chroma/i422_yuy2.h
@@ -233,9 +233,82 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\
 
 #include <emmintrin.h>
 
+#define SSE2_CALL(SSE2_INSTRUCTIONS) \
+    do { \
+        __m128i xmm0, xmm1, xmm2; \
+        SSE2_INSTRUCTIONS \
+        p_line += 32; p_y += 16; \
+        p_u += 8; p_v += 8; \
+    } while(0)
 
 #define SSE2_END _mm_sfence()
 
+#define SSE2_YUV422_YUYV_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YUYV_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YVYU_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YVYU_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_UYVY_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm1; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm1);
+
+#define SSE2_YUV422_UYVY_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm1; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);
+
 #endif
 
 #endif
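
For readers who want to see the intrinsics above in context, the following self-contained sketch packs one row of planar 4:2:2 YUV into YUYV the same way the SSE2_YUV422_YUYV_* macros and the I422 converter loops do: eight U and eight V bytes are fetched with MOVQ, interleaved against sixteen Y bytes with the unpack instructions, and written out with streaming stores on the aligned path or unaligned stores otherwise, with an sfence at the end (the patch's SSE2_END). This is not code from the commit; the function name pack_row_yuyv_sse2, the i_width parameter and the uintptr_t-based alignment test are illustrative stand-ins for the patch's per-module macros and its (int)-cast pitch/pointer check.

/* Illustrative sketch only -- not part of the commit. */
#include <emmintrin.h>
#include <stdint.h>

static void pack_row_yuyv_sse2( uint8_t *p_line, const uint8_t *p_y,
                                const uint8_t *p_u, const uint8_t *p_v,
                                int i_width )
{
    /* Same idea as the patch's 16-byte alignment test on pitches and
     * pointers (the patch casts to int; uintptr_t is used here). */
    const int b_aligned = 0 == ( 15 & ( (uintptr_t)p_line | (uintptr_t)p_y ) );
    int i_x;

    for( i_x = i_width / 16 ; i_x-- ; )   /* 16 luma samples per iteration */
    {
        __m128i xmm1 = _mm_loadl_epi64( (const __m128i *)p_u );  /* 8 U bytes */
        __m128i xmm2 = _mm_loadl_epi64( (const __m128i *)p_v );  /* 8 V bytes */
        __m128i xmm0 = b_aligned ? _mm_load_si128( (const __m128i *)p_y )
                                 : _mm_loadu_si128( (const __m128i *)p_y );

        xmm1 = _mm_unpacklo_epi8( xmm1, xmm2 );   /* U0 V0 U1 V1 ...           */
        xmm2 = _mm_unpacklo_epi8( xmm0, xmm1 );   /* Y0 U0 Y1 V0 ... (low 8 Y)  */
        xmm0 = _mm_unpackhi_epi8( xmm0, xmm1 );   /* Y8 U4 Y9 V4 ... (high 8 Y) */

        if( b_aligned )
        {
            /* non-temporal stores, as in the *_ALIGNED macros */
            _mm_stream_si128( (__m128i *)p_line, xmm2 );
            _mm_stream_si128( (__m128i *)(p_line + 16), xmm0 );
        }
        else
        {
            _mm_storeu_si128( (__m128i *)p_line, xmm2 );
            _mm_storeu_si128( (__m128i *)(p_line + 16), xmm0 );
        }
        p_line += 32; p_y += 16; p_u += 8; p_v += 8;  /* same strides as SSE2_CALL */
    }

    /* Plain C tail for the leftover pixel pairs, in the spirit of the
     * patch's C_YUV422_* fallback macros. */
    for( i_x = ( i_width % 16 ) / 2 ; i_x-- ; )
    {
        *p_line++ = *p_y++; *p_line++ = *p_u++;
        *p_line++ = *p_y++; *p_line++ = *p_v++;
    }

    _mm_sfence();   /* SSE2_END: make the streaming stores globally visible */
}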