C: Quovodis
D: ActiveX control
D: Safari/Firefox plugin for Mac OS X
+D: Direct3D video output
+D: SSE2 chroma converters
+D: Improved MMX chroma converters
S: Ireland
N: Alexis Guillard
was previously part of the mosaic module.
 * Fix random characters problem in the RSS filter.
 * Add rotate-deciangle option for finer precision in the rotate filter.
+ * Support for the Intel SSE2 instruction set in chroma converters
+ * Improved use of the Intel MMX instruction set in chroma converters
Audio output:
 * Replay gain support.
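+/* The *_UNALIGNED variants must not use movdqa: _mm_load_si128 on an
+ * address that is not 16-byte aligned faults at run time, so the Y rows
+ * are fetched with movdqu (_mm_loadu_si128) instead. */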
#define SSE2_YUV420_YUYV_UNALIGNED \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
    _mm_prefetch(p_line1, _MM_HINT_NTA); \
    _mm_prefetch(p_line2, _MM_HINT_NTA); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
#define SSE2_YUV420_YVYU_UNALIGNED \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
    _mm_prefetch(p_line1, _MM_HINT_NTA); \
    _mm_prefetch(p_line2, _MM_HINT_NTA); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
#define SSE2_YUV420_UYVY_UNALIGNED \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
    _mm_prefetch(p_line1, _MM_HINT_NTA); \
    _mm_prefetch(p_line2, _MM_HINT_NTA); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
    int i_x, i_y;
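+    /* bytes from the end of one visible row to the start of the next one
+     * (pitch minus visible pitch) for the source and destination planes */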
+    const int i_source_margin = p_source->p[0].i_pitch
+                                 - p_source->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_source->p[1].i_pitch
+                                 - p_source->p[1].i_visible_pitch;
+    const int i_dest_margin = p_dest->p->i_pitch
+                               - p_dest->p->i_visible_pitch;
+
+#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
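+    /* the aligned fast path requires the pitches and the current source
+     * and destination pointers to be multiples of 16 bytes, tested here
+     * in a single mask (intptr_t keeps the casts valid on 64-bit) */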
+    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+                    ((intptr_t)p_line|(intptr_t)p_y))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            p_line -= 2 * p_dest->p->i_pitch;
+
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            p_line -= 2 * p_dest->p->i_pitch;
+
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
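+    /* flush the write-combining buffers filled by the non-temporal stores */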
+    SSE2_END;
+
+#else
+
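+    /* plain C and MMX module variants keep the original conversion loop */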
    for( i_y = p_vout->render.i_height ; i_y-- ; )
    {
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
            MMX_CALL( MMX_YUV422_UYVY );
#endif
        }
+        p_y += i_source_margin;
+        p_u += i_source_margin_c;
+        p_v += i_source_margin_c;
+        p_line += i_dest_margin;
    }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
    MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
    SSE2_END;
#endif
+
+#endif
}
/*****************************************************************************
#include <emmintrin.h>
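+/* execute one 16-pixel conversion step, then advance the pointers:
+ * 32 output bytes, 16 Y samples and 8 U/V samples per iteration */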
+#define SSE2_CALL(SSE2_INSTRUCTIONS) \
+    do { \
+        __m128i xmm0, xmm1, xmm2; \
+        SSE2_INSTRUCTIONS \
+        p_line += 32; p_y += 16; \
+        p_u += 8; p_v += 8; \
+    } while(0)
#define SSE2_END _mm_sfence()
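+
+/* interleave 8 U and 8 V samples, then merge them with 16 Y samples into
+ * two packed 4:2:2 vectors; the ALIGNED variants use movdqa loads and
+ * movntdq streaming stores, which require 16-byte aligned addresses,
+ * while the UNALIGNED variants use movdqu loads and stores */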
+#define SSE2_YUV422_YUYV_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YUYV_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
+
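+/* YVYU swaps the chroma bytes of YUYV: V is interleaved into the low
+ * chroma position, so each four-byte group comes out as Y,V,Y,U */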
+#define SSE2_YUV422_YVYU_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YVYU_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
+
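+/* UYVY stores chroma first: the interleaved U/V vector is unpacked
+ * against the Y samples, so each four-byte group comes out as U,Y,V,Y */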
+#define SSE2_YUV422_UYVY_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm1; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm1);
+
+#define SSE2_YUV422_UYVY_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm1; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);
+
#endif
#endif