From: Damien Fouilleul Date: Sat, 16 Jun 2007 22:13:47 +0000 (+0000) Subject: video_chroma: a few SSE2 fixes X-Git-Tag: 0.9.0-test0~7030 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=f4f90e674b23ba5a949d0bffd942451685d31907;p=vlc video_chroma: a few SSE2 fixes --- diff --git a/modules/video_chroma/i420_rgb16.c b/modules/video_chroma/i420_rgb16.c index 27f0715a49..f9fc4fb02c 100644 --- a/modules/video_chroma/i420_rgb16.c +++ b/modules/video_chroma/i420_rgb16.c @@ -448,12 +448,6 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { @@ -526,6 +520,14 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_buffer = b_hscale ? p_buffer_start : p_pic; } } + + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ::: "memory" ); +#else + _mm_sfence(); +#endif + #else // defined (MODULE_NAME_IS_i420_rgb_mmx) if( p_vout->render.i_width & 7 ) @@ -755,12 +757,6 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { @@ -833,6 +829,14 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_buffer = b_hscale ? p_buffer_start : p_pic; } } + + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ::: "memory" ); +#else + _mm_sfence(); +#endif + #else // defined (MODULE_NAME_IS_i420_rgb_mmx) if( p_vout->render.i_width & 7 ) @@ -1179,12 +1183,6 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { @@ -1263,7 +1261,14 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, } } + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ::: "memory" ); #else + _mm_sfence(); +#endif + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) if( p_vout->render.i_width & 7 ) { @@ -1500,12 +1505,6 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { diff --git a/modules/video_chroma/i420_rgb_mmx.h b/modules/video_chroma/i420_rgb_mmx.h index 42b33d412c..85aa9094ab 100644 --- a/modules/video_chroma/i420_rgb_mmx.h +++ b/modules/video_chroma/i420_rgb_mmx.h @@ -61,7 +61,6 @@ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " #define SSE2_INIT_16_ALIGNED " \n\ -prefetcht1 (%3) # cache preload for image \n\ movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ pxor %%xmm4, %%xmm4 # zero mm4 \n\ @@ -69,11 +68,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " #define SSE2_INIT_16_UNALIGNED " \n\ -prefetcht1 (%3) # cache preload for image \n\ movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ pxor %%xmm4, %%xmm4 # zero mm4 \n\ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ " #define MMX_INTRINSICS_INIT_16 \ @@ -91,11 +90,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ xmm6 = _mm_load_si128((__m128i *)p_y); \ #define SSE2_INTRINSICS_INIT_16_UNALIGNED \ - _mm_prefetch(p_buffer, _MM_HINT_T1); \ xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ xmm4 = _mm_setzero_si128(); \ xmm6 = _mm_loadu_si128((__m128i *)p_y); \ + _mm_prefetch(p_buffer, _MM_HINT_NTA); \ #define MMX_INIT_16_GRAY " \n\ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ @@ -118,11 +117,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " #define SSE2_INIT_32_UNALIGNED " \n\ -prefetcht1 (%3) # cache preload for image \n\ movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ pxor %%xmm4, %%xmm4 # zero mm4 \n\ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ " #define MMX_INTRINSICS_INIT_32 \ @@ -141,11 +140,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ xmm6 = _mm_load_si128((__m128i *)p_y); \ #define SSE2_INTRINSICS_INIT_32_UNALIGNED \ - _mm_prefetch(p_buffer, _MM_HINT_T1); \ xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ xmm4 = _mm_setzero_si128(); \ xmm6 = _mm_loadu_si128((__m128i *)p_y); \ + _mm_prefetch(p_buffer, _MM_HINT_NTA); \ /* * Do the multiply part of the conversion for even and odd pixels, @@ -260,7 +259,7 @@ pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ #define SSE2_INTRINSICS_YUV_MUL \ xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \ xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ - xmm5 = _mm_set1_epi32(0x80808080UL); \ + xmm5 = _mm_set1_epi32(0x00800080UL); \ xmm0 = _mm_subs_epi16(xmm0, xmm5); \ xmm1 = _mm_subs_epi16(xmm1, xmm5); \ xmm0 = _mm_slli_epi16(xmm0, 3); \ @@ -1001,7 +1000,7 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ xmm5 = xmm3; \ xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ _mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \ - xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ _mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \ #define SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED \ @@ -1021,6 +1020,6 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ xmm5 = xmm3; \ xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ _mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \ - xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ _mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \