From 5e4dc54cffa6708234ea25496e9704f2895a6f3a Mon Sep 17 00:00:00 2001 From: Damien Fouilleul Date: Thu, 2 Aug 2007 11:49:49 +0000 Subject: [PATCH 1/1] chromas: more SSE2/MMX fixes, added I420_RGBA conversion --- modules/video_chroma/i420_rgb.c | 3 +- modules/video_chroma/i420_rgb.h | 1 + modules/video_chroma/i420_rgb16.c | 239 ++++++++++++++ modules/video_chroma/i420_rgb_mmx.h | 467 +++++++++++++++++++--------- modules/video_chroma/i420_yuy2.h | 36 +-- 5 files changed, 575 insertions(+), 171 deletions(-) diff --git a/modules/video_chroma/i420_rgb.c b/modules/video_chroma/i420_rgb.c index ca772275e2..e920350409 100644 --- a/modules/video_chroma/i420_rgb.c +++ b/modules/video_chroma/i420_rgb.c @@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this ) { /* R8G8B8A8 pixel format */ msg_Dbg(p_this, "RGB pixel format is R8G8B8A8"); - //p_vout->chroma.pf_convert = E_(I420_B8G8R8A8); - return -1; + p_vout->chroma.pf_convert = E_(I420_R8G8B8A8); } else if( p_vout->output.i_rmask == 0x0000ff00 && p_vout->output.i_gmask == 0x00ff0000 diff --git a/modules/video_chroma/i420_rgb.h b/modules/video_chroma/i420_rgb.h index 1ba1c0de6f..5573ccee51 100644 --- a/modules/video_chroma/i420_rgb.h +++ b/modules/video_chroma/i420_rgb.h @@ -64,6 +64,7 @@ void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * ); +void E_(I420_R8G8B8A8) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_A8B8G8R8) ( vout_thread_t *, picture_t *, picture_t * ); #endif diff --git a/modules/video_chroma/i420_rgb16.c b/modules/video_chroma/i420_rgb16.c index fc622de24f..9f21869c73 100644 --- a/modules/video_chroma/i420_rgb16.c +++ b/modules/video_chroma/i420_rgb16.c @@ -1140,6 +1140,245 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, #endif } +void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + vlc_bool_t b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_vout->render.i_width, p_vout->render.i_height, + p_vout->output.i_width, p_vout->output.i_height, + &b_hscale, &i_vscale, 
p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_vout->output.i_height : p_vout->render.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_vout->render.i_width & 15 ) + { + i_rewind = 16 - ( p_vout->render.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((int)p_y)| + ((int)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_RGBA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_RGBA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, picture_t *p_dest ) { diff --git a/modules/video_chroma/i420_rgb_mmx.h b/modules/video_chroma/i420_rgb_mmx.h index 3200a1f334..3ff27cb192 100644 --- a/modules/video_chroma/i420_rgb_mmx.h +++ b/modules/video_chroma/i420_rgb_mmx.h @@ -300,6 +300,26 @@ punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\ " +#define MMX_UNPACK_32_RGBA " \n\ +pxor %%mm3, %%mm3 # zero mm3 \n\ +movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\ +punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\ +movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\ +movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\ +punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\ +movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\ +pxor %%mm6, %%mm6 # zero mm6 \n\ +punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\ +punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\ +movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\ +punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\ +movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\ +punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\ +movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\ +" + #define MMX_UNPACK_32_BGRA " \n\ pxor %%mm3, %%mm3 # zero mm3 \n\ movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ @@ -356,15 +376,15 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\ #define MMX_END _mm_empty() #define MMX_INIT_16 \ - mm0 = _mm_cvtsi32_si64((int)*p_u); \ - mm1 = _mm_cvtsi32_si64((int)*p_v); \ + mm0 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ mm4 = _mm_setzero_si64(); \ - mm6 = (__m64)*(uint64_t *)p_y + mm6 = (__m64)*(uint64_t *)p_y; #define MMX_INIT_32 \ - mm0 = _mm_cvtsi32_si64((int)*p_u); \ + mm0 = _mm_cvtsi32_si64(*(int*)p_u); \ *(uint16_t *)p_buffer = 0; \ - mm1 = _mm_cvtsi32_si64((int)*p_v); \ + mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ mm4 = _mm_setzero_si64(); \ mm6 = (__m64)*(uint64_t *)p_y; @@ -483,6 +503,25 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\ mm0 = _mm_unpackhi_pi16(mm0, mm1); \ *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; +#define MMX_UNPACK_32_RGBA \ + mm3 = _mm_setzero_si64(); \ + mm4 = mm2; \ + mm4 = _mm_unpacklo_pi8(mm4, mm1); \ + mm3 = _mm_unpacklo_pi8(mm3, mm0); \ + mm5 = mm3; \ + mm3 = _mm_unpacklo_pi16(mm3, mm4); \ + *(uint64_t *)p_buffer = (uint64_t)mm3; \ + mm5 = _mm_unpackhi_pi16(mm5, mm4); \ + *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\ + mm6 = _mm_setzero_si64(); \ + mm2 = _mm_unpackhi_pi8(mm2, mm1); \ + mm6 = _mm_unpackhi_pi8(mm6, mm0); \ + mm0 = mm6; \ + mm6 = _mm_unpacklo_pi16(mm6, mm2); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\ + mm0 = _mm_unpackhi_pi16(mm0, mm2); \ + *(uint64_t 
*)(p_buffer + 6) = (uint64_t)mm0; + #define MMX_UNPACK_32_BGRA \ mm3 = _mm_setzero_si64(); \ mm4 = mm2; \ @@ -503,7 +542,23 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\ *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; #define MMX_UNPACK_32_ABGR \ - ; + mm3 = _mm_setzero_si64(); \ + mm4 = mm1; \ + mm4 = _mm_unpacklo_pi8(mm4, mm2); \ + mm5 = mm0; \ + mm5 = _mm_unpacklo_pi8(mm5, mm3); \ + mm6 = mm4; \ + mm4 = _mm_unpacklo_pi16(mm4, mm5); \ + *(uint64_t *)p_buffer = (uint64_t)mm4; \ + mm6 = _mm_unpackhi_pi16(mm6, mm5); \ + *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\ + mm1 = _mm_unpackhi_pi8(mm1, mm2); \ + mm0 = _mm_unpackhi_pi8(mm0, mm3); \ + mm2 = mm1; \ + mm1 = _mm_unpacklo_pi16(mm1, mm0); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\ + mm2 = _mm_unpackhi_pi16(mm2, mm0); \ + *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2; #endif @@ -795,6 +850,46 @@ punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\ " +#define SSE2_UNPACK_32_RGBA_ALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\ +punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\ +movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\ +movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\ +punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\ +movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\ +pxor %%xmm6, %%xmm6 # zero mm6 \n\ +punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\ +punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\ +movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\ +punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\ +movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 RGBA8 \n\ +punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\ +movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\ +" + +#define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\ +punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\ +movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\ +movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\ +punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\ +movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\ +pxor %%xmm6, %%xmm6 # zero mm6 \n\ +punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\ +punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\ +movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\ +punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\ +movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\ +punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\ +movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\ +" + #define SSE2_UNPACK_32_BGRA_ALIGNED " \n\ pxor %%xmm3, %%xmm3 # zero mm3 \n\ movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ @@ -881,11 +976,11 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\ #include -#define SSE2_CALL(SSE2_INSTRUCTIONS) \ - do { \ - __m128i xmm0, xmm1, xmm2, xmm3, \ - xmm4, xmm5, xmm6, xmm7; \ - SSE2_INSTRUCTIONS \ +#define SSE2_CALL(SSE2_INSTRUCTIONS) \ + do { \ + __m128i xmm0, xmm1, xmm2, xmm3, \ + xmm4, xmm5, xmm6, xmm7; \ + SSE2_INSTRUCTIONS \ } while(0) #define SSE2_END _mm_sfence() @@ -971,179 +1066,249 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\ xmm1 = 
_mm_unpacklo_epi8(xmm1, xmm4); \ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); -#define SSE2_UNPACK_15_ALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm1 = _mm_srli_epi16(xmm1, 1); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ +#define SSE2_UNPACK_15_ALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm1 = _mm_srli_epi16(xmm1, 1); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 2); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_stream_si128((__m128i*)p_buffer, xmm0); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 2); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)p_buffer, xmm0); \ \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 2); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 2); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); -#define SSE2_UNPACK_15_UNALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm1 = _mm_srli_epi16(xmm1, 1); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ +#define SSE2_UNPACK_15_UNALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm1 = _mm_srli_epi16(xmm1, 1); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 2); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 2); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 2); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 2); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ _mm_storeu_si128((__m128i*)(p_buffer+16), xmm5); -#define SSE2_UNPACK_16_ALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ +#define SSE2_UNPACK_16_ALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm4 = 
_mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 3); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_stream_si128((__m128i*)p_buffer, xmm0); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 3); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)p_buffer, xmm0); \ \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 3); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); -#define SSE2_UNPACK_16_UNALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ +#define SSE2_UNPACK_16_UNALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 3); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 3); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 3); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); -#define SSE2_UNPACK_32_ARGB_ALIGNED \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm0; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ - xmm5 = xmm1; \ - xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ - xmm6 = xmm4; \ - xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ - _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ - xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ +#define SSE2_UNPACK_32_ARGB_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm0; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm1; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \ - xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ - xmm5 = xmm0; \ - xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \ - xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); -#define SSE2_UNPACK_32_ARGB_UNALIGNED \ - xmm3 = _mm_setzero_si128(); 
\ - xmm4 = xmm0; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ - xmm5 = xmm1; \ - xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ - xmm6 = xmm4; \ - xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ - _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ - xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ +#define SSE2_UNPACK_32_ARGB_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm0; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm1; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \ - xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ - xmm5 = xmm0; \ - xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \ - xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); -#define SSE2_UNPACK_32_BGRA_ALIGNED \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm2; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm5 = xmm3; \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ - _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ - xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ +#define SSE2_UNPACK_32_RGBA_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \ - xmm6 = _mm_setzero_si128(); \ - xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ - xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ - xmm0 = xmm6; \ - xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ + xmm6 = _mm_setzero_si128(); \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \ + xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \ + xmm0 = xmm6; \ + xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \ - xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); -#define SSE2_UNPACK_32_BGRA_UNALIGNED \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm2; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm5 = xmm3; \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ - _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ - xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ +#define SSE2_UNPACK_32_RGBA_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \ - xmm6 = _mm_setzero_si128(); \ - xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ - xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ - xmm0 = xmm6; \ - xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ + xmm6 = _mm_setzero_si128(); \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \ + xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \ + xmm0 = xmm6; \ + xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \ - xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ + xmm0 = 
_mm_unpackhi_epi16(xmm0, xmm2); \ _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); -#define SSE2_UNPACK_32_ABGR_ALIGNED \ - ; +#define SSE2_UNPACK_32_BGRA_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \ + xmm6 = _mm_setzero_si128(); \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ + xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ + xmm0 = xmm6; \ + xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); + +#define SSE2_UNPACK_32_BGRA_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \ + xmm6 = _mm_setzero_si128(); \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ + xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ + xmm0 = xmm6; \ + xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); -#define SSE2_UNPACK_32_ABGR_UNALIGNED \ - ; +#define SSE2_UNPACK_32_ABGR_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm1; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \ + xmm2 = xmm1; \ + xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm1); \ + xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \ + _mm_stream_si128((__m128i*)(p_buffer+12), xmm2); + +#define SSE2_UNPACK_32_ABGR_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm1; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \ + xmm2 = xmm1; \ + xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); \ + xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \ + _mm_storeu_si128((__m128i*)(p_buffer+12), xmm2); #endif diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h index a881c6e5a8..1f35a3061f 100644 --- a/modules/video_chroma/i420_yuy2.h +++ b/modules/video_chroma/i420_yuy2.h @@ -138,56 +138,56 @@ movq %%mm1, (%1) # Store YUYV \n\ #define MMX_END _mm_empty() #define MMX_YUV420_YUYV \ - mm1 = _mm_cvtsi32_si64((int)*p_u); \ - mm2 = _mm_cvtsi32_si64((int)*p_v); \ + mm1 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm2 = _mm_cvtsi32_si64(*(int*)p_v); \ mm0 = (__m64)*(uint64_t*)p_y1; \ mm3 = (__m64)*(uint64_t*)p_y2; \ mm1 = _mm_unpacklo_pi8(mm1, mm2); \ mm2 = mm0; \ mm2 = 
_mm_unpacklo_pi8(mm2, mm1); \ - *(uin64_t)p_line1 = (uint64)mm2; \ + *(uint64_t*)p_line1 = (uint64_t)mm2; \ mm0 = _mm_unpackhi_pi8(mm0, mm1); \ - *(uin64_t)(p_line1 + 4) = (uint64)mm0; \ + *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\ mm4 = mm3; \ mm4 = _mm_unpacklo_pi8(mm4, mm1); \ - *(uin64_t)p_line2 = (uint64)mm4; \ + *(uint64_t*)p_line2 = (uint64_t)mm4; \ mm3 = _mm_unpackhi_pi8(mm3, mm1); \ - *(uin64_t)(p_line2 + 4) = (uint64)mm4; + *(uint64_t*)(p_line2+8) = (uint64_t)mm3; #define MMX_YUV420_YVYU \ - mm2 = _mm_cvtsi32_si64((int)*p_u); \ - mm1 = _mm_cvtsi32_si64((int)*p_v); \ + mm2 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ mm0 = (__m64)*(uint64_t*)p_y1; \ mm3 = (__m64)*(uint64_t*)p_y2; \ mm1 = _mm_unpacklo_pi8(mm1, mm2); \ mm2 = mm0; \ mm2 = _mm_unpacklo_pi8(mm2, mm1); \ - *(uin64_t)p_line1 = (uint64)mm2; \ + *(uint64_t*)p_line1 = (uint64_t)mm2; \ mm0 = _mm_unpackhi_pi8(mm0, mm1); \ - *(uin64_t)(p_line1 + 4) = (uint64)mm0; \ + *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\ mm4 = mm3; \ mm4 = _mm_unpacklo_pi8(mm4, mm1); \ - *(uin64_t)p_line2 = (uint64)mm4; \ + *(uint64_t*)p_line2 = (uint64_t)mm4; \ mm3 = _mm_unpackhi_pi8(mm3, mm1); \ - *(uin64_t)(p_line2 + 4) = (uint64)mm4; + *(uint64_t*)(p_line2+8) = (uint64_t)mm3; #define MMX_YUV420_UYVY \ - mm1 = _mm_cvtsi32_si64((int)*p_u); \ - mm2 = _mm_cvtsi32_si64((int)*p_v); \ + mm1 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm2 = _mm_cvtsi32_si64(*(int*)p_v); \ mm0 = (__m64)*(uint64_t*)p_y1; \ mm3 = (__m64)*(uint64_t*)p_y2; \ mm1 = _mm_unpacklo_pi8(mm1, mm2); \ mm2 = mm1; \ mm2 = _mm_unpacklo_pi8(mm2, mm0); \ - *(uin64_t)p_line1 = (uint64)mm2; \ + *(uint64_t*)p_line1 = (uint64_t)mm2; \ mm2 = mm1; \ mm2 = _mm_unpackhi_pi8(mm2, mm0); \ - *(uin64_t)(p_line1 + 4) = (uint64)mm2; \ + *(uint64_t*)(p_line1+8) = (uint64_t)mm2;\ mm4 = mm1; \ mm4 = _mm_unpacklo_pi8(mm4, mm3); \ - *(uin64_t)p_line2 = (uint64)mm4; \ + *(uint64_t*)p_line2 = (uint64_t)mm4; \ mm1 = _mm_unpackhi_pi8(mm1, mm3); \ - *(uin64_t)(p_line2 + 4) = (uint64)mm1; + *(uint64_t*)(p_line2+8) = (uint64_t)mm1; #endif -- 2.39.2
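
Note for readers of the new RGBA unpack macros above: the interleave order (a zeroed register with the B bytes, then that pair with the G/R byte pairs) is what makes each 32-bit output read 0xRRGGBBAA on a little-endian x86, i.e. the R8G8B8A8 surface selected in Activate(). The scalar sketch below shows the per-pixel equivalent; it is illustrative only, not part of the patch, and it assumes the usual mask layout for this surface (R 0xff000000, G 0x00ff0000, B 0x0000ff00), mirroring the B8G8R8A8 branch shown in the i420_rgb.c hunk.

    #include <stdint.h>

    /* Illustrative scalar equivalent of one pixel of MMX_UNPACK_32_RGBA /
     * SSE2_UNPACK_32_RGBA_*: the SIMD code interleaves a zero register with
     * the B bytes, then interleaves that result with the (G,R) byte pairs,
     * so the bytes land in memory as A(=0), B, G, R. */
    static inline void pack_r8g8b8a8(uint8_t r, uint8_t g, uint8_t b,
                                     uint8_t *p_out /* 4 bytes per pixel */)
    {
        p_out[0] = 0;   /* alpha byte, zeroed like mm3/xmm3 in the macros */
        p_out[1] = b;
        p_out[2] = g;
        p_out[3] = r;
        /* on little-endian this is the 32-bit value
         * ((uint32_t)r << 24) | (g << 16) | (b << 8), i.e. R8G8B8A8 */
    }

As a design note grounded in the code above: the aligned SSE2 paths store with movntdq (non-temporal stores), which is why the conversion ends with SSE2_END / _mm_sfence() to make those stores globally visible, while the MMX paths finish with MMX_END / _mm_empty() to release the shared FPU/MMX register state.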