From abbd6c56da04a9e10d10a4bd158104826e8fc81a Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Mon, 14 Mar 2005 13:05:57 +0000
Subject: [PATCH] SSE optimized chroma MC. patch by Radek Czyz.

git-svn-id: svn://svn.videolan.org/x264/trunk@170 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/i386/mc-a.asm | 95 ++++++++++++++++++++++++++++++++++++++++++++
 common/i386/mc.h     |  4 ++
 common/mc.c          | 25 +++++++++++-
 3 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm
index 9ee4191a..ebc68d85 100644
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -77,6 +77,8 @@ cglobal x264_mc_copy_w8_mmxext
 cglobal x264_mc_copy_w16_mmxext
 cglobal x264_mc_copy_w16_sse2
 
+cglobal x264_mc_chroma_sse
+
 ALIGN 16
 
 ;-----------------------------------------------------------------------------
@@ -392,3 +394,96 @@ ALIGN 4
     pop     esi
     pop     ebx
     ret
+
+
+SECTION .rodata
+
+ALIGN 16
+eights      times 4 dw 8                ; the constant 8 in each of four word lanes
+thirty2s    times 4 dw 32               ; rounding term added before the >>6 (32 = 64/2)
+
+SECTION .text
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_mc_chroma_sse( uint8_t *src, int i_src_stride,
+;                            uint8_t *dst, int i_dst_stride,
+;                            int dx, int dy,
+;                            int i_height, int i_width )
+;-----------------------------------------------------------------------------
+
+x264_mc_chroma_sse:                     ; bilinear blend of 4 neighbours; 4 pixels per row, one or two column passes
+
+    pxor        mm3, mm3                ; mm3 = 0, used to widen bytes to words and to pack back
+
+    pshufw      mm5, [esp+20], 0        ; mm5 - dx (low word copied to all 4 lanes)
+    pshufw      mm6, [esp+24], 0        ; mm6 - dy (low word copied to all 4 lanes)
+
+    movq        mm4, [eights]           ; mm4 = 8 in every lane
+    movq        mm0, mm4                ; mm0 = 8 in every lane
+
+    psubw       mm4, mm5                ; mm4 - 8-dx
+    psubw       mm0, mm6                ; mm0 - 8-dy
+
+    movq        mm7, mm5                ; keep a copy of dx for cD
+    pmullw      mm5, mm0                ; mm5 = dx*(8-dy) = cB
+    pmullw      mm7, mm6                ; mm7 = dx*dy = cD
+    pmullw      mm6, mm4                ; mm6 = (8-dx)*dy = cC
+    pmullw      mm4, mm0                ; mm4 = (8-dx)*(8-dy) = cA
+
+    push        edi                     ; from here on the arguments sit at [esp+4+N]
+
+    mov         eax, [esp+4+4]          ; src
+    mov         edi, [esp+4+12]         ; dst
+    mov         ecx, [esp+4+8]          ; i_src_stride
+    mov         edx, [esp+4+28]         ; i_height
+
+ALIGN 4
+.height_loop                            ; one output row of 4 pixels per iteration
+
+    movd        mm1, [eax+ecx]          ; 4 pixels of the next source row
+    movd        mm0, [eax]              ; 4 pixels of the current source row
+    punpcklbw   mm1, mm3                ; 00 px1 | 00 px2 | 00 px3 | 00 px4
+    punpcklbw   mm0, mm3                ; same widening for the current row
+    pmullw      mm1, mm6                ; 2nd line * cC
+    pmullw      mm0, mm4                ; 1st line * cA
+
+    paddw       mm0, mm1                ; mm0 <- result
+
+    movd        mm2, [eax+1]            ; current row, one pixel to the right
+    movd        mm1, [eax+ecx+1]        ; next row, one pixel to the right
+    punpcklbw   mm2, mm3
+    punpcklbw   mm1, mm3
+
+    paddw       mm0, [thirty2s]         ; + 32 for rounding
+
+    pmullw      mm2, mm5                ; line * cB
+    pmullw      mm1, mm7                ; line * cD
+    paddw       mm0, mm2
+    paddw       mm0, mm1
+
+    psrlw       mm0, 6                  ; / 64: cA+cB+cC+cD always sums to 64
+    packuswb    mm0, mm3                ; 00 00 00 00 px1 px2 px3 px4
+    movd        [edi], mm0              ; store 4 output pixels
+
+    add         eax, ecx                ; src += i_src_stride
+    add         edi, [esp+4+16]         ; dst += i_dst_stride
+
+    dec         edx
+    jnz         .height_loop
+
+    mov         eax, [esp+4+32]         ; i_width
+    sub         eax, 8
+    jnz         .finish                 ; width != 8 so assume 4
+
+    mov         [esp+4+32], eax         ; eax is 0 here: zero the i_width slot so the second pass ends at .finish
+    mov         edi, [esp+4+12]         ; dst
+    mov         eax, [esp+4+4]          ; src
+    mov         edx, [esp+4+28]         ; i_height
+    add         edi, 4                  ; second pass writes columns 4..7
+    add         eax, 4                  ; second pass reads from column 4 onwards
+    jmp         .height_loop
+
+.finish
+    pop         edi
+    ret                                 ; NOTE(review): no emms before returning - presumably the caller clears MMX state; confirm
diff --git a/common/i386/mc.h b/common/i386/mc.h
index a07c8dc8..69766167 100644
--- a/common/i386/mc.h
+++ b/common/i386/mc.h
@@ -27,4 +27,8 @@
 void x264_mc_mmxext_init( x264_mc_functions_t *pf );
 void x264_mc_sse2_init( x264_mc_functions_t *pf );
 
+void x264_mc_chroma_sse( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride,
+                         int dx, int dy,
+                         int i_height, int i_width ); /* asm treats any i_width other than 8 as 4 */
 #endif
diff --git a/common/mc.c b/common/mc.c
index ea085daa..dbe05a39 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -379,6 +379,27 @@ static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
     }
 }
 
+#ifdef HAVE_MMXEXT
+static void motion_compensation_chroma_sse( uint8_t *src, int i_src_stride,
+                                            uint8_t *dst, int i_dst_stride,
+                                            int mvx, int mvy,
+                                            int i_width, int i_height )
+{   /* chroma MC front end: splits the MV and hands widths other than 2 to the asm routine */
+    if (i_width == 2) { /* the asm routine only produces 4 or 8 columns, so width 2 stays in C */
+        motion_compensation_chroma(src, i_src_stride, dst, i_dst_stride,
+                                   mvx, mvy, i_width, i_height);
+    } else {
+        const int d8x = mvx&0x07; /* fractional part of the MV, in 1/8 pixel */
+        const int d8y = mvy&0x07;
+
+        src += (mvy >> 3) * i_src_stride + (mvx >> 3); /* whole-pixel part of the MV */
+
+        x264_mc_chroma_sse(src, i_src_stride, dst, i_dst_stride,
+                           d8x, d8y, i_height, i_width); /* note: the asm takes height before width */
+    }
+}
+#endif
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 {
     pf->mc_luma   = mc_luma;
@@ -386,8 +407,10 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
     pf->mc_chroma = motion_compensation_chroma;
 
 #ifdef HAVE_MMXEXT
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMXEXT ) {
         x264_mc_mmxext_init( pf );
+        pf->mc_chroma = motion_compensation_chroma_sse; /* replaces the C default assigned above */
+    }
 #endif
 #ifdef HAVE_SSE2
     if( cpu&X264_CPU_SSE2 )
-- 
2.39.2