;-----------------------------------------------------------------------------
x264_mc_chroma_mmxext:
+ mov r10d, parm6d ; dy
+ mov r11d, parm5d ; dx
+ sar r10d, 3 ; dy>>3
+ sar r11d, 3 ; dx>>3
+ imul r10d, parm2d ; (dy>>3) * i_src_stride
+ pxor mm3, mm3 ; mm3 = 0
+ add r10d, r11d ; (dx>>3) + (dy>>3) * i_src_stride
+ movsxd r10, r10d
+ add parm1q, r10 ; src += (dx>>3) + (dy>>3) * src_stride
+ and parm5d, 7 ; dx &= 7
+ je .mc1d ; dx fraction is 0: 1-D vertical filter
+ and parm6d, 7 ; dy &= 7
+ je .mc1d ; dy fraction is 0: 1-D horizontal filter
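+ ; In C terms, the prologue above folds in what the deleted
+ ; motion_compensation_chroma_mmxext wrapper used to do (a sketch,
+ ; using that wrapper's argument names):
+ ;   src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ ;   dx = mvx & 7;
+ ;   dy = mvy & 7;
+ ;   if( !dx || !dy ) /* a zero fraction: take the 1-D path */
+ ;       goto mc1d;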
+
movd mm0, parm5d
movd mm1, parm6d
- pxor mm3, mm3
-
- pshufw mm5, mm0, 0 ; mm5 - dx
- pshufw mm6, mm1, 0 ; mm6 - dy
+ pshufw mm5, mm0, 0 ; mm5 = dx
+ pshufw mm6, mm1, 0 ; mm6 = dy
movq mm4, [pw_8 GLOBAL]
movq mm0, mm4
- psubw mm4, mm5 ; mm4 - 8-dx
- psubw mm0, mm6 ; mm0 - 8-dy
+ psubw mm4, mm5 ; mm4 = 8-dx
+ psubw mm0, mm6 ; mm0 = 8-dy
movq mm7, mm5
pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB
pmullw mm1, mm7 ; line * cD
paddw mm0, mm2
paddw mm0, mm1
-
psrlw mm0, 6
+
+%macro HEIGHT_LOOP_END 1
packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
movd [r10], mm0
add rax, parm2q ; i_src_stride
add r10, parm4q ; i_dst_stride
dec r11d
- jnz .height_loop
+ jnz %1
sub parm7d, 8
jnz .finish ; width != 8 so assume 4
mov r11d, parm8d ; i_height
add r10, 4
add rax, 4
- jmp .height_loop
+ jmp %1
+%endmacro
+HEIGHT_LOOP_END .height_loop
.finish
ret
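+ ; Note the width handling in HEIGHT_LOOP_END: movd stores only 4 bytes,
+ ; so a width-8 block is done as two 4-pixel column strips: after the
+ ; first strip, reload the row counter (r11d = i_height), advance src
+ ; and dst by 4, and loop again; width 4 exits directly via .finish.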
+
+ALIGN 4
+.mc1d
+%ifdef WIN64
+%define pel_offset rsi
+%else
+%define pel_offset r9
+%endif
+ mov eax, parm5d
+ or eax, parm6d
+ and eax, 7 ; eax = the remaining nonzero fraction, (dx|dy)&7
+ cmp parm5d, 0
+ mov pel_offset, 1
+ cmove pel_offset, parm2q ; pel_offset = dx ? 1 : src_stride
+ movd mm6, eax ; mm6 = d, the nonzero fraction
+ movq mm5, [pw_8 GLOBAL]
+ pshufw mm6, mm6, 0
+ movq mm7, [pw_4 GLOBAL] ; rounding constant
+ psubw mm5, mm6 ; mm5 = 8-d
+
+ mov rax, parm1q ; src
+ mov r10, parm3q ; dst
+ mov r11d, parm8d ; i_height
+ALIGN 4
+.height_loop1
+ movd mm0, [rax+pel_offset]
+ movd mm1, [rax]
+ punpcklbw mm0, mm3
+ punpcklbw mm1, mm3
+ pmullw mm0, mm6 ; neighbor * d
+ pmullw mm1, mm5 ; line * (8-d)
+ paddw mm0, mm7 ; + 4 for rounding
+ paddw mm0, mm1
+ psrlw mm0, 3
+HEIGHT_LOOP_END .height_loop1
+ nop
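+ ; For reference, .mc1d is the 1-D degenerate case of the bilinear
+ ; filter: with one fraction zero, two of the four taps vanish and the
+ ; weights sum to 8 instead of 64. A C sketch (not part of this patch;
+ ; d = the nonzero fraction, pel = (dx&7) ? 1 : i_src_stride):
+ ;   for( y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
+ ;       for( x = 0; x < i_width; x++ )
+ ;           dst[x] = ( (8-d) * src[x] + d * src[x+pel] + 4 ) >> 3;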
;-----------------------------------------------------------------------------
x264_mc_chroma_mmxext:
-
picpush ebx
picgetgot ebx
+ push edi
+
+ mov ecx, [picesp+4+24] ; dy
+ mov edx, [picesp+4+20] ; dx
+ mov eax, ecx
+ mov edi, edx
+ sar ecx, 3 ; dy>>3
+ sar edx, 3 ; dx>>3
+ imul ecx, [picesp+4+8] ; (dy>>3) * i_src_stride
+ add ecx, edx ; + (dx>>3)
+ add [picesp+4+4], ecx ; src += (dx>>3) + (dy>>3) * src_stride
pxor mm3, mm3
- pshufw mm5, [picesp+20], 0 ; mm5 = dx
- pshufw mm6, [picesp+24], 0 ; mm6 = dy
+ and edi, 7
+ and eax, 7
+ movd mm5, edi
+ movd mm6, eax
+ pshufw mm5, mm5, 0 ; mm5 = dx&7
+ pshufw mm6, mm6, 0 ; mm6 = dy&7
movq mm4, [pw_8 GOT_ebx]
movq mm0, mm4
pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
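+ ; The four weights sum to 64:
+ ; cA+cB+cC+cD = (8-dx)(8-dy) + dx(8-dy) + (8-dx)dy + dx*dy = 64,
+ ; hence the 2-D loop rounds with +32 and shifts right by 6
+ ; (cf. the mc_chroma_c reference version removed below).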
- push edi
-
mov eax, [picesp+4+4] ; src
mov edi, [picesp+4+12] ; dst
mov ecx, [picesp+4+8] ; i_src_stride
}
}
-#ifdef HAVE_MMXEXT
-static void motion_compensation_chroma_mmxext( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- if (i_width == 2) {
- motion_compensation_chroma(src, i_src_stride, dst, i_dst_stride,
- mvx, mvy, i_width, i_height);
- } else {
- const int d8x = mvx&0x07;
- const int d8y = mvy&0x07;
-
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
-
- x264_mc_chroma_mmxext( src, i_src_stride, dst, i_dst_stride,
- d8x, d8y, i_width, i_height );
- }
-}
-#endif
-
#define MC_COPY(W) \
static void mc_copy_w##W( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_height ) \
{ \
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT ) {
x264_mc_mmxext_init( pf );
- pf->mc_chroma = motion_compensation_chroma_mmxext;
+ pf->mc_chroma = x264_mc_chroma_mmxext;
}
#endif
#ifdef HAVE_SSE2
int mvx, int mvy,
int i_width, int i_height );
+ /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
+ * so it must be run from left to right. */
void (*mc_chroma)(uint8_t *, int, uint8_t *, int,
int mvx, int mvy,
int i_width, int i_height );
}
}
-static void mc_chroma_c( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- uint8_t *srcp;
- int x, y;
- int d8x = mvx & 0x07;
- int d8y = mvy & 0x07;
-
- DECLARE_ALIGNED( uint16_t, coeff[4], 16 );
- coeff[0] = (8-d8x)*(8-d8y);
- coeff[1] = d8x *(8-d8y);
- coeff[2] = (8-d8x)*d8y;
- coeff[3] = d8x *d8y;
-
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
- srcp = &src[i_src_stride];
-
- /* TODO: optimize */
- for( y = 0; y < i_height; y++ )
- {
- for( x = 0; x < i_width; x++ )
- {
- dst[x] = ( coeff[0]*src[x] + coeff[1]*src[x+1] +
- coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6;
- }
- dst += i_dst_stride;
-
- src = srcp;
- srcp += i_src_stride;
- }
-}
-
#define DO_PROCESS(a) \
src##a##v_16 = vec_u8_to_u16( src##a##v_8 ); \
src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); \
{
mc_chroma_altivec_8xh( src, i_src_stride, dst, i_dst_stride,
mvx, mvy, i_height );
- return;
}
- if( i_width == 4 )
+ else
{
mc_chroma_altivec_4xh( src, i_src_stride, dst, i_dst_stride,
mvx, mvy, i_height );
- return;
}
-
- mc_chroma_c( src, i_src_stride, dst, i_dst_stride,
- mvx, mvy, i_width, i_height );
}
void x264_mc_altivec_init( x264_mc_functions_t *pf )
uint8_t *dst1 = &buf3[2*32+2];
uint8_t *dst2 = &buf4[2*32+2];
- int dx, dy, i, w;
+ int dx, dy, i, j, w;
int ret = 0, ok, used_asm;
x264_mc_init( 0, &mc_c );
memset(buf4, 0xCD, 1024); \
mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h ); \
mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h ); \
+ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
+ for( j=0; j<h; j++ ) \
+ for( i=w; i<4; i++ ) \
+ dst2[i+j*16] = dst1[i+j*16]; \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
report( "mc luma :" );
ok = 1; used_asm = 0;
- for( dy = 0; dy < 9; dy++ )
- for( dx = 0; dx < 9; dx++ )
+ for( dy = -1; dy < 9; dy++ )
+ for( dx = -1; dx < 9; dx++ )
{
MC_TEST_CHROMA( 8, 8 );
MC_TEST_CHROMA( 8, 4 );
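+        /* dx,dy = -1 exercises negative fractional mvs: the asm's sar/and
+         * split rounds toward -inf (a sketch of the arithmetic, assuming
+         * the two's-complement >> behavior x264 relies on):
+         *   mvx = -1  =>  mvx>>3 == -1, mvx&7 == 7   (-1 == 8*(-1) + 7)
+         * so src-1 weighted 7/8 lands exactly 1/8 pel left of src. */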