Modifying mvc in-place caused significantly worse compression. Preset-wise, this only affected veryfast.
Fixed by writing the rounded, clipped predictors to a separate buffer instead of modifying mvc in-place.
return amvd0 + (amvd1<<8);
}
/* Portable C round-and-clip of motion vector candidates.
 * (Diff form: '-' lines are the old in-place version, '+' lines the new one.)
 *
 * For each of the i_mvc entries in mvc, computes (v + 2) >> 2 on both
 * components, then passes x through x264_clip3 with [mv_x_min, mv_x_max] and
 * y with [mv_y_min, mv_y_max] (presumably a clamp) and stores the pair.
 * The new version stores into dst and leaves mvc untouched; the old version
 * overwrote mvc itself. */
-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
for( int i = 0; i < i_mvc; i++ )
{
/* The +2 bias makes the >>2 round instead of simply dropping the low bits. */
int mx = (mvc[i][0] + 2) >> 2;
int my = (mvc[i][1] + 2) >> 2;
/* Old: result written back over the input. New: result written to dst. */
- mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
- mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
+ dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
+ dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
}
}
}
/* Route the generic name to the MMXEXT inline-asm implementation below. */
#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
/* MMXEXT round-and-clip of motion vector candidates.
 * (Diff form: '-' lines are the old in-place version, '+' lines the new one
 * that takes a separate dst. Parts of the asm body are not shown in this hunk,
 * so only the visible instructions are described.) */
-static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
/* Pack each (x, y) bound pair into one 32-bit word via pack16to32_mask. */
uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
"punpckldq %%mm6, %%mm6 \n"
/* If the count in %0 is odd, handle one 4-byte entry (index %0-1) with movd
 * first, decrement, and skip the loop if nothing is left. */
"test $1, %0 \n"
"jz 1f \n"
/* Loads now come from operand %6 (mvc); %5 is dst after this change. */
- "movd -4(%5,%0,4), %%mm0 \n"
+ "movd -4(%6,%0,4), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pmaxsw %%mm5, %%mm0 \n"
"dec %0 \n"
"jz 2f \n"
/* Main loop: two entries (8 bytes) per iteration, walking from the end
 * of the array towards the start as %0 counts down by 2. */
"1: \n"
- "movq -8(%5,%0,4), %%mm0 \n"
+ "movq -8(%6,%0,4), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pmaxsw %%mm5, %%mm0 \n"
"sub $2, %0 \n"
"jnz 1b \n"
"2: \n"
/* Operand numbering after the change:
 *   %0 = i (in/out), %1 = M64(dst) (memory output),
 *   %2 = mv_min, %3 = mv_max, %4 = pw_2, %5 = dst, %6 = mvc,
 *   %7 = M64(mvc) (memory input).
 * Before the change mvc was the only buffer: %1 = M64(mvc) as "+m", %5 = mvc. */
- :"+r"(i), "+m"(M64( mvc ))
- :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc)
+ :"+r"(i), "=m"(M64( dst ))
+ :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc ))
);
}
pmv = pack16to32_mask( bmx, bmy );
if( i_mvc > 0 )
{
- x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
+ /* Round/clip into a scratch buffer so mvc itself is left unmodified. */
+ ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16][2] );
+ x264_predictor_roundclip( mvc_fpel, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
bcost <<= 4;
for( int i = 1; i <= i_mvc; i++ )
{
- if( M32( mvc[i-1] ) && (pmv != M32( mvc[i-1] )) )
+ /* Skip candidates that are zero or equal to pmv. The old code made both
+  * tests against the rounded/clipped values (mvc had been rewritten in
+  * place), so both must now read mvc_fpel; comparing pmv against the
+  * unrounded mvc would change which candidates are filtered. */
+ if( M32( mvc_fpel[i-1] ) && (pmv != M32( mvc_fpel[i-1] )) )
{
- int mx = mvc[i-1][0];
- int my = mvc[i-1][1];
+ int mx = mvc_fpel[i-1][0];
+ int my = mvc_fpel[i-1][1];
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
+ /* Keep the 1-based candidate index in the low 4 bits so the winner can be recovered from bcost. */
cost = (cost << 4) + i;
COPY1_IF_LT( bcost, cost );
}
if( bcost&15 )
{
- bmx = mvc[(bcost&15)-1][0];
- bmy = mvc[(bcost&15)-1][1];
+ bmx = mvc_fpel[(bcost&15)-1][0];
+ bmy = mvc_fpel[(bcost&15)-1][1];
}
bcost >>= 4;
}