Branchlessly handle elimination of candidates in MMX roundclip asm.
Add a new asm function, similar to roundclip, except without the round part.
Optimize and organize the C code, and make both subme>=3 and subme<3 consistent.
Add lots of explanatory comments and try to make things a little more understandable.
~5-10% faster with subme>=3, ~15-20% faster with subme<3.
return amvd0 + (amvd1<<8);
}
-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
-{
- for( int i = 0; i < i_mvc; i++ )
- {
- int mx = (mvc[i][0] + 2) >> 2;
- int my = (mvc[i][1] + 2) >> 2;
- dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
- dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
- }
-}
-
extern const uint8_t x264_exp2_lut[64];
extern const float x264_log2_lut[128];
extern const float x264_log2_lz_lut[32];
int mv_miny_spel_row[3];
int mv_maxy_spel_row[3];
/* Fullpel MV range for motion search */
- int mv_min_fpel[2];
- int mv_max_fpel[2];
+ ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
int mv_miny_fpel_row[3];
int mv_maxy_fpel_row[3];
// included at the end because it needs x264_t
#include "macroblock.h"
+/* Round subpel MV candidates to fullpel, clip them to the fullpel search
+ * range, and drop candidates equal to 0 or to pmv (the caller costs the zero
+ * vector and the predicted MV separately). mv_limit is {min_x, min_y},
+ * {max_x, max_y} in fullpel units. Returns the number of MVs kept in dst. */
+static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+ int cnt = 0;
+ for( int i = 0; i < i_mvc; i++ )
+ {
+ int mx = (mvc[i][0] + 2) >> 2; /* round to nearest fullpel */
+ int my = (mvc[i][1] + 2) >> 2;
+ uint32_t mv = pack16to32_mask(mx, my);
+ if( !mv || mv == pmv ) continue; /* skip zero MV and the predicted MV */
+ dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] );
+ dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] );
+ cnt++;
+ }
+ return cnt;
+}
+
+/* Same as x264_predictor_roundclip, but without the fullpel rounding: clip
+ * subpel MV candidates to the search range and drop candidates equal to 0 or
+ * to pmv (both costed separately by the caller). mv_limit is in fullpel
+ * units and is scaled to subpel here. Returns the number of MVs kept. */
+static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+ int cnt = 0;
+ /* Convert the fullpel limits to subpel (qpel) for clipping. */
+ int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2};
+ for( int i = 0; i < i_mvc; i++ )
+ {
+ uint32_t mv = M32( mvc[i] );
+ int mx = mvc[i][0];
+ int my = mvc[i][1];
+ if( !mv || mv == pmv ) continue; /* skip zero MV and the predicted MV */
+ dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
+ dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
+ cnt++;
+ }
+ return cnt;
+}
+
#if ARCH_X86 || ARCH_X86_64
#include "x86/util.h"
#endif
return amvd;
}
+#define x264_predictor_clip x264_predictor_clip_mmx2
+/* MMX2 version of x264_predictor_clip: clips two MVs per iteration and
+ * branchlessly eliminates candidates equal to 0 or pmv. A skipped MV is
+ * shifted out of the store (psrlq) and the output index simply isn't
+ * advanced for it, so the next store overwrites any garbage.
+ * Returns the number of valid MVs written to dst. */
+static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+ static const uint32_t pd_32 = 0x20;
+ intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
+
+ asm(
+ "movq (%2), %%mm5 \n" // mm5 = {min_x, min_y, max_x, max_y}
+ "movd %6, %%mm3 \n" // mm3 = pmv
+ "psllw $2, %%mm5 \n" // Convert to subpel
+ "pshufw $0xEE, %%mm5, %%mm6 \n" // mm6 = {max_x, max_y, max_x, max_y}
+ "dec %k3 \n"
+ "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration}
+ "punpckldq %%mm3, %%mm3 \n" // mm3 = {pmv, pmv}
+ "punpckldq %%mm5, %%mm5 \n" // mm5 = {min_x, min_y, min_x, min_y}
+ "movd %7, %%mm4 \n" // mm4 = 32 (shift count used to drop mv0)
+ "lea (%0,%3,4), %3 \n" // %3 = &mvc[i_mvc-1] (loop end pointer)
+ "1: \n"
+ "movq (%0), %%mm0 \n" // load two MV candidates
+ "add $8, %0 \n"
+ "movq %%mm3, %%mm1 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv
+ "pcmpeqd %%mm0, %%mm2 \n" // mv == 0
+ "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
+ "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf
+ "pmaxsw %%mm5, %%mm0 \n" // clip to the minimum
+ "pminsw %%mm6, %%mm0 \n" // clip to the maximum
+ "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
+ "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped
+ "movq %%mm0, (%5,%4,4) \n" // store both; garbage is overwritten later
+ "and $24, %k2 \n"
+ "add $2, %4 \n"
+ "add $8, %k2 \n"
+ "shr $4, %k2 \n" // (4-val)>>1
+ "sub %2, %4 \n" // +1 for each valid motion vector
+ "cmp %3, %0 \n" // compare against &mvc[i_mvc-1]
+ "jl 1b \n"
+ "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration}
+
+ /* Do the last iteration */
+ "2: \n"
+ "movd (%0), %%mm0 \n" // load the single remaining MV
+ "pxor %%mm2, %%mm2 \n"
+ "pcmpeqd %%mm0, %%mm3 \n" // mv == pmv
+ "pcmpeqd %%mm0, %%mm2 \n" // mv == 0
+ "por %%mm3, %%mm2 \n"
+ "pmovmskb %%mm2, %k2 \n"
+ "pmaxsw %%mm5, %%mm0 \n" // clip to the minimum
+ "pminsw %%mm6, %%mm0 \n" // clip to the maximum
+ "movd %%mm0, (%5,%4,4) \n"
+ "inc %4 \n"
+ "and $1, %k2 \n"
+ "sub %2, %4 \n" // output += !(mv == pmv || mv == 0)
+ "3: \n"
+ :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
+ :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
+ );
+ return i; // count of valid MVs written to dst
+}
+
+/* Same as the above, except we do (mv + 2) >> 2 on the input. */
#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
-static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
- uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
- uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
static const uint64_t pw_2 = 0x0002000200020002ULL;
- intptr_t i = i_mvc;
+ static const uint32_t pd_32 = 0x20;
+ intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
+
asm(
- "movd %2, %%mm5 \n"
- "movd %3, %%mm6 \n"
- "movq %4, %%mm7 \n"
- "punpckldq %%mm5, %%mm5 \n"
- "punpckldq %%mm6, %%mm6 \n"
- "test $1, %0 \n"
- "jz 1f \n"
- "movd -4(%6,%0,4), %%mm0 \n"
- "paddw %%mm7, %%mm0 \n"
- "psraw $2, %%mm0 \n"
- "pmaxsw %%mm5, %%mm0 \n"
- "pminsw %%mm6, %%mm0 \n"
- "movd %%mm0, -4(%5,%0,4) \n"
- "dec %0 \n"
- "jz 2f \n"
- "1: \n"
- "movq -8(%6,%0,4), %%mm0 \n"
- "paddw %%mm7, %%mm0 \n"
- "psraw $2, %%mm0 \n"
- "pmaxsw %%mm5, %%mm0 \n"
- "pminsw %%mm6, %%mm0 \n"
- "movq %%mm0, -8(%5,%0,4) \n"
- "sub $2, %0 \n"
- "jnz 1b \n"
- "2: \n"
- :"+r"(i), "=m"(M64( dst ))
- :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc ))
+ "movq (%2), %%mm5 \n" // mm5 = {min_x, min_y, max_x, max_y} (fullpel)
+ "movq %6, %%mm7 \n" // mm7 = {2,2,2,2}, the rounding constant
+ "movd %7, %%mm3 \n" // mm3 = pmv (fullpel-packed)
+ "pshufw $0xEE, %%mm5, %%mm6 \n" // mm6 = {max_x, max_y, max_x, max_y}
+ "dec %k3 \n"
+ "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration}
+ "punpckldq %%mm3, %%mm3 \n" // mm3 = {pmv, pmv}
+ "punpckldq %%mm5, %%mm5 \n" // mm5 = {min_x, min_y, min_x, min_y}
+ "movd %8, %%mm4 \n" // mm4 = 32 (shift count used to drop mv0)
+ "lea (%0,%3,4), %3 \n" // %3 = &mvc[i_mvc-1] (loop end pointer)
+ "1: \n"
+ "movq (%0), %%mm0 \n" // load two subpel MV candidates
+ "add $8, %0 \n"
+ "paddw %%mm7, %%mm0 \n" // round ...
+ "psraw $2, %%mm0 \n" // ... to fullpel
+ "movq %%mm3, %%mm1 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv
+ "pcmpeqd %%mm0, %%mm2 \n" // mv == 0
+ "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
+ "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf
+ "pmaxsw %%mm5, %%mm0 \n" // clip to the minimum
+ "pminsw %%mm6, %%mm0 \n" // clip to the maximum
+ "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
+ "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped
+ "movq %%mm0, (%5,%4,4) \n" // store both; garbage is overwritten later
+ "and $24, %k2 \n"
+ "add $2, %4 \n"
+ "add $8, %k2 \n"
+ "shr $4, %k2 \n" // (4-val)>>1
+ "sub %2, %4 \n" // +1 for each valid motion vector
+ "cmp %3, %0 \n" // compare against &mvc[i_mvc-1]
+ "jl 1b \n"
+ "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration}
+
+ /* Do the last iteration */
+ "2: \n"
+ "movd (%0), %%mm0 \n" // load the single remaining MV
+ "paddw %%mm7, %%mm0 \n" // round ...
+ "psraw $2, %%mm0 \n" // ... to fullpel
+ "pxor %%mm2, %%mm2 \n"
+ "pcmpeqd %%mm0, %%mm3 \n" // mv == pmv
+ "pcmpeqd %%mm0, %%mm2 \n" // mv == 0
+ "por %%mm3, %%mm2 \n"
+ "pmovmskb %%mm2, %k2 \n"
+ "pmaxsw %%mm5, %%mm0 \n" // clip to the minimum
+ "pminsw %%mm6, %%mm0 \n" // clip to the maximum
+ "movd %%mm0, (%5,%4,4) \n"
+ "inc %4 \n"
+ "and $1, %k2 \n"
+ "sub %2, %4 \n" // output += !(mv == pmv || mv == 0)
+ "3: \n"
+ :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
+ :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
);
+ return i; // count of valid MVs written to dst
}
#endif
if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
}
- h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
- h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
+ h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
+ h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
{
int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
- h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
- h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
+ h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
+ h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
}
if( PARAM_INTERLACED )
h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
- h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i];
- h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i];
+ h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
+ h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
}
#undef CLIP_FMV
(p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
#define COST_MV( mx, my )\
+do\
{\
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
&p_fref_w[(my)*stride+(mx)], stride )\
+ BITS_MVD(mx,my);\
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
-}
+} while(0)
-#define COST_MV_HPEL( mx, my ) \
-{ \
- intptr_t stride2 = 16; \
- pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
- int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
- + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
- COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
-}
+#define COST_MV_HPEL( mx, my, cost )\
+do\
+{\
+ intptr_t stride2 = 16;\
+ pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\
+ cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\
+ + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\
+} while(0)
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
}\
}
+#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */
+#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */
+
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int i_pixel = m->i_pixel;
const int stride = m->i_stride[0];
int i_me_range = h->param.analyse.i_me_range;
- int bmx, bmy, bcost;
+ int bmx, bmy, bcost = COST_MAX;
int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
pixel *p_fenc = m->p_fenc[0];
pixel *p_fref_w = m->p_fref_w;
ALIGNED_ARRAY_16( pixel, pix,[16*16] );
+ ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
int costs[16];
- int mv_x_min = h->mb.mv_min_fpel[0];
- int mv_y_min = h->mb.mv_min_fpel[1];
- int mv_x_max = h->mb.mv_max_fpel[0];
- int mv_y_max = h->mb.mv_max_fpel[1];
- int mv_x_min_qpel = mv_x_min << 2;
- int mv_y_min_qpel = mv_y_min << 2;
- int mv_x_max_qpel = mv_x_max << 2;
- int mv_y_max_qpel = mv_y_max << 2;
+ int mv_x_min = h->mb.mv_limit_fpel[0][0];
+ int mv_y_min = h->mb.mv_limit_fpel[0][1];
+ int mv_x_max = h->mb.mv_limit_fpel[1][0];
+ int mv_y_max = h->mb.mv_limit_fpel[1][1];
/* Special version of pack to allow shortcuts in CHECK_MVRANGE */
#define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF))
uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min );
uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000;
+ uint32_t pmv;
#define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000))
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
- uint32_t pmv;
- bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel );
- bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel );
- pmx = ( bmx + 2 ) >> 2;
- pmy = ( bmy + 2 ) >> 2;
- bcost = COST_MAX;
-
- /* try extra predictors if provided */
+ /* Try extra predictors if provided. If subme >= 3, check subpel predictors,
+ * otherwise round them to fullpel. */
if( h->mb.i_subpel_refine >= 3 )
{
- pmv = pack16to32_mask(bmx,bmy);
- COST_MV_HPEL( bmx, bmy );
- for( int i = 0; i < i_mvc; i++ )
+ /* Calculate and check the MVP first */
+ bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) );
+ bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) );
+ pmv = pack16to32_mask( bpred_mx, bpred_my );
+ pmx = FPEL( bpred_mx );
+ pmy = FPEL( bpred_my );
+
+ COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost );
+ int pmv_cost = bpred_cost;
+
+ if( i_mvc > 0 )
{
- if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) )
+ /* Clip MV candidates and eliminate those equal to zero and pmv. */
+ int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
+ if( valid_mvcs > 0 )
{
- int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel );
- int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel );
- COST_MV_HPEL( mx, my );
+ int i = 1, cost;
+ /* We stuff pmv here to branchlessly pick between pmv and the various
+ * MV candidates. [0] gets skipped in order to maintain alignment for
+ * x264_predictor_clip. */
+ M32( mvc_temp[1] ) = pmv;
+ bpred_cost <<= 4;
+ do
+ {
+ int mx = mvc_temp[i+1][0];
+ int my = mvc_temp[i+1][1];
+ COST_MV_HPEL( mx, my, cost );
+ COPY1_IF_LT( bpred_cost, (cost << 4) + i );
+ } while( ++i <= valid_mvcs );
+ bpred_mx = mvc_temp[(bpred_cost&15)+1][0];
+ bpred_my = mvc_temp[(bpred_cost&15)+1][1];
+ bpred_cost >>= 4;
}
}
- bmx = ( bpred_mx + 2 ) >> 2;
- bmy = ( bpred_my + 2 ) >> 2;
- COST_MV( bmx, bmy );
+
+ /* Round the best predictor back to fullpel and get the cost, since this is where
+ * we'll be starting the fullpel motion search. */
+ bmx = FPEL( bpred_mx );
+ bmy = FPEL( bpred_my );
+ if( (bpred_mx|bpred_my)&0x3 ) /* Only test if the tested predictor is actually subpel... */
+ COST_MV( bmx, bmy );
+ else /* Otherwise just copy the cost (we already know it) */
+ bcost = bpred_cost;
+
+ /* Test the zero vector if it hasn't been tested yet. */
+ if( pmv )
+ {
+ if( bmx|bmy ) COST_MV( 0, 0 );
+ }
+ /* If a subpel mv candidate was better than the zero vector, the previous
+ * fullpel check won't have gotten it even if the pmv was zero. So handle
+ * that possibility here. */
+ else
+ {
+ COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 );
+ }
}
else
{
- /* check the MVP */
- bmx = pmx;
- bmy = pmy;
+ /* Calculate and check the fullpel MVP first */
+ bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max );
+ bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max );
+ pmv = pack16to32_mask( bmx, bmy );
+
/* Because we are rounding the predicted motion vector to fullpel, there will be
* an extra MV cost in 15 out of 16 cases. However, when the predicted MV is
* chosen as the best predictor, it is often the case that the subpel search will
- * result in a vector at or next to the predicted motion vector. Therefore, it is
- * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
- * biasing against use of the predicted motion vector. */
+ * result in a vector at or next to the predicted motion vector. Therefore, we omit
+ * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of
+ * the predicted motion vector.
+ *
+ * Disclaimer: this is a post-hoc rationalization for why this hack works. */
bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
- pmv = pack16to32_mask( bmx, bmy );
+
if( i_mvc > 0 )
{
- ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] );
- x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
- M32( mvc_fpel[1] ) = pmv;
- bcost <<= 4;
- for( int i = 1; i <= i_mvc; i++ )
+ /* Like in subme>=3, except we also round the candidates to fullpel. */
+ int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
+ if( valid_mvcs > 0 )
{
- if( M32( mvc_fpel[i+1] ) && (pmv != M32( mvc_fpel[i+1] )) )
+ int i = 1, cost;
+ M32( mvc_temp[1] ) = pmv;
+ bcost <<= 4;
+ do
{
- int mx = mvc_fpel[i+1][0];
- int my = mvc_fpel[i+1][1];
- int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
- cost = (cost << 4) + i;
- COPY1_IF_LT( bcost, cost );
- }
+ int mx = mvc_temp[i+1][0];
+ int my = mvc_temp[i+1][1];
+ cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
+ COPY1_IF_LT( bcost, (cost << 4) + i );
+ } while( ++i <= valid_mvcs );
+ bmx = mvc_temp[(bcost&15)+1][0];
+ bmy = mvc_temp[(bcost&15)+1][1];
+ bcost >>= 4;
}
- bmx = mvc_fpel[(bcost&15)+1][0];
- bmy = mvc_fpel[(bcost&15)+1][1];
- bcost >>= 4;
}
- }
- COST_MV( 0, 0 );
+ /* Same as above, except the condition is simpler. */
+ if( pmv )
+ COST_MV( 0, 0 );
+ }
switch( h->mb.i_me_method )
{
}
else
{
- m->mv[0] = bmx << 2;
- m->mv[1] = bmy << 2;
+ m->mv[0] = SPEL(bmx);
+ m->mv[1] = SPEL(bmy);
m->cost = bcost;
}
goto lowres_intra_mb;
// no need for h->mb.mv_min[]
- h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4;
- h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
- h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 );
- h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 );
+ h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4;
+ h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
+ h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 );
+ h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 );
if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
{
- h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4;
- h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
- h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 );
- h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 );
+ h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4;
+ h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
+ h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 );
+ h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 );
}
#define LOAD_HPELS_LUMA(dst, src) \