+}
+
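+/* SATD-only bidir refinement: the trailing 0 disables the RD path, so the
+ * i8 and i_lambda2 arguments are passed as dummies. */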
+void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight )
+{
+    x264_me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 );
+}
+
+void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
+{
+    /* Motion compensation is done as part of bidir_rd; don't repeat
+     * it in encoding. */
+    h->mb.b_skip_mc = 1;
+    x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 );
+    h->mb.b_skip_mc = 0;
+}
+
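+/* Cost a candidate MV by SATD: interpolate luma into the decode buffer, then
+ * add the lambda-weighted MV bit cost.  If avoid_mvp is set, the predicted MV
+ * is skipped (it has already been tested).  bsatd tracks the best SATD seen. */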
+#undef COST_MV_SATD
+#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \
+{ \
+    if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
+    { \
+        h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+        dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
+            + p_cost_mvx[mx] + p_cost_mvy[my]; \
+        COPY1_IF_LT( bsatd, dst ); \
+    } \
+    else \
+        dst = COST_MAX; \
+}
+
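+/* Full RD cost of a candidate, gated on SATD: candidates whose SATD exceeds
+ * SATD_THRESH times the best SATD are assumed hopeless and skipped. */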
+#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
+{ \
+    if( satd <= bsatd * SATD_THRESH ) \
+    { \
+        uint64_t cost; \
+        M32( cache_mv ) = pack16to32_mask(mx,my); \
+        M32( cache_mv2 ) = pack16to32_mask(mx,my); \
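+        /* The candidate MV was just written to the cache so the RD encode \
+         * uses it; luma is already in fdec from COST_MV_SATD, so only chroma \
+         * (for partitions of 8x8 and larger) still needs interpolation. */ \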
+        if( m->i_pixel <= PIXEL_8x8 )\
+        {\
+            h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
+            h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
+        }\
+        cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
+        COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
+    } \
+}
+
+void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list )
+{
+    // don't have to fill the whole mv cache rectangle
+    static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
+    int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
+    int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
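+    /* cache_mv2 points at the partition's second sub-block, so only the two
+     * cache entries that the RD bit-cost code reads get updated. */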
+    const uint16_t *p_cost_mvx, *p_cost_mvy;
+    const int bw = x264_pixel_size[m->i_pixel].w;
+    const int bh = x264_pixel_size[m->i_pixel].h;
+    const int i_pixel = m->i_pixel;
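+    /* Vertical chroma MV adjustment for interlaced references of opposite
+     * field parity. */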
+    const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+
+    uint64_t bcost = COST_MAX64;
+    int bmx = m->mv[0];
+    int bmy = m->mv[1];
+    int omx, omy, pmx, pmy, i, j;
+    unsigned bsatd;
+    int satd;
+    int dir = -2;
+    int i8 = i4>>2;
+
+    uint8_t *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
+    uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+    uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+
+    h->mb.b_skip_mc = 1;
+
+    if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
+        x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp );
+    pmx = m->mvp[0];
+    pmy = m->mvp[1];
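+    /* p_cost_mv is centered on a zero MV delta; biasing the pointers by the
+     * predictor makes p_cost_mvx[mx] the bit cost of the x MVD (mx - pmx). */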
+    p_cost_mvx = m->p_cost_mv - pmx;
+    p_cost_mvy = m->p_cost_mv - pmy;
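+    /* Seed the search with the incoming best MV: its SATD initializes bsatd;
+     * for 16x16 the caller's m->cost already holds its RD cost. */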
+    COST_MV_SATD( bmx, bmy, bsatd, 0 );
+    if( m->i_pixel != PIXEL_16x16 )
+        COST_MV_RD( bmx, bmy, 0, 0, 0 )
+    else
+        bcost = m->cost;
+
+    /* check the predicted mv */
+    if( (bmx != pmx || bmy != pmy)
+        && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
+        && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
+    {
+        COST_MV_SATD( pmx, pmy, satd, 0 );
+        COST_MV_RD ( pmx, pmy, satd, 0, 0 );
+        /* The hex motion search is guaranteed to not repeat the center candidate,
+         * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
+        if( bmx == pmx && bmy == pmy )
+        {
+            pmx = m->mv[0];
+            pmy = m->mv[1];
+        }
+    }
+
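+    /* Bail out if the best MV is within 3 qpel units of the edge of the legal
+     * range, so the refinement candidates below cannot step out of range. */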
+    if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 ||
+        bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 )
+    {
+        h->mb.b_skip_mc = 0;
+        return;
+    }
+
+    /* subpel hex search, same pattern as ME HEX. */
+    dir = -2;
+    omx = bmx;
+    omy = bmy;
+    for( j=0; j<6; j++ )
+    {
+        COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 );
+        COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j );
+    }
+
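+    /* dir now holds the winning hexagon direction from COST_MV_RD, or -2 if
+     * no candidate improved on the starting point. */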
+    if( dir != -2 )
+    {
+        /* half hexagon, not overlapping the previous iteration */
+        for( i = 1; i < 10; i++ )
+        {
+            const int odir = mod6m1[dir+1];
+            if( bmy < h->mb.mv_min_spel[1] + 3 ||
+                bmy > h->mb.mv_max_spel[1] - 3 )
+                break;
+            dir = -2;
+            omx = bmx;
+            omy = bmy;
+            for( j=0; j<3; j++ )
+            {
+                COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 );
+                COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j );
+            }
+            if( dir == -2 )
+                break;
+        }
+    }
+
+    /* square refine, same pattern as ME HEX. */
+    omx = bmx;
+    omy = bmy;
+    for( i=0; i<8; i++ )
+    {
+        COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 );
+        COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 );
+    }
+
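+    /* Commit the winner: store the refined MV and its cost, and update the MV
+     * and MVD caches for later blocks and MV prediction. */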
+    m->cost = bcost;
+    m->mv[0] = bmx;
+    m->mv[1] = bmy;
+    x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
+    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+    h->mb.b_skip_mc = 0;