+}
+
+/* SATD variant of the bidirectional refinement entry point.
+ * Forwards h, m0, m1 and i_weight unchanged to x264_me_refine_bidir and
+ * passes 0 for each of its three trailing arguments. */
+void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight )
+{
+    /* Trailing arguments, named after what the _rd wrapper passes in the
+     * same positions; all of them are zero for this variant. */
+    const int i8 = 0;
+    const int i_lambda2 = 0;
+    const int rd = 0; /* presumably selects the RD path (the _rd wrapper passes 1) -- TODO confirm */
+
+    x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, rd );
+}
+
+/* RD variant of the bidirectional refinement entry point.
+ * Forwards every argument to x264_me_refine_bidir in the order received
+ * and passes 1 as its final argument. */
+void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
+{
+    const int rd = 1; /* presumably selects the RD path (the _satd wrapper passes 0) -- TODO confirm */
+
+    x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, rd );
+}
+
+#undef COST_MV_SATD
+#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \
+{ \
+ if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
+ { \
+ int stride = 16; \
+ uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
+ dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ + p_cost_mvx[mx] + p_cost_mvy[my]; \
+ COPY1_IF_LT( bsatd, dst ); \
+ } \
+ else \
+ dst = COST_MAX; \
+}
+
+#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
+{ \
+ if( satd <= bsatd * SATD_THRESH ) \
+ { \
+ uint64_t cost; \
+ *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
+ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
+ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
+ } \
+}
+
+void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list )
+{
+ // don't have to fill the whole mv cache rectangle
+ static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
+ int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
+ int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
+ const int16_t *p_cost_mvx, *p_cost_mvy;
+ const int bw = x264_pixel_size[m->i_pixel].w>>2;
+ const int bh = x264_pixel_size[m->i_pixel].h>>2;
+ const int i_pixel = m->i_pixel;
+
+ DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
+ int bmx = m->mv[0];
+ int bmy = m->mv[1];
+ int omx = bmx;
+ int omy = bmy;
+ int pmx, pmy, i, j;
+ unsigned bsatd;
+ int satd = 0;
+ int dir = -2;
+ int satds[8];
+
+ if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
+ x264_mb_predict_mv( h, i_list, i4, bw, m->mvp );
+ pmx = m->mvp[0];
+ pmy = m->mvp[1];
+ p_cost_mvx = m->p_cost_mv - pmx;
+ p_cost_mvy = m->p_cost_mv - pmy;
+ COST_MV_SATD( bmx, bmy, bsatd, 0 );
+ COST_MV_RD( bmx, bmy, 0, 0, 0 );
+
+ /* check the predicted mv */
+ if( (bmx != pmx || bmy != pmy)
+ && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
+ && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
+ {
+ COST_MV_SATD( pmx, pmy, satd, 0 );
+ COST_MV_RD( pmx, pmy, satd, 0,0 );
+ /* The hex motion search is guaranteed to not repeat the center candidate,
+ * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
+ if( bmx == pmx && bmy == pmy )
+ {
+ pmx = m->mv[0];
+ pmy = m->mv[1];
+ }
+ }
+
+ /* subpel hex search, same pattern as ME HEX. */
+ dir = -2;
+ omx = bmx;
+ omy = bmy;
+ for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 );
+ for( j=0; j<6; j++ ) COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j );
+
+ if( dir != -2 )
+ {
+ /* half hexagon, not overlapping the previous iteration */
+ for( i = 1; i < 10; i++ )
+ {
+ const int odir = mod6m1[dir+1];
+ if( bmy > h->mb.mv_max_spel[1] - 2 ||
+ bmy < h->mb.mv_min_spel[1] - 2 )
+ break;
+ dir = -2;
+ omx = bmx;
+ omy = bmy;
+ for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 );
+ for( j=0; j<3; j++ ) COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j );
+ if( dir == -2 )
+ break;
+ }
+ }
+
+ /* square refine, same pattern as ME HEX. */
+ omx = bmx;
+ omy = bmy;
+ for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i][0], omy + square1[i][1], satds[i], 1 );
+ for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i][0], omy + square1[i][1], satds[i], 0,0 );
+
+ bmy = x264_clip3( bmy, h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
+ m->cost = bcost;
+ m->mv[0] = bmx;
+ m->mv[1] = bmy;
+ x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) );
+ x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );