/* Interpolate and cache the reference blocks for BOTH lists at offset
 * (dx,dy) from the current bidir search centre (om0x/om0y, om1x/om1y).
 * Results are stored in src0/src1 (pointer) and stride0/stride1, indexed
 * by i = 4 + 3*dx + dy, which maps dx,dy in [-1,1] onto [0,8].
 * Relies on pix0/pix1/src0/src1/stride0/stride1/bw/bh/m0/m1 and the
 * om* centre variables being in scope at the expansion site. */
#define BIME_CACHE( dx, dy ) \
{ \
    int i = 4 + 3*dx + dy; \
    stride0[i] = bw;\
    stride1[i] = bw;\
    src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
    src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
}
+
/* Cache offset (a,b) and its point-symmetric counterpart (-a,-b);
 * the bidir search pattern below always probes symmetric pairs. */
#define BIME_CACHE2(a,b) \
    BIME_CACHE(a,b) \
    BIME_CACHE(-(a),-(b))
+
/* Deliberately NOT parenthesized: always used as "x * SATD_THRESH",
 * which expands to (x*17)/16 via left-to-right evaluation, i.e. a
 * 17/16 fractional threshold in integer math.  Parenthesizing as
 * (17/16) would truncate the whole macro to 1 — do not "fix" this. */
#define SATD_THRESH 17/16
+
/* Score one candidate bidir MV pair (m0x,m0y)/(m1x,m1y):
 * average the two cached reference blocks (indices derived from the
 * offsets relative to the om* centres), then add SATD plus the MV bit
 * costs for all four components.
 * The "visited" table packs 8 m1y positions per byte; it skips pairs
 * already scored in an earlier pass (only consulted when pass != 0,
 * since pass 0 starts with an empty table anyway).
 * In RD mode (rd != 0), candidates whose SATD cost is within
 * SATD_THRESH (17/16) of the best SATD also get a full x264_rd_cost_part
 * evaluation, and the best MVs are tracked by RD cost instead.
 * NOTE(review): when bcost is still COST_MAX, "bcost * SATD_THRESH"
 * expands to bcost*17 which can overflow int (UB) — works in practice
 * but would be safer in 64-bit math; confirm COST_MAX's magnitude. */
#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
{ \
    int cost; \
    int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
    int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
    visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
    h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
    cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
         + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
         + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
    if( rd ) \
    { \
        if( cost < bcost * SATD_THRESH ) \
        { \
            uint64_t costrd; \
            if( cost < bcost ) \
                bcost = cost; \
            *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \
            *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \
            costrd = x264_rd_cost_part( h, i_lambda2, i8, m0->i_pixel ); \
            if( costrd < bcostrd ) \
            {\
                bcostrd = costrd;\
                bm0x = m0x; \
                bm0y = m0y; \
                bm1x = m1x; \
                bm1y = m1y; \
            }\
        } \
    } \
    else if( cost < bcost ) \
    { \
        bcost = cost; \
        bm0x = m0x; \
        bm0y = m0y; \
        bm1x = m1x; \
        bm1y = m1y; \
    } \
}
+
/* Score the MV pair displaced by (a,b) on list0 and (c,d) on list1
 * from the current search centre. */
#define CHECK_BIDIR(a,b,c,d) \
    COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d)

/* A displacement and its point-symmetric counterpart. */
#define CHECK_BIDIR2(a,b,c,d) \
    CHECK_BIDIR(a,b,c,d) \
    CHECK_BIDIR(-(a),-(b),-(c),-(d))

/* All 4 cyclic rotations of (a,b,c,d), each with its symmetric pair:
 * 8 candidate MV pairs per invocation. */
#define CHECK_BIDIR8(a,b,c,d) \
    CHECK_BIDIR2(a,b,c,d) \
    CHECK_BIDIR2(b,c,d,a) \
    CHECK_BIDIR2(c,d,a,b) \
    CHECK_BIDIR2(d,a,b,c)
+
+static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
+{
+ static const int pixel_mv_offs[] = { 0, 4, 4*8, 0 };
+ int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
+ int16_t *cache0_mv2 = cache0_mv + pixel_mv_offs[m0->i_pixel];
+ int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
+ int16_t *cache1_mv2 = cache1_mv + pixel_mv_offs[m0->i_pixel];
+ const int i_pixel = m0->i_pixel;
+ const int bw = x264_pixel_size[i_pixel].w;
+ const int bh = x264_pixel_size[i_pixel].h;
+ const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
+ const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
+ const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
+ const int16_t *p_cost_m1y = m1->p_cost_mv - x264_clip3( m1->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
+ DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
+ DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
+ DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ uint8_t *src0[9];
+ uint8_t *src1[9];
+ int stride0[9];
+ int stride1[9];
+ int bm0x = m0->mv[0], om0x = bm0x;
+ int bm0y = m0->mv[1], om0y = bm0y;
+ int bm1x = m1->mv[0], om1x = bm1x;
+ int bm1y = m1->mv[1], om1y = bm1y;
+ int bcost = COST_MAX;
+ int pass = 0;
+ uint64_t bcostrd = COST_MAX64;
+
+ /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
+ DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
+
+ if( bm0y > h->mb.mv_max_spel[1] - 8 ||
+ bm1y > h->mb.mv_max_spel[1] - 8 )
+ return;
+
+ h->mc.memzero_aligned( visited, sizeof(visited) );
+
+ BIME_CACHE( 0, 0 );
+ CHECK_BIDIR( 0, 0, 0, 0 );
+
+ for( pass = 0; pass < 8; pass++ )
+ {
+ /* check all mv pairs that differ in at most 2 components from the current mvs. */
+ /* doesn't do chroma ME. this probably doesn't matter, as the gains
+ * from bidir ME are the same with and without chroma ME. */
+
+ BIME_CACHE2( 1, 0 );
+ BIME_CACHE2( 0, 1 );
+ BIME_CACHE2( 1, 1 );
+ BIME_CACHE2( 1,-1 );
+
+ CHECK_BIDIR8( 0, 0, 0, 1 );
+ CHECK_BIDIR8( 0, 0, 1, 1 );
+ CHECK_BIDIR2( 0, 1, 0, 1 );
+ CHECK_BIDIR2( 1, 0, 1, 0 );
+ CHECK_BIDIR8( 0, 0,-1, 1 );
+ CHECK_BIDIR2( 0,-1, 0, 1 );
+ CHECK_BIDIR2(-1, 0, 1, 0 );
+
+ if( om0x == bm0x && om0y == bm0y && om1x == bm1x && om1y == bm1y )
+ break;
+
+ om0x = bm0x;
+ om0y = bm0y;
+ om1x = bm1x;
+ om1y = bm1y;
+ BIME_CACHE( 0, 0 );
+ }
+
+ m0->mv[0] = bm0x;
+ m0->mv[1] = bm0y;
+ m1->mv[0] = bm1x;
+ m1->mv[1] = bm1y;
+}
+
/* Public entry: bidir refinement scored by SATD + MV bits only
 * (rd = 0; the i8 and i_lambda2 arguments are unused in this mode). */
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight )
{
    x264_me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 );
}
+
/* Public entry: bidir refinement with full rate-distortion scoring
 * (rd = 1) for 8x8 partition i8 at lambda2 = i_lambda2. */
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
{
    x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 );
}
+
#undef COST_MV_SATD
/* Compute the SATD + MV-bits cost of candidate (mx,my) into dst and
 * track the best SATD seen so far in bsatd.  When avoid_mvp is set,
 * the MV predictor (pmx,pmy) is skipped (it was scored separately) and
 * dst is set to COST_MAX so the later RD stage ignores it.
 * Expects pix/m/bw/bh/i_pixel/p_cost_mvx/p_cost_mvy/bsatd in scope;
 * note bw/bh here are in 4-pel units, hence the *4 for get_ref. */
#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \
{ \
    if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
    { \
        int stride = 16; \
        uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
        dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
            + p_cost_mvx[mx] + p_cost_mvy[my]; \
        COPY1_IF_LT( bsatd, dst ); \
    } \
    else \
        dst = COST_MAX; \
}
+
/* RD-score candidate (mx,my), but only if its SATD cost is within
 * SATD_THRESH (17/16) of the best SATD — skipping hopeless candidates
 * avoids the expensive x264_rd_cost_part call.  Writes the candidate
 * into the MV cache (cache_mv/cache_mv2) before scoring, and on
 * improvement updates bcost/bmx/bmy and, when do_dir is set, records
 * mdir in dir so the hex search knows which direction won. */
#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
{ \
    if( satd <= bsatd * SATD_THRESH ) \
    { \
        uint64_t cost; \
        *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
        cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
        COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
    } \
}
+
+void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list )
+{
+ // don't have to fill the whole mv cache rectangle
+ static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
+ int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
+ int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
+ const int16_t *p_cost_mvx, *p_cost_mvy;
+ const int bw = x264_pixel_size[m->i_pixel].w>>2;
+ const int bh = x264_pixel_size[m->i_pixel].h>>2;
+ const int i_pixel = m->i_pixel;
+
+ DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
+ int bmx = m->mv[0];
+ int bmy = m->mv[1];
+ int omx = bmx;
+ int omy = bmy;
+ int pmx, pmy, i, j;
+ unsigned bsatd;
+ int satd = 0;
+ int dir = -2;
+ int satds[8];
+
+ if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
+ x264_mb_predict_mv( h, i_list, i4, bw, m->mvp );
+ pmx = m->mvp[0];
+ pmy = m->mvp[1];
+ p_cost_mvx = m->p_cost_mv - pmx;
+ p_cost_mvy = m->p_cost_mv - pmy;
+ COST_MV_SATD( bmx, bmy, bsatd, 0 );
+ COST_MV_RD( bmx, bmy, 0, 0, 0 );
+
+ /* check the predicted mv */
+ if( (bmx != pmx || bmy != pmy)
+ && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
+ && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
+ {
+ COST_MV_SATD( pmx, pmy, satd, 0 );
+ COST_MV_RD( pmx, pmy, satd, 0,0 );
+ /* The hex motion search is guaranteed to not repeat the center candidate,
+ * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
+ if( bmx == pmx && bmy == pmy )
+ {
+ pmx = m->mv[0];
+ pmy = m->mv[1];
+ }
+ }
+
+ /* subpel hex search, same pattern as ME HEX. */
+ dir = -2;
+ omx = bmx;
+ omy = bmy;
+ for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 );
+ for( j=0; j<6; j++ ) COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j );
+
+ if( dir != -2 )
+ {
+ /* half hexagon, not overlapping the previous iteration */
+ for( i = 1; i < 10; i++ )
+ {
+ const int odir = mod6m1[dir+1];
+ if( bmy > h->mb.mv_max_spel[1] - 2 ||
+ bmy < h->mb.mv_min_spel[1] - 2 )
+ break;
+ dir = -2;
+ omx = bmx;
+ omy = bmy;
+ for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 );
+ for( j=0; j<3; j++ ) COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j );
+ if( dir == -2 )
+ break;
+ }
+ }
+
+ /* square refine, same as pattern as ME HEX. */
+ omx = bmx;
+ omy = bmy;
+ for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i][0], omy + square1[i][1], satds[i], 1 );
+ for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i][0], omy + square1[i][1], satds[i], 0,0 );
+
+ bmy = x264_clip3( bmy, h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
+ m->cost = bcost;
+ m->mv[0] = bmx;
+ m->mv[1] = bmy;
+ x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) );
+ x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+}