+ x264_mb_cache_mv_b16x8( h, a, i, 0 );
+ }
+
+ /* mb type cost */
+ a->i_mb_type16x8 = B_L0_L0
+ + (a->i_mb_partition16x8[0]>>2) * 3
+ + (a->i_mb_partition16x8[1]>>2);
+ a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
+}
+
+/* Analyse the two vertical 8x16 halves of a B-macroblock.  For each half,
+ * run motion estimation on both reference lists (candidate refs seeded from
+ * the already-computed 8x8 results), build the bidirectional average, and
+ * keep the cheapest of L0-only / L1-only / BI.  The per-half costs are
+ * accumulated into a->i_cost8x16bi, and a lambda-weighted MB-type cost is
+ * added at the end. */
+static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
+{
+ ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] ); /* temp prediction buffers, one per list */
+ ALIGNED_4( int16_t mvc[3][2] ); /* MV candidates fed to x264_me_search */
+
+ h->mb.i_partition = D_8x16;
+ a->i_cost8x16bi = 0;
+
+ for( int i = 0; i < 2; i++ ) /* i = 0: left 8x16 half, i = 1: right half */
+ {
+ int i_part_cost;
+ int i_part_cost_bi = 0;
+ int stride[2] = {8,8};
+ uint8_t *src[2];
+ x264_me_t m;
+ m.i_pixel = PIXEL_8x16;
+ LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
+
+ for( int l = 0; l < 2; l++ ) /* l = 0: list L0, l = 1: list L1 */
+ {
+ x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+ /* Candidate refs: the ones chosen by the two 8x8 blocks stacked in this
+ * half (blocks i and i+2); search each distinct ref only once. */
+ int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
+ int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
+ lX->me8x16[i].cost = INT_MAX;
+ for( int j = 0; j < i_ref8s; j++ )
+ {
+ int i_ref = ref8[j];
+ m.i_ref_cost = REF_COST( l, i_ref );
+
+ LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
+
+ /* Seed the search from per-ref stored MVs (entry 0 plus two
+ * block-adjacent entries — presumably this half's 8x8 MVs; verify
+ * against the lX->mvc layout where it is filled). */
+ CP32( mvc[0], lX->mvc[i_ref][0] );
+ CP32( mvc[1], lX->mvc[i_ref][i+1] );
+ CP32( mvc[2], lX->mvc[i_ref][i+3] );
+
+ /* Cache the ref before MV prediction so the predictor sees it. */
+ x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
+ x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
+ x264_me_search( h, &m, mvc, 3 );
+ m.cost += m.i_ref_cost;
+
+ /* Keep the best (ref, MV) result for this list/half. */
+ if( m.cost < lX->me8x16[i].cost )
+ h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
+ }
+ }
+
+ /* BI mode: weighted average of the best L0 and L1 predictions, rated with
+ * both MV costs and both ref costs. */
+ src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
+ a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
+ src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
+ a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
+ h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
+
+ i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
+ + a->l1.me8x16[i].i_ref_cost;
+
+ /* Pick the cheapest of L0 / L1 / BI for this half; BI pays a small
+ * lambda penalty to break ties in favor of unidirectional prediction. */
+ i_part_cost = a->l0.me8x16[i].cost;
+ a->i_mb_partition8x16[i] = D_L0_8x8;
+
+ if( a->l1.me8x16[i].cost < i_part_cost )
+ {
+ i_part_cost = a->l1.me8x16[i].cost;
+ a->i_mb_partition8x16[i] = D_L1_8x8;
+ }
+ if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
+ {
+ i_part_cost = i_part_cost_bi;
+ a->i_mb_partition8x16[i] = D_BI_8x8;
+ }
+ a->i_cost8x16bi += i_part_cost;
+
+ x264_mb_cache_mv_b8x16( h, a, i, 0 );
+ }
+
+ /* mb type cost: map the two per-half partition choices to a B MB type.
+ * The cost table is shared with the 16x8 case (same type lattice). */
+ a->i_mb_type8x16 = B_L0_L0
+ + (a->i_mb_partition8x16[0]>>2) * 3
+ + (a->i_mb_partition8x16[1]>>2);
+ a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
+}
+
+/* RD refinement for P-macroblocks: re-score partition candidates that came
+ * within threshold of the best SATD score (i_satd) using full rate-distortion
+ * cost.  Partitions over threshold get COST_MAX so later comparisons skip
+ * them.  For 8x8 with sub-8x8 analysis enabled, each 8x8 block's
+ * sub-partition is additionally refined by per-part RD. */
+static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
+{
+ int thresh = i_satd * 5/4; /* only RD-check partitions within 25% of best SATD */
+
+ h->mb.i_type = P_L0;
+ /* 16x16 uses a looser 50% threshold and is skipped if already RD-scored. */
+ if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
+ {
+ h->mb.i_partition = D_16x16;
+ x264_analyse_update_cache( h, a );
+ a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+
+ if( a->l0.i_cost16x8 <= thresh )
+ {
+ h->mb.i_partition = D_16x8;
+ x264_analyse_update_cache( h, a );
+ a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+ else
+ a->l0.i_cost16x8 = COST_MAX; /* exclude from final mode decision */
+
+ if( a->l0.i_cost8x16 <= thresh )
+ {
+ h->mb.i_partition = D_8x16;
+ x264_analyse_update_cache( h, a );
+ a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+ else
+ a->l0.i_cost8x16 = COST_MAX; /* exclude from final mode decision */
+
+ if( a->l0.i_cost8x8 <= thresh )
+ {
+ h->mb.i_type = P_8x8;
+ h->mb.i_partition = D_8x8;
+ if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
+ {
+ x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
+ x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
+ x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
+ x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
+ /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
+ * for future blocks are those left over from previous RDO calls. */
+ for( int i = 0; i < 4; i++ )
+ {
+ /* Sub-partition candidates for this 8x8 block, indexed to match
+ * the D_L0_4x4..D_L0_8x8 subtype range iterated below. */
+ int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
+ int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
+ int subtype, btype = D_L0_8x8;
+ uint64_t bcost = COST_MAX64;
+ for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
+ {
+ uint64_t cost;
+ /* Skip over-threshold subtypes, but always evaluate D_L0_8x8
+ * if nothing else was evaluated, so bcost/btype stay valid. */
+ if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
+ continue;
+ h->mb.i_sub_partition[i] = subtype;
+ x264_mb_cache_mv_p8x8( h, a, i );
+ cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
+ COPY2_IF_LT( bcost, cost, btype, subtype );
+ }
+ /* Restore the winning subtype's MVs in the cache if the last one
+ * evaluated wasn't the best. */
+ if( h->mb.i_sub_partition[i] != btype )
+ {
+ h->mb.i_sub_partition[i] = btype;
+ x264_mb_cache_mv_p8x8( h, a, i );
+ }
+ }
+ }
+ else
+ x264_analyse_update_cache( h, a );
+ /* Whole-MB RD cost with the (possibly refined) sub-partitions in place. */
+ a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+ else
+ a->l0.i_cost8x8 = COST_MAX; /* exclude from final mode decision */
+}
+
+static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
+{
+ int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
+
+ if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
+ {
+ h->mb.i_type = B_DIRECT;
+ /* Assumes direct/skip MC is still in fdec */
+ /* Requires b-rdo to be done before intra analysis */
+ h->mb.b_skip_mc = 1;
+ x264_analyse_update_cache( h, a );
+ a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
+ h->mb.b_skip_mc = 0;
+ }
+
+ //FIXME not all the update_cache calls are needed
+ h->mb.i_partition = D_16x16;
+ /* L0 */
+ if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
+ {
+ h->mb.i_type = B_L0_L0;
+ x264_analyse_update_cache( h, a );
+ a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+
+ /* L1 */
+ if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
+ {
+ h->mb.i_type = B_L1_L1;
+ x264_analyse_update_cache( h, a );
+ a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+
+ /* BI */
+ if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
+ {
+ h->mb.i_type = B_BI_BI;
+ x264_analyse_update_cache( h, a );
+ a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
+ }
+
+ /* 8x8 */
+ if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
+ {
+ h->mb.i_type = B_8x8;
+ h->mb.i_partition = D_8x8;
+ x264_analyse_update_cache( h, a );
+ a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
+ x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
+ }
+
+ /* 16x8 */
+ if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
+ {
+ h->mb.i_type = a->i_mb_type16x8;
+ h->mb.i_partition = D_16x8;
+ x264_analyse_update_cache( h, a );
+ a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );