+ for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
+ CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
+
+ for( i = 0; i < 4; i++ )
+ {
+ x264_me_t *l0m = &a->l0.me8x8[i];
+ const int x8 = i%2;
+ const int y8 = i/2;
+
+ m.i_pixel = PIXEL_8x8;
+
+ LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
+ l0m->cost = INT_MAX;
+ for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
+ {
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ i_halfpel_thresh -= i_ref_cost;
+ m.i_ref_cost = i_ref_cost;
+
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
+ x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
+ x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+ x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
+
+ m.cost += i_ref_cost;
+ i_halfpel_thresh += i_ref_cost;
+ CP32( a->l0.mvc[i_ref][i+1], m.mv );
+
+ if( m.cost < l0m->cost )
+ h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
+ }
+ x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
+ x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
+
+ /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
+ are effectively zero. */
+ if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+ l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+ }
+
+ a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
+ a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+ /* P_8x8 ref0 has no ref cost */
+ if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
+ a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
+ a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
+ h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+ h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
+}
+
+/* P 8x8 analysis, single-reference variant: reuse the reference picked by
+ * the preceding 16x16 analysis for all four 8x8 partitions instead of
+ * searching every reference per partition.  Fills a->l0.me8x8[0..3], the
+ * MV cache, and a->l0.i_cost8x8; marks all sub-partitions as D_L0_8x8. */
+static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
+{
+ const int i_ref = a->l0.me16x16.i_ref;
+ /* Under CAVLC, ref 0 carries no ref cost (cf. the ref0 special case in
+ * the multi-ref path), so drop it entirely in that case. */
+ const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
+ uint8_t **p_fenc = h->mb.pic.p_fenc;
+ int i_mvc;
+ int16_t (*mvc)[2] = a->l0.mvc[i_ref];
+ int i;
+
+ /* XXX Needed for x264_mb_predict_mv */
+ h->mb.i_partition = D_8x8;
+
+ /* Seed the MV candidate list with the 16x16 result; each finished 8x8
+ * search appends its MV as a candidate for the following partitions. */
+ i_mvc = 1;
+ CP32( mvc[0], a->l0.me16x16.mv );
+
+ for( i = 0; i < 4; i++ )
+ {
+ x264_me_t *m = &a->l0.me8x8[i];
+ const int x8 = i%2;
+ const int y8 = i/2;
+
+ m->i_pixel = PIXEL_8x8;
+ m->i_ref_cost = i_ref_cost;
+
+ LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
+ LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
+ x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
+ x264_me_search( h, m, mvc, i_mvc );
+
+ /* Cache the result now so the next partition's MV prediction sees it. */
+ x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
+
+ CP32( mvc[i_mvc], m->mv );
+ i_mvc++;
+
+ /* mb type cost */
+ m->cost += i_ref_cost;
+ if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+ m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+ }
+
+ a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
+ a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+ /* theoretically this should include 4*ref_cost,
+ * but 3 seems a better approximation of cabac. */
+ if( h->param.b_cabac )
+ a->l0.i_cost8x8 -= i_ref_cost;
+ h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+ h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
+}
+
+/* P 16x8 analysis: motion-search the two horizontal halves of the MB.
+ * Candidate references for each half are the refs chosen by the two 8x8
+ * partitions it covers (so p8x8 analysis must already have run).  Fills
+ * a->l0.me16x8[0..1], the MV/ref caches, and a->l0.i_cost16x8. */
+static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
+{
+ x264_me_t m;
+ uint8_t **p_fenc = h->mb.pic.p_fenc;
+ ALIGNED_4( int16_t mvc[3][2] );
+ int i, j;
+
+ /* XXX Needed for x264_mb_predict_mv */
+ h->mb.i_partition = D_16x8;
+
+ for( i = 0; i < 2; i++ )
+ {
+ x264_me_t *l0m = &a->l0.me16x8[i];
+ /* Refs of the two side-by-side 8x8 blocks in this half; search only
+ * once if they agree. */
+ const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
+ const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
+
+ m.i_pixel = PIXEL_16x8;
+
+ LOAD_FENC( &m, p_fenc, 0, 8*i );
+ l0m->cost = INT_MAX;
+ for( j = 0; j < i_ref8s; j++ )
+ {
+ const int i_ref = ref8[j];
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ m.i_ref_cost = i_ref_cost;
+
+ /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
+ CP32( mvc[0], a->l0.mvc[i_ref][0] );
+ CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
+ CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
+
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
+
+ /* The ref must be cached before MV prediction so the predictor
+ * scales neighbour MVs against the right reference. */
+ x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
+ x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
+ x264_me_search( h, &m, mvc, 3 );
+
+ m.cost += i_ref_cost;
+
+ /* Keep the best (ref, MV) pair found so far. */
+ if( m.cost < l0m->cost )
+ h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
+ }
+ x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
+ x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
+ }
+
+ a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
+}
+
+/* P 8x16 analysis: motion-search the two vertical halves of the MB.
+ * Mirror of x264_mb_analyse_inter_p16x8 — candidate refs for column i come
+ * from the vertically stacked 8x8 partitions i and i+2.  Fills
+ * a->l0.me8x16[0..1], the MV/ref caches, and a->l0.i_cost8x16. */
+static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
+{
+ x264_me_t m;
+ uint8_t **p_fenc = h->mb.pic.p_fenc;
+ ALIGNED_4( int16_t mvc[3][2] );
+ int i, j;
+
+ /* XXX Needed for x264_mb_predict_mv */
+ h->mb.i_partition = D_8x16;
+
+ for( i = 0; i < 2; i++ )
+ {
+ x264_me_t *l0m = &a->l0.me8x16[i];
+ /* Refs of the two stacked 8x8 blocks in this column; search only
+ * once if they agree. */
+ const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
+ const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
+
+ m.i_pixel = PIXEL_8x16;
+
+ LOAD_FENC( &m, p_fenc, 8*i, 0 );
+ l0m->cost = INT_MAX;
+ for( j = 0; j < i_ref8s; j++ )
+ {
+ const int i_ref = ref8[j];
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ m.i_ref_cost = i_ref_cost;
+
+ /* MV candidates: the 16x16 MV plus the two 8x8 MVs in this column. */
+ CP32( mvc[0], a->l0.mvc[i_ref][0] );
+ CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
+ CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
+
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
+
+ /* The ref must be cached before MV prediction so the predictor
+ * scales neighbour MVs against the right reference. */
+ x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
+ x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+ x264_me_search( h, &m, mvc, 3 );
+
+ m.cost += i_ref_cost;
+
+ /* Keep the best (ref, MV) pair found so far. */
+ if( m.cost < l0m->cost )
+ h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
+ }
+ x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
+ x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
+ }
+
+ a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
+}
+
+/* Estimate the chroma (U+V) cost of the sub-8x8 motion vectors of luma 8x8
+ * block i8x8: motion-compensate both chroma planes with each sub-block's MV
+ * (applying weighted prediction when enabled) into a temporary buffer, then
+ * compare against the encoded chroma with the PIXEL_4x4 mbcmp.
+ *   p_fref - reference plane pointers; [4] is the U plane, [5] is V
+ *   pixel  - PIXEL_4x4 / PIXEL_8x4 / other (4x8), selecting which me array
+ *            (me4x4 / me8x4 / me4x8) supplies the MVs
+ * Returns the summed U+V comparison cost for the 4x4 chroma region. */
+static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
+{
+ /* pix1 holds the U prediction; pix2 aliases columns 8..15 of the same
+ * rows for V — both are addressed with a stride of 16. */
+ ALIGNED_8( uint8_t pix1[16*8] );
+ uint8_t *pix2 = pix1+8;
+ const int i_stride = h->mb.pic.i_stride[1];
+ /* Chroma offsets of this 8x8 luma block: (4*(i8x8&1), 4*(i8x8>>1)) in the
+ * reference (or) and encoded (oe) planes. */
+ const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
+ const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
+ const int i_ref = a->l0.me8x8[i8x8].i_ref;
+ /* NOTE(review): presumably the interlaced chroma MV adjustment for
+ * opposite-parity field references — confirm against the MC code. */
+ const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ x264_weight_t *weight = h->sh.weight[i_ref];
+
+/* MC one chroma sub-block for both planes, weighting each plane with its own
+ * weight table (weight[1] = U, weight[2] = V) when present. */
+#define CHROMA4x4MC( width, height, me, x, y ) \
+ h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ if( weight[1].weightfn ) \
+ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+ h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ if( weight[2].weightfn ) \
+ weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
+/* BUGFIX: the V-plane call above used weight[1].weightfn despite being
+ * guarded by weight[2].weightfn and passing &weight[2]; the V plane was
+ * weighted with the U function table. Now consistently weight[2]. */
+
+ if( pixel == PIXEL_4x4 )
+ {
+ x264_me_t *m = a->l0.me4x4[i8x8];
+ CHROMA4x4MC( 2,2, m[0], 0,0 );
+ CHROMA4x4MC( 2,2, m[1], 2,0 );
+ CHROMA4x4MC( 2,2, m[2], 0,2 );
+ CHROMA4x4MC( 2,2, m[3], 2,2 );
+ }
+ else if( pixel == PIXEL_8x4 )
+ {
+ x264_me_t *m = a->l0.me8x4[i8x8];
+ CHROMA4x4MC( 4,2, m[0], 0,0 );
+ CHROMA4x4MC( 4,2, m[1], 0,2 );
+ }
+ else
+ {
+ x264_me_t *m = a->l0.me4x8[i8x8];
+ CHROMA4x4MC( 2,4, m[0], 0,0 );
+ CHROMA4x4MC( 2,4, m[1], 2,0 );
+ }
+
+ return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
+ + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
+}
+
+static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
+{
+ uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+ uint8_t **p_fenc = h->mb.pic.p_fenc;
+ const int i_ref = a->l0.me8x8[i8x8].i_ref;
+ int i4x4;
+
+ /* XXX Needed for x264_mb_predict_mv */
+ h->mb.i_partition = D_8x8;
+
+ for( i4x4 = 0; i4x4 < 4; i4x4++ )
+ {
+ const int idx = 4*i8x8 + i4x4;
+ const int x4 = block_idx_x[idx];
+ const int y4 = block_idx_y[idx];
+ const int i_mvc = (i4x4 == 0);
+
+ x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
+
+ m->i_pixel = PIXEL_4x4;
+
+ LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
+ LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
+
+ x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
+ x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
+
+ x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
+ }