git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 x264 project
   5  * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Loren Merritt <lorenm@u.washington.edu>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #include <stdio.h>
  26 #include <string.h>
  27 #include <math.h>
  28 #include <limits.h>
  29
  30 #include "common/common.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
/* Per-reference-list inter analysis state: one x264_me_t search result per
 * partition for every partition shape, plus the aggregated cost of each
 * shape.  x264_mb_analysis_t embeds one instance for l0 and one for l1. */
typedef struct
{
    /* 16x16 */
    int i_ref;
    x264_me_t me16x16;

    /* 8x8 */
    int       i_cost8x8;
    int       mvc[16][5][2]; /* [ref][0] is 16x16 mv,
                                [ref][1..4] are 8x8 mv from partition [0..3] */
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    /* NOTE(review): second dimension is 4 here but 2 for me8x4; presumably
     * only two entries per 8x8 partition are used -- confirm before shrinking */
    x264_me_t me4x8[4][4];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;
  70
/* Working state for the mode decision of a single macroblock.
 * x264_mb_analyse_init() zeroes the whole structure and then sets the
 * cost fields it initialises to COST_MAX, which the analysis code uses as
 * "not evaluated yet". */
typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;       /* i_qp0_cost_table[i_qp]: weight of bits in SAD/SATD costs */
    int i_lambda2;      /* i_qp0_cost2_table[i_qp]: lambda passed to the x264_rd_cost_* calls */
    int i_qp;
    int16_t *p_cost_mv; /* points at the centre of the per-QP mv cost table
                           (see x264_mb_analyse_load_costs), so it accepts negative indices */
    int b_mbrd;         /* non-zero: macroblock-level costs come from x264_rd_cost_mb() */


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int i_best_satd;    /* cost used to derive the intra thresholds in x264_mb_analyse_intra() */
    int b_try_pskip;    /* enables the P_SKIP early-out of the 16x16 search */

    /* Luma part */
    int i_sad_i16x16;
    int i_predict16x16;

    int i_sad_i8x8;
    int i_predict8x8[2][2];     /* indexed [x][y] in units of 8x8 blocks */

    int i_sad_i4x4;
    int i_predict4x4[4][4];     /* indexed [x][y] in units of 4x4 blocks */

    /* Chroma part */
    int i_sad_i8x8chroma;
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_cost16x8bi;
    int i_cost8x16bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;

} x264_mb_analysis_t;
 120
 121 /* lambda = pow(2,qp/6-2) */
 122 static const int i_qp0_cost_table[52] = {
 123    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 124    1, 1, 1, 1,              /*  8-11 */
 125    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 126    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 127    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 128   16,18,20,23,25,29,32,36,  /* 36-43 */
 129   40,45,51,57,64,72,81,91   /* 44-51 */
 130 };
 131
 132 /* pow(lambda,2) * .9 */
 133 static const int i_qp0_cost2_table[52] = {
 134    1,   1,   1,   1,   1,   1, /*  0-5  */
 135    1,   1,   1,   1,   1,   1, /*  6-11 */
 136    1,   1,   1,   2,   2,   3, /* 12-17 */
 137    4,   5,   6,   7,   9,  11, /* 18-23 */
 138   14,  18,  23,  29,  36,  46, /* 24-29 */
 139   58,  73,  91, 115, 145, 183, /* 30-35 */
 140  230, 290, 366, 461, 581, 731, /* 36-41 */
 141  922,1161,1463,1843,2322,2926, /* 42-47 */
 142 3686,4645,5852,7373
 143 };
 144
 145 /* TODO: calculate CABAC costs */
 146 static const int i_mb_b_cost_table[19] = {
 147     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 148 };
 149 static const int i_mb_b16x8_cost_table[17] = {
 150     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 151 };
 152 static const int i_sub_mb_b_cost_table[13] = {
 153     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 154 };
 155 static const int i_sub_mb_p_cost_table[4] = {
 156     5, 3, 3, 1
 157 };
 158
/* Forward declaration: needed because x264_mb_analyse_inter_p16x16() calls
 * this on its P_SKIP early-out before the definition appears. */
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 160
 161 /* initialize an array of lambda*nbits for all possible mvs */
 162 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 163 {
 164     static int16_t *p_cost_mv[52];
 165
 166     if( !p_cost_mv[a->i_qp] )
 167     {
 168         /* could be faster, but isn't called many times */
 169         /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 170         int i;
 171         p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
 172         p_cost_mv[a->i_qp] += 2*4*2048;
 173         for( i = 0; i <= 2*4*2048; i++ )
 174         {
 175             p_cost_mv[a->i_qp][-i] =
 176             p_cost_mv[a->i_qp][i]  = a->i_lambda * bs_size_se( i );
 177         }
 178     }
 179
 180     a->p_cost_mv = p_cost_mv[a->i_qp];
 181 }
 182
/* Reset the analysis context for a new macroblock.
 *
 * Zeroes *a, derives both lambdas from i_qp, copies the motion-estimation
 * settings from h->param into h->mb, marks every cost it initialises as
 * COST_MAX and, for non-I slices, computes the motion vector limits for the
 * current macroblock position and decides whether the fast intra shortcut
 * may be taken.
 *
 * i_qp indexes the 52-entry lambda tables, so it must lie in [0,51]. */
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    memset( a, 0, sizeof( x264_mb_analysis_t ) );

    /* conduct the analysis using this lambda and QP */
    a->i_qp = h->mb.i_qp = i_qp;
    a->i_lambda = i_qp0_cost_table[i_qp];
    a->i_lambda2 = i_qp0_cost2_table[i_qp];
    /* macroblock-level RD needs subpel_refine >= 6; in B slices it
     * additionally needs b_bframe_rdo */
    a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 &&
                ( h->sh.i_type != SLICE_TYPE_B || h->param.analyse.b_bframe_rdo );

    h->mb.i_me_method = h->param.analyse.i_me_method;
    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
    /* chroma ME is only enabled in P slices with subpel_refine >= 5 */
    h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                        && h->mb.i_subpel_refine >= 5;
    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->b_mbrd;
    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    /* I: Intra part */
    a->i_sad_i16x16 =
    a->i_sad_i8x8   =
    a->i_sad_i4x4   =
    a->i_sad_i8x8chroma = COST_MAX;

    a->b_fast_intra = 0;
    a->i_best_satd = COST_MAX;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i;
        int i_fmv_range = h->param.analyse.i_mv_range - 16;

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
        /* mv_min/mv_max are 4x the pixel offsets; the fpel limits are in
         * pixels, clipped to i_fmv_range; the spel limits are the fpel
         * limits widened by 16 pixels and scaled by 4 */
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
        h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 8 );
        h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
        h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
        /* the vertical limits below depend only on i_mb_y, so they are
         * recomputed only for the first macroblock of each row; this relies
         * on h->mb keeping the values for the rest of the row */
        if( h->mb.i_mb_x == 0)
        {
            h->mb.mv_min[1] = 4*( -16*h->mb.i_mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( h->sps->i_mb_height - h->mb.i_mb_y - 1 ) + 24 );
            h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
            h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y - 1 ) + 8 );
            h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
            h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
        }
#undef CLIP_FMV

        a->l0.me16x16.cost =
        a->l0.i_cost8x8    = COST_MAX;

        for( i = 0; i < 4; i++ )
        {
            a->l0.i_cost4x4[i] =
            a->l0.i_cost8x4[i] =
            a->l0.i_cost4x8[i] = COST_MAX;
        }

        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        /* list 1, bidirectional and direct costs only exist in B slices */
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_cost8x8    = COST_MAX;

            for( i = 0; i < 4; i++ )
            {
                a->l1.i_cost4x4[i] =
                a->l1.i_cost8x4[i] =
                a->l1.i_cost4x8[i] =
                a->i_cost8x8direct[i] = COST_MAX;
            }

            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =

            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }

        /* Fast intra decision */
        /* never taken for the first five macroblocks of the slice */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* keep the full intra search when RD is on, when any of the four
             * neighbours is intra, when (P slices) the co-located macroblock
             * of the first list-0 reference is intra, or when the intra
             * macroblocks counted so far in the frame statistics exceed a
             * third of the macroblocks coded so far in this slice */
            if( a->b_mbrd
               || IS_INTRA( h->mb.i_mb_type_left )
               || IS_INTRA( h->mb.i_mb_type_top )
               || IS_INTRA( h->mb.i_mb_type_topleft )
               || IS_INTRA( h->mb.i_mb_type_topright )
               || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
               || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
    }
}
 289
 290
 291
 292 /*
 293  * Handle intra mb
 294  */
 295 /* Max = 4 */
 296 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 297 {
 298     if( i_neighbour & MB_TOPLEFT )
 299     {
 300         /* top and left avaible */
 301         *mode++ = I_PRED_16x16_V;
 302         *mode++ = I_PRED_16x16_H;
 303         *mode++ = I_PRED_16x16_DC;
 304         *mode++ = I_PRED_16x16_P;
 305         *pi_count = 4;
 306     }
 307     else if( i_neighbour & MB_LEFT )
 308     {
 309         /* left available*/
 310         *mode++ = I_PRED_16x16_DC_LEFT;
 311         *mode++ = I_PRED_16x16_H;
 312         *pi_count = 2;
 313     }
 314     else if( i_neighbour & MB_TOP )
 315     {
 316         /* top available*/
 317         *mode++ = I_PRED_16x16_DC_TOP;
 318         *mode++ = I_PRED_16x16_V;
 319         *pi_count = 2;
 320     }
 321     else
 322     {
 323         /* none avaible */
 324         *mode = I_PRED_16x16_DC_128;
 325         *pi_count = 1;
 326     }
 327 }
 328
 329 /* Max = 4 */
 330 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 331 {
 332     if( i_neighbour & MB_TOPLEFT )
 333     {
 334         /* top and left avaible */
 335         *mode++ = I_PRED_CHROMA_V;
 336         *mode++ = I_PRED_CHROMA_H;
 337         *mode++ = I_PRED_CHROMA_DC;
 338         *mode++ = I_PRED_CHROMA_P;
 339         *pi_count = 4;
 340     }
 341     else if( i_neighbour & MB_LEFT )
 342     {
 343         /* left available*/
 344         *mode++ = I_PRED_CHROMA_DC_LEFT;
 345         *mode++ = I_PRED_CHROMA_H;
 346         *pi_count = 2;
 347     }
 348     else if( i_neighbour & MB_TOP )
 349     {
 350         /* top available*/
 351         *mode++ = I_PRED_CHROMA_DC_TOP;
 352         *mode++ = I_PRED_CHROMA_V;
 353         *pi_count = 2;
 354     }
 355     else
 356     {
 357         /* none avaible */
 358         *mode = I_PRED_CHROMA_DC_128;
 359         *pi_count = 1;
 360     }
 361 }
 362
 363 /* MAX = 9 */
 364 static void predict_4x4_mode_available( unsigned int i_neighbour,
 365                                         int *mode, int *pi_count )
 366 {
 367     int b_l = i_neighbour & MB_LEFT;
 368     int b_t = i_neighbour & MB_TOP;
 369
 370     if( b_l && b_t )
 371     {
 372         *pi_count = 6;
 373         *mode++ = I_PRED_4x4_DC;
 374         *mode++ = I_PRED_4x4_H;
 375         *mode++ = I_PRED_4x4_V;
 376         *mode++ = I_PRED_4x4_DDL;
 377         if( i_neighbour & MB_TOPLEFT )
 378         {
 379             *mode++ = I_PRED_4x4_DDR;
 380             *mode++ = I_PRED_4x4_VR;
 381             *mode++ = I_PRED_4x4_HD;
 382             *pi_count += 3;
 383         }
 384         *mode++ = I_PRED_4x4_VL;
 385         *mode++ = I_PRED_4x4_HU;
 386     }
 387     else if( b_l )
 388     {
 389         *mode++ = I_PRED_4x4_DC_LEFT;
 390         *mode++ = I_PRED_4x4_H;
 391         *mode++ = I_PRED_4x4_HU;
 392         *pi_count = 3;
 393     }
 394     else if( b_t )
 395     {
 396         *mode++ = I_PRED_4x4_DC_TOP;
 397         *mode++ = I_PRED_4x4_V;
 398         *mode++ = I_PRED_4x4_DDL;
 399         *mode++ = I_PRED_4x4_VL;
 400         *pi_count = 4;
 401     }
 402     else
 403     {
 404         *mode++ = I_PRED_4x4_DC_128;
 405         *pi_count = 1;
 406     }
 407 }
 408
 409 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 410 {
 411     int i;
 412
 413     int i_max;
 414     int predict_mode[9];
 415
 416     uint8_t *p_dstc[2], *p_srcc[2];
 417
 418     if( a->i_sad_i8x8chroma < COST_MAX )
 419         return;
 420
 421     /* 8x8 prediction selection for chroma */
 422     p_dstc[0] = h->mb.pic.p_fdec[1];
 423     p_dstc[1] = h->mb.pic.p_fdec[2];
 424     p_srcc[0] = h->mb.pic.p_fenc[1];
 425     p_srcc[1] = h->mb.pic.p_fenc[2];
 426
 427     predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 428     a->i_sad_i8x8chroma = COST_MAX;
 429     for( i = 0; i < i_max; i++ )
 430     {
 431         int i_sad;
 432         int i_mode;
 433
 434         i_mode = predict_mode[i];
 435
 436         /* we do the prediction */
 437         h->predict_8x8c[i_mode]( p_dstc[0] );
 438         h->predict_8x8c[i_mode]( p_dstc[1] );
 439
 440         /* we calculate the cost */
 441         i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
 442                                           p_srcc[0], FENC_STRIDE ) +
 443                 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
 444                                           p_srcc[1], FENC_STRIDE ) +
 445                 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 446
 447         /* if i_score is lower it is better */
 448         if( a->i_sad_i8x8chroma > i_sad )
 449         {
 450             a->i_predict8x8chroma = i_mode;
 451             a->i_sad_i8x8chroma   = i_sad;
 452         }
 453     }
 454
 455     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 456 }
 457
/* Evaluate the intra luma macroblock types and store their costs in
 * a->i_sad_i16x16, a->i_sad_i4x4 and a->i_sad_i8x8 together with the chosen
 * prediction modes.
 *
 * I16x16 is always evaluated; I4x4 and I8x8 only when enabled in the
 * analyse flags of the current slice type.  The function predicts (and for
 * I4x4/I8x8 also encodes) directly into the fdec luma buffer.
 *
 * With a->b_mbrd set, a candidate whose SATD cost is below a threshold
 * derived from a->i_best_satd is replaced by its x264_rd_cost_mb() cost;
 * otherwise the SATD cost is scaled by f8_satd_rd_ratio.
 *
 * i_cost_inter is used to build that ratio and for the fast-intra early
 * exit; presumably it is the best inter cost found so far (callers are not
 * visible here -- confirm). */
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    uint8_t  *p_src = h->mb.pic.p_fenc[0];
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];
    int      f8_satd_rd_ratio = 0;  /* 8-bit fixed point, only set when b_mbrd */

    int i, idx;
    int i_max;
    int predict_mode[9];
    int i_satd_thresh;

    /* SATD costs at or above this threshold are not refined with real RD */
    if( h->sh.i_type == SLICE_TYPE_B )
        i_satd_thresh = a->i_best_satd * 9/8;
    else
        i_satd_thresh = a->i_best_satd * 5/4 + a->i_lambda * 10;

    /*---------------- Try all mode and calculate their score ---------------*/

    /* 16x16 prediction selection */
    predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
    for( i = 0; i < i_max; i++ )
    {
        int i_sad;
        int i_mode;

        i_mode = predict_mode[i];
        h->predict_16x16[i_mode]( p_dst );

        i_sad = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
        if( a->i_sad_i16x16 > i_sad )
        {
            a->i_predict16x16 = i_mode;
            a->i_sad_i16x16   = i_sad;
        }
    }

    if( a->b_mbrd )
    {
        /* ratio between i_cost_inter and the best SATD, used below to scale
         * SATD costs that are not refined with x264_rd_cost_mb() */
        f8_satd_rd_ratio = ((unsigned)i_cost_inter << 8) / a->i_best_satd + 1;
        x264_mb_analyse_intra_chroma( h, a );
        if( h->mb.b_chroma_me )
            a->i_sad_i16x16 += a->i_sad_i8x8chroma;
        if( a->i_sad_i16x16 < i_satd_thresh )
        {
            h->mb.i_type = I_16x16;
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            a->i_sad_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
        }
        else
            a->i_sad_i16x16 = a->i_sad_i16x16 * f8_satd_rd_ratio >> 8;
    }
    else
    {
        if( h->sh.i_type == SLICE_TYPE_B )
            /* cavlc mb type prefix */
            a->i_sad_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
        /* fast intra: skip I4x4/I8x8 when I16x16 already costs more than
         * twice i_cost_inter */
        if( a->b_fast_intra && a->i_sad_i16x16 > 2*i_cost_inter )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        a->i_sad_i4x4 = 0;
        for( idx = 0; idx < 16; idx++ )
        {
            uint8_t *p_src_by;
            uint8_t *p_dst_by;
            int     i_best;
            int x, y;
            int i_pred_mode;

            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
            x = block_idx_x[idx];
            y = block_idx_y[idx];

            p_src_by = p_src + 4 * x + 4 * y * FENC_STRIDE;
            p_dst_by = p_dst + 4 * x + 4 * y * FDEC_STRIDE;

            i_best = COST_MAX;
            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( i = 0; i < i_max; i++ )
            {
                int i_sad;
                int i_mode;

                i_mode = predict_mode[i];
                h->predict_4x4[i_mode]( p_dst_by );

                /* mode cost: lambda*1 when the mode equals the predicted
                 * mode, lambda*4 otherwise */
                i_sad = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
                                                  p_src_by, FENC_STRIDE )
                      + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);

                if( i_best > i_sad )
                {
                    a->i_predict4x4[x][y] = i_mode;
                    i_best = i_sad;
                }
            }
            a->i_sad_i4x4 += i_best;

            /* we need to encode this block now (for next ones) */
            h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by );
            x264_mb_encode_i4x4( h, idx, a->i_qp );

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
        }

        a->i_sad_i4x4 += a->i_lambda * 24;    /* from JVT (SATD0) */
        if( a->b_mbrd )
        {
            if( h->mb.b_chroma_me )
                a->i_sad_i4x4 += a->i_sad_i8x8chroma;
            if( a->i_sad_i4x4 < i_satd_thresh )
            {
                h->mb.i_type = I_4x4;
                a->i_sad_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
            }
            else
                a->i_sad_i4x4 = a->i_sad_i4x4 * f8_satd_rd_ratio >> 8;
        }
        else
        {
            if( h->sh.i_type == SLICE_TYPE_B )
                a->i_sad_i4x4 += a->i_lambda * i_mb_b_cost_table[I_4x4];
        }
    }

    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        a->i_sad_i8x8 = 0;
        for( idx = 0; idx < 4; idx++ )
        {
            uint8_t *p_src_by;
            uint8_t *p_dst_by;
            int     i_best;
            int x, y;
            int i_pred_mode;

            /* the 8x8 path reuses the 4x4 mode predictor and the 4x4 mode
             * list; 4*idx is the first 4x4 block of this 8x8 block */
            i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx );
            x = idx&1;
            y = idx>>1;

            p_src_by = p_src + 8 * x + 8 * y * FENC_STRIDE;
            p_dst_by = p_dst + 8 * x + 8 * y * FDEC_STRIDE;

            i_best = COST_MAX;
            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
            for( i = 0; i < i_max; i++ )
            {
                int i_sad;
                int i_mode;

                i_mode = predict_mode[i];
                h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );

                /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
                i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, FDEC_STRIDE,
                                                  p_src_by, FENC_STRIDE )
                      + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);

                if( i_best > i_sad )
                {
                    a->i_predict8x8[x][y] = i_mode;
                    i_best = i_sad;
                }
            }
            a->i_sad_i8x8 += i_best;

            /* we need to encode this block now (for next ones) */
            h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, h->mb.i_neighbour8[idx] );
            x264_mb_encode_i8x8( h, idx, a->i_qp );

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
        }

        if( a->b_mbrd )
        {
            if( h->mb.b_chroma_me )
                a->i_sad_i8x8 += a->i_sad_i8x8chroma;
            if( a->i_sad_i8x8 < i_satd_thresh )
            {
                h->mb.i_type = I_8x8;
                a->i_sad_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
            }
            else
                a->i_sad_i8x8 = a->i_sad_i8x8 * f8_satd_rd_ratio >> 8;
        }
        else
        {
            // FIXME some bias like in i4x4?
            if( h->sh.i_type == SLICE_TYPE_B )
                a->i_sad_i8x8 += a->i_lambda * i_mb_b_cost_table[I_8x8];
        }
    }
}
 662
/* Re-select the intra prediction mode(s) of the already chosen intra
 * macroblock type (h->mb.i_type) using rate-distortion costs instead of
 * SATD.
 *
 * I_16x16: every other available mode is costed with x264_rd_cost_mb() and
 *          a->i_predict16x16 is updated when one beats a->i_sad_i16x16.
 * I_4x4 / I_8x8: each block is costed per mode with x264_rd_cost_i4x4() /
 *          x264_rd_cost_i8x8(); the pixels and non-zero counts observed
 *          after the best mode are saved and written back after the loop.
 *          That save/restore implies the cost functions modify fdec and
 *          the non_zero_count cache -- presumably by encoding the block;
 *          confirm in rdo.c.
 *
 * NOTE(review): p_src_by and i_pred_mode are assigned but never read in
 * this function. */
static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t  *p_src = h->mb.pic.p_fenc[0];
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];

    int i, idx, x, y;
    int i_max, i_sad, i_best, i_mode;
    int i_pred_mode;
    int predict_mode[9];

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        i_best = a->i_sad_i16x16;
        predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
        for( i = 0; i < i_max; i++ )
        {
            /* the cost of the current mode is already in i_best */
            if( predict_mode[i] == old_pred_mode )
                continue;
            h->mb.i_intra16x16_pred_mode = predict_mode[i];
            i_sad = x264_rd_cost_mb( h, a->i_lambda2 );
            if( i_best > i_sad )
            {
                a->i_predict16x16 = predict_mode[i];
                i_best = i_sad;
            }
        }
        /* NOTE(review): h->mb.i_intra16x16_pred_mode is left at the last
         * mode tried, not the best one; presumably the caller copies
         * a->i_predict16x16 back afterwards -- confirm */
    }
    else if( h->mb.i_type == I_4x4 )
    {
        for( idx = 0; idx < 16; idx++ )
        {
            /* NOTE(review): pels is only written inside the "better cost"
             * branch below; this relies on at least one mode costing less
             * than COST_MAX */
            uint32_t pels[4];
            int i_nnz = 0;
            uint8_t *p_src_by;
            uint8_t *p_dst_by;
            i_best = COST_MAX;

            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
            x = block_idx_x[idx];
            y = block_idx_y[idx];

            p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
            p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                h->predict_4x4[i_mode]( p_dst_by );

                i_sad = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_sad )
                {
                    a->i_predict4x4[x][y] = i_mode;
                    i_best = i_sad;
                    /* remember the four rows of the block and its nnz as
                     * left behind by the best mode so far */
                    pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
                    pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
                    pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
                    pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                }
            }

            /* put back the state of the winning mode for the next blocks */
            *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
            *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
            *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
            *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        for( idx = 0; idx < 4; idx++ )
        {
            /* NOTE(review): pels_v and i_nnz are only written inside the
             * "better cost" branch below; this relies on at least one mode
             * costing less than COST_MAX */
            uint64_t pels_h = 0;
            uint8_t pels_v[7];
            int i_nnz[3];
            uint8_t *p_src_by;
            uint8_t *p_dst_by;
            int j;
            i_best = COST_MAX;

            i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx );
            x = idx&1;
            y = idx>>1;

            p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );

                i_sad = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_sad )
                {
                    a->i_predict8x8[x][y] = i_mode;
                    i_best = i_sad;

                    /* only the edges are kept: the bottom row (row 7) and,
                     * for the left-hand blocks, rows 0..6 of the right-most
                     * column -- presumably the only pixels later blocks
                     * predict from; confirm */
                    pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
                    if( !(idx&1) )
                        for( j=0; j<7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    /* nnz of 4x4 blocks 4*idx+1 .. 4*idx+3 */
                    for( j=0; j<3; j++ )
                        i_nnz[j] = h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]];
                }
            }

            /* put back the saved edge pixels and nnz of the winning mode */
            *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
            if( !(idx&1) )
                for( j=0; j<7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            for( j=0; j<3; j++ )
                h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]] = i_nnz[j];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
        }
    }
}
 792
/* Helper macros for setting up an x264_me_t before a motion search.
 * They expand to several statements (not wrapped in do/while), so they must
 * not be used as the body of an unbraced if/else/loop, and they reference
 * variables named h (and a, for REF_COST) from the expansion site. */

/* LOAD_FENC: copy the picture strides from h->mb.pic into m and point
 * m->p_fenc[0..2] at offset (xoff,yoff) of the source planes; planes 1 and
 * 2 use the offsets halved.  All three planes are addressed with
 * FENC_STRIDE. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

/* LOAD_HPELS: point m->p_fref[0..5] and m->integral at offset (xoff,yoff)
 * of reference ref in list.  Planes 0..3 use m->i_stride[0]; planes 4 and 5
 * use halved offsets and m->i_stride[1].  m->i_stride must already be set,
 * e.g. by LOAD_FENC. */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];

/* REF_COST: lambda times the bs_size_te() size of reference index ref,
 * given the number of active references in the list.  list must be a
 * literal 0 or 1 because it is token-pasted into the slice header field
 * name. */
#define REF_COST(list, ref) \
    (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref ))
 811
/* 16x16 motion search for a P macroblock over every list-0 reference.
 *
 * The best result (cost including the reference index cost) is stored in
 * a->l0.me16x16, the motion vector found for each reference is saved in
 * a->l0.mvc[ref][0] and h->mb.mvr, and the reference cache is updated.
 * h->mb.i_type is set to P_L0, or to P_SKIP when the early termination on
 * reference 0 succeeds; in that case the function calls
 * x264_analyse_update_cache() and returns without searching further
 * references.  With a->b_mbrd set, a->i_best_satd keeps the search cost and
 * a->l0.me16x16.cost is replaced by the x264_rd_cost_mb() cost. */
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_ref;
    int mvc[7][2], i_mvc;
    int i_halfpel_thresh = INT_MAX;
    /* the threshold is only passed to the search when there is more than
     * one reference */
    int *p_halfpel_thresh = h->i_ref0>1 ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    m.p_cost_mv = a->p_cost_mv;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
    {
        const int i_ref_cost = REF_COST( 0, i_ref );
        /* the ref cost is taken off the threshold for the duration of the
         * search and added back afterwards; m.cost only gets the ref cost
         * added after the search */
        i_halfpel_thresh -= i_ref_cost;
        m.i_ref_cost = i_ref_cost;
        m.i_ref = i_ref;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0 && a->b_try_pskip && m.cost-m.cost_mv < 300*a->i_lambda )
        {
            int mvskip[2];
            x264_mb_predict_mv_pskip( h, mvskip );
            /* accept P_SKIP only if the found mv is within one unit (L1
             * distance) of the skip mv and the skip probe succeeds */
            if( abs(m.mv[0]-mvskip[0]) + abs(m.mv[1]-mvskip[1]) <= 1
                && x264_macroblock_probe_pskip( h ) )
            {
                h->mb.i_type = P_SKIP;
                x264_analyse_update_cache( h, a );
                return;
            }
        }

        m.cost += i_ref_cost;
        i_halfpel_thresh += i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            a->l0.me16x16 = m;

        /* save mv for predicting neighbors */
        a->l0.mvc[i_ref][0][0] =
        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
        a->l0.mvc[i_ref][0][1] =
        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );

    h->mb.i_type = P_L0;
    if( a->b_mbrd )
    {
        /* keep the search cost as the reference for the intra thresholds,
         * then replace the 16x16 cost with the RD cost of the macroblock */
        a->i_best_satd = a->l0.me16x16.cost;
        h->mb.i_partition = D_16x16;
        x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
        a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}
 878
 879 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
 880 {
 881     x264_me_t m;
 882     int i_ref;
 883     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 884     int i_halfpel_thresh = INT_MAX;
 885     int *p_halfpel_thresh = /*h->i_ref0>1 ? &i_halfpel_thresh : */NULL;
 886     int i;
 887     int i_maxref = h->i_ref0-1;
 888
 889     h->mb.i_partition = D_8x8;
 890
 891     /* early termination: if 16x16 chose ref 0, then evalute no refs older
 892      * than those used by the neighbors */
 893     if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
 894         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
 895     {
 896         i_maxref = 0;
 897         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
 898         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
 899         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
 900         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
 901         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
 902         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
 903     }
 904
 905     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
 906     {
 907          a->l0.mvc[i_ref][0][0] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0];
 908          a->l0.mvc[i_ref][0][1] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1];
 909     }
 910
 911     for( i = 0; i < 4; i++ )
 912     {
 913         x264_me_t *l0m = &a->l0.me8x8[i];
 914         const int x8 = i%2;
 915         const int y8 = i/2;
 916
 917         m.i_pixel = PIXEL_8x8;
 918         m.p_cost_mv = a->p_cost_mv;
 919
 920         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
 921         l0m->cost = INT_MAX;
 922         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
 923         {
 924              const int i_ref_cost = REF_COST( 0, i_ref );
 925              i_halfpel_thresh -= i_ref_cost;
 926              m.i_ref_cost = i_ref_cost;
 927              m.i_ref = i_ref;
 928
 929              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
 930              x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
 931              x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
 932              x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
 933
 934              m.cost += i_ref_cost;
 935              i_halfpel_thresh += i_ref_cost;
 936              *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv;
 937
 938              if( m.cost < l0m->cost )
 939                  *l0m = m;
 940         }
 941         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv[0], l0m->mv[1] );
 942         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
 943
 944         /* mb type cost */
 945         l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 946     }
 947
 948     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
 949                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
 950     if( a->b_mbrd )
 951     {
 952         if( a->i_best_satd > a->l0.i_cost8x8 )
 953             a->i_best_satd = a->l0.i_cost8x8;
 954         h->mb.i_type = P_8x8;
 955         h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
 956         h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
 957         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 958     }
 959 }
 960
 961 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 962 {
 963     const int i_ref = a->l0.me16x16.i_ref;
 964     const int i_ref_cost = REF_COST( 0, i_ref );
 965     uint8_t  **p_fref = h->mb.pic.p_fref[0][i_ref];
 966     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 967     int i_mvc;
 968     int (*mvc)[2] = a->l0.mvc[i_ref];
 969     int i;
 970
 971     /* XXX Needed for x264_mb_predict_mv */
 972     h->mb.i_partition = D_8x8;
 973
 974     i_mvc = 1;
 975     *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.me16x16.mv;
 976
 977     for( i = 0; i < 4; i++ )
 978     {
 979         x264_me_t *m = &a->l0.me8x8[i];
 980         const int x8 = i%2;
 981         const int y8 = i/2;
 982
 983         m->i_pixel = PIXEL_8x8;
 984         m->p_cost_mv = a->p_cost_mv;
 985         m->i_ref_cost = i_ref_cost;
 986         m->i_ref = i_ref;
 987
 988         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
 989         LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
 990         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 991         x264_me_search( h, m, mvc, i_mvc );
 992
 993         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
 994
 995         *(uint64_t*)mvc[i_mvc] = *(uint64_t*)m->mv;
 996         i_mvc++;
 997
 998         /* mb type cost */
 999         m->cost += i_ref_cost;
1000         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1001     }
1002
1003     /* theoretically this should include 4*ref_cost,
1004      * but 3 seems a better approximation of cabac. */
1005     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1006                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost -
1007                       REF_COST( 0, a->l0.me16x16.i_ref );
1008     if( a->b_mbrd )
1009     {
1010         if( a->i_best_satd > a->l0.i_cost8x8 )
1011             a->i_best_satd = a->l0.i_cost8x8;
1012         h->mb.i_type = P_8x8;
1013         h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1014         h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1015         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1016     }
1017 }
1018
1019 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1020 {
1021     x264_me_t m;
1022     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1023     int mvc[3][2];
1024     int i, j;
1025
1026     /* XXX Needed for x264_mb_predict_mv */
1027     h->mb.i_partition = D_16x8;
1028
1029     for( i = 0; i < 2; i++ )
1030     {
1031         x264_me_t *l0m = &a->l0.me16x8[i];
1032         const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1033         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1034
1035         m.i_pixel = PIXEL_16x8;
1036         m.p_cost_mv = a->p_cost_mv;
1037
1038         LOAD_FENC( &m, p_fenc, 0, 8*i );
1039         l0m->cost = INT_MAX;
1040         for( j = 0; j < i_ref8s; j++ )
1041         {
1042              const int i_ref = ref8[j];
1043              const int i_ref_cost = REF_COST( 0, i_ref );
1044              m.i_ref_cost = i_ref_cost;
1045              m.i_ref = i_ref;
1046
1047              /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1048              *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
1049              *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][2*i+1];
1050              *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][2*i+2];
1051
1052              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1053              x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1054              x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1055              x264_me_search( h, &m, mvc, 3 );
1056
1057              m.cost += i_ref_cost;
1058
1059              if( m.cost < l0m->cost )
1060                  *l0m = m;
1061         }
1062         x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, l0m->mv[0], l0m->mv[1] );
1063         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1064     }
1065
1066     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1067     if( a->b_mbrd )
1068     {
1069         if( a->i_best_satd > a->l0.i_cost16x8 )
1070             a->i_best_satd = a->l0.i_cost16x8;
1071         h->mb.i_type = P_L0;
1072         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1073     }
1074 }
1075
1076 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1077 {
1078     x264_me_t m;
1079     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1080     int mvc[3][2];
1081     int i, j;
1082
1083     /* XXX Needed for x264_mb_predict_mv */
1084     h->mb.i_partition = D_8x16;
1085
1086     for( i = 0; i < 2; i++ )
1087     {
1088         x264_me_t *l0m = &a->l0.me8x16[i];
1089         const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1090         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1091
1092         m.i_pixel = PIXEL_8x16;
1093         m.p_cost_mv = a->p_cost_mv;
1094
1095         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1096         l0m->cost = INT_MAX;
1097         for( j = 0; j < i_ref8s; j++ )
1098         {
1099              const int i_ref = ref8[j];
1100              const int i_ref_cost = REF_COST( 0, i_ref );
1101              m.i_ref_cost = i_ref_cost;
1102              m.i_ref = i_ref;
1103
1104              *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
1105              *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][i+1];
1106              *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][i+3];
1107
1108              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1109              x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1110              x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1111              x264_me_search( h, &m, mvc, 3 );
1112
1113              m.cost += i_ref_cost;
1114
1115              if( m.cost < l0m->cost )
1116                  *l0m = m;
1117         }
1118         x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, l0m->mv[0], l0m->mv[1] );
1119         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1120     }
1121
1122     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1123     if( a->b_mbrd )
1124     {
1125         if( a->i_best_satd > a->l0.i_cost8x16 )
1126             a->i_best_satd = a->l0.i_cost8x16;
1127         h->mb.i_type = P_L0;
1128         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1129     }
1130 }
1131
1132 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1133 {
1134     DECLARE_ALIGNED( uint8_t, pix1[8*8], 8 );
1135     DECLARE_ALIGNED( uint8_t, pix2[8*8], 8 );
1136     const int i_stride = h->mb.pic.i_stride[1];
1137     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1138     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1139
1140 #define CHROMA4x4MC( width, height, me, x, y ) \
1141     h->mc.mc_chroma( &p_fref[4][or+x+y*i_stride], i_stride, &pix1[x+y*8], 8, (me).mv[0], (me).mv[1], width, height ); \
1142     h->mc.mc_chroma( &p_fref[5][or+x+y*i_stride], i_stride, &pix2[x+y*8], 8, (me).mv[0], (me).mv[1], width, height );
1143
1144     if( pixel == PIXEL_4x4 )
1145     {
1146         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
1147         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 0,2 );
1148         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 2,0 );
1149         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
1150     }
1151     else if( pixel == PIXEL_8x4 )
1152     {
1153         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
1154         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
1155     }
1156     else
1157     {
1158         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
1159         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
1160     }
1161
1162     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 8 )
1163          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 8 );
1164 }
1165
1166 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1167 {
1168     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1169     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1170     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1171     int i4x4;
1172
1173     /* XXX Needed for x264_mb_predict_mv */
1174     h->mb.i_partition = D_8x8;
1175
1176     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1177     {
1178         const int idx = 4*i8x8 + i4x4;
1179         const int x4 = block_idx_x[idx];
1180         const int y4 = block_idx_y[idx];
1181         const int i_mvc = (i4x4 == 0);
1182
1183         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1184
1185         m->i_pixel = PIXEL_4x4;
1186         m->p_cost_mv = a->p_cost_mv;
1187
1188         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1189         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1190
1191         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1192         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1193
1194         x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
1195     }
1196     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1197                             a->l0.me4x4[i8x8][1].cost +
1198                             a->l0.me4x4[i8x8][2].cost +
1199                             a->l0.me4x4[i8x8][3].cost +
1200                             REF_COST( 0, i_ref ) +
1201                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1202     if( h->mb.b_chroma_me )
1203         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1204 }
1205
1206 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1207 {
1208     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1209     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1210     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1211     int i8x4;
1212
1213     /* XXX Needed for x264_mb_predict_mv */
1214     h->mb.i_partition = D_8x8;
1215
1216     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1217     {
1218         const int idx = 4*i8x8 + 2*i8x4;
1219         const int x4 = block_idx_x[idx];
1220         const int y4 = block_idx_y[idx];
1221         const int i_mvc = (i8x4 == 0);
1222
1223         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1224
1225         m->i_pixel = PIXEL_8x4;
1226         m->p_cost_mv = a->p_cost_mv;
1227
1228         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1229         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1230
1231         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1232         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1233
1234         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
1235     }
1236     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1237                             REF_COST( 0, i_ref ) +
1238                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1239     if( h->mb.b_chroma_me )
1240         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1241 }
1242
1243 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1244 {
1245     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1246     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1247     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1248     int i4x8;
1249
1250     /* XXX Needed for x264_mb_predict_mv */
1251     h->mb.i_partition = D_8x8;
1252
1253     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1254     {
1255         const int idx = 4*i8x8 + i4x8;
1256         const int x4 = block_idx_x[idx];
1257         const int y4 = block_idx_y[idx];
1258         const int i_mvc = (i4x8 == 0);
1259
1260         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1261
1262         m->i_pixel = PIXEL_4x8;
1263         m->p_cost_mv = a->p_cost_mv;
1264
1265         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1266         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1267
1268         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1269         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1270
1271         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
1272     }
1273     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1274                             REF_COST( 0, i_ref ) +
1275                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1276     if( h->mb.b_chroma_me )
1277         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1278 }
1279
1280 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1281 {
1282     /* Assumes that fdec still contains the results of
1283      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1284
1285     uint8_t **p_fenc = h->mb.pic.p_fenc;
1286     uint8_t **p_fdec = h->mb.pic.p_fdec;
1287     int i;
1288
1289     a->i_cost16x16direct = 0;
1290     for( i = 0; i < 4; i++ )
1291     {
1292         const int x = (i&1)*8;
1293         const int y = (i>>1)*8;
1294         a->i_cost16x16direct +=
1295         a->i_cost8x8direct[i] =
1296             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1297
1298         /* mb type cost */
1299         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1300     }
1301     a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1302
1303     if( a->b_mbrd )
1304     {
1305         if( a->i_cost16x16direct < a->i_best_satd )
1306             a->i_best_satd = a->i_cost16x16direct;
1307
1308         h->mb.i_type = B_DIRECT;
1309         a->i_cost16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
1310     }
1311 }
1312
/* Average src2 into pix1 for a block of the given size. With weighted
 * bi-prediction enabled the weight is looked up from the 16x16 refs of both
 * lists; otherwise the plain average is used.
 * Expects `h` and `a` to be visible at the point of use. */
#define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
    { \
        if( !h->param.analyse.b_weighted_bipred ) \
            h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
        else \
            h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
                    h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
    }
1321
1322 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1323 {
1324     uint8_t pix1[16*16], pix2[16*16];
1325     uint8_t *src2;
1326     int stride2 = 16;
1327     int weight;
1328
1329     x264_me_t m;
1330     int i_ref;
1331     int mvc[8][2], i_mvc;
1332     int i_halfpel_thresh = INT_MAX;
1333     int *p_halfpel_thresh = h->i_ref0>1 ? &i_halfpel_thresh : NULL;
1334
1335     /* 16x16 Search on all ref frame */
1336     m.i_pixel = PIXEL_16x16;
1337     m.p_cost_mv = a->p_cost_mv;
1338     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1339
1340     /* ME for List 0 */
1341     a->l0.me16x16.cost = INT_MAX;
1342     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
1343     {
1344         /* search with ref */
1345         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1346         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1347         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1348         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1349
1350         /* add ref cost */
1351         m.cost += REF_COST( 0, i_ref );
1352
1353         if( m.cost < a->l0.me16x16.cost )
1354         {
1355             a->l0.i_ref = i_ref;
1356             a->l0.me16x16 = m;
1357         }
1358
1359         /* save mv for predicting neighbors */
1360         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
1361         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
1362     }
1363     /* subtract ref cost, so we don't have to add it for the other MB types */
1364     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
1365
1366     /* ME for list 1 */
1367     i_halfpel_thresh = INT_MAX;
1368     p_halfpel_thresh = h->i_ref1>1 ? &i_halfpel_thresh : NULL;
1369     a->l1.me16x16.cost = INT_MAX;
1370     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
1371     {
1372         /* search with ref */
1373         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1374         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1375         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1376         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1377
1378         /* add ref cost */
1379         m.cost += REF_COST( 1, i_ref );
1380
1381         if( m.cost < a->l1.me16x16.cost )
1382         {
1383             a->l1.i_ref = i_ref;
1384             a->l1.me16x16 = m;
1385         }
1386
1387         /* save mv for predicting neighbors */
1388         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
1389         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
1390     }
1391     /* subtract ref cost, so we don't have to add it for the other MB types */
1392     a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1393
1394     /* Set global ref, needed for other modes? */
1395     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1396     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1397
1398     /* get cost of BI mode */
1399     weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
1400     if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
1401     {
1402         /* l0 reference is halfpel, so get_ref on it will make it faster */
1403         src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1404                         pix2, &stride2,
1405                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
1406                         16, 16 );
1407         h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1408                         pix1, 16,
1409                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1410                         16, 16 );
1411         weight = 64 - weight;
1412     }
1413     else
1414     {
1415         /* if l0 was qpel, we'll use get_ref on l1 instead */
1416         h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1417                         pix1, 16,
1418                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
1419                         16, 16 );
1420         src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1421                         pix2, &stride2,
1422                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1423                         16, 16 );
1424     }
1425
1426     if( h->param.analyse.b_weighted_bipred )
1427         h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2, weight );
1428     else
1429         h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
1430
1431     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
1432                      + REF_COST( 0, a->l0.i_ref )
1433                      + REF_COST( 1, a->l1.i_ref )
1434                      + a->l0.me16x16.cost_mv
1435                      + a->l1.me16x16.cost_mv;
1436
1437     /* mb type cost */
1438     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1439     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1440     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1441
1442     if( a->b_mbrd )
1443     {
1444         int i_satd_thresh;
1445
1446         if( a->l0.me16x16.cost < a->i_best_satd )
1447             a->i_best_satd = a->l0.me16x16.cost;
1448         if( a->l1.me16x16.cost < a->i_best_satd )
1449             a->i_best_satd = a->l1.me16x16.cost;
1450         if( a->i_cost16x16bi < a->i_best_satd )
1451             a->i_best_satd = a->i_cost16x16bi;
1452
1453         i_satd_thresh = a->i_best_satd * 3/2;
1454
1455         h->mb.i_partition = D_16x16;
1456         /* L0 */
1457         if( a->l0.me16x16.cost < i_satd_thresh )
1458         {
1459             h->mb.i_type = B_L0_L0;
1460             x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
1461             a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
1462         }
1463         else
1464             a->l0.me16x16.cost = COST_MAX;
1465
1466         /* L1 */
1467         if( a->l1.me16x16.cost < i_satd_thresh )
1468         {
1469             h->mb.i_type = B_L1_L1;
1470             x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
1471             a->l1.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
1472         }
1473         else
1474             a->l1.me16x16.cost = COST_MAX;
1475
1476         /* BI */
1477         if( a->i_cost16x16bi < i_satd_thresh )
1478         {
1479             h->mb.i_type = B_BI_BI;
1480             a->i_cost16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
1481         }
1482         else
1483             a->i_cost16x16bi = COST_MAX;
1484     }
1485 }
1486
1487 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1488 {
1489     const int x = 2*(i%2);
1490     const int y = 2*(i/2);
1491
1492     switch( h->mb.i_sub_partition[i] )
1493     {
1494         case D_L0_8x8:
1495             x264_macroblock_cache_mv( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
1496             break;
1497         case D_L0_8x4:
1498             x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv[0], a->l0.me8x4[i][0].mv[1] );
1499             x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv[0], a->l0.me8x4[i][1].mv[1] );
1500             break;
1501         case D_L0_4x8:
1502             x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv[0], a->l0.me4x8[i][0].mv[1] );
1503             x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv[0], a->l0.me4x8[i][1].mv[1] );
1504             break;
1505         case D_L0_4x4:
1506             x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv[0], a->l0.me4x4[i][0].mv[1] );
1507             x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv[0], a->l0.me4x4[i][1].mv[1] );
1508             x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv[0], a->l0.me4x4[i][2].mv[1] );
1509             x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv[0], a->l0.me4x4[i][3].mv[1] );
1510             break;
1511         default:
1512             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1513             break;
1514     }
1515 }
1516
1517 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1518     if( x264_mb_partition_listX_table[0][part] ) \
1519     { \
1520         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1521         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, me0.mv[0], me0.mv[1] ); \
1522     } \
1523     else \
1524     { \
1525         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1526         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0, 0 ); \
1527         if( b_mvd ) \
1528             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
1529     } \
1530     if( x264_mb_partition_listX_table[1][part] ) \
1531     { \
1532         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1533         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, me1.mv[0], me1.mv[1] ); \
1534     } \
1535     else \
1536     { \
1537         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1538         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0, 0 ); \
1539         if( b_mvd ) \
1540             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
1541     }
1542
1543 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1544 {
1545     int x = (i%2)*2;
1546     int y = (i/2)*2;
1547     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1548     {
1549         x264_mb_load_mv_direct8x8( h, i );
1550         if( b_mvd )
1551         {
1552             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0, 0 );
1553             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0, 0 );
1554             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1555         }
1556     }
1557     else
1558     {
1559         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1560     }
1561 }
1562 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1563 {
1564     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1565 }
1566 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1567 {
1568     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1569 }
1570 #undef CACHE_MV_BI
1571
1572 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1573 {
1574     uint8_t **p_fref[2] =
1575         { h->mb.pic.p_fref[0][a->l0.i_ref],
1576           h->mb.pic.p_fref[1][a->l1.i_ref] };
1577     uint8_t pix[2][8*8];
1578     int i, l;
1579
1580     /* XXX Needed for x264_mb_predict_mv */
1581     h->mb.i_partition = D_8x8;
1582
1583     a->i_cost8x8bi = 0;
1584
1585     for( i = 0; i < 4; i++ )
1586     {
1587         const int x8 = i%2;
1588         const int y8 = i/2;
1589         int i_part_cost;
1590         int i_part_cost_bi = 0;
1591
1592         for( l = 0; l < 2; l++ )
1593         {
1594             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1595             x264_me_t *m = &lX->me8x8[i];
1596
1597             m->i_pixel = PIXEL_8x8;
1598             m->p_cost_mv = a->p_cost_mv;
1599
1600             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1601             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1602
1603             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1604             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1605
1606             x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
1607
1608             /* BI mode */
1609             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1610                             m->mv[0], m->mv[1], 8, 8 );
1611             i_part_cost_bi += m->cost_mv;
1612             /* FIXME: ref cost */
1613         }
1614
1615         WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
1616         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1617                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1618         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1619         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1620
1621         i_part_cost = a->l0.me8x8[i].cost;
1622         h->mb.i_sub_partition[i] = D_L0_8x8;
1623         if( a->l1.me8x8[i].cost < i_part_cost )
1624         {
1625             i_part_cost = a->l1.me8x8[i].cost;
1626             h->mb.i_sub_partition[i] = D_L1_8x8;
1627         }
1628         if( i_part_cost_bi < i_part_cost )
1629         {
1630             i_part_cost = i_part_cost_bi;
1631             h->mb.i_sub_partition[i] = D_BI_8x8;
1632         }
1633         if( a->i_cost8x8direct[i] < i_part_cost )
1634         {
1635             i_part_cost = a->i_cost8x8direct[i];
1636             h->mb.i_sub_partition[i] = D_DIRECT_8x8;
1637         }
1638         a->i_cost8x8bi += i_part_cost;
1639
1640         /* XXX Needed for x264_mb_predict_mv */
1641         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1642     }
1643
1644     /* mb type cost */
1645     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1646
1647     if( a->b_mbrd )
1648     {
1649         if( a->i_cost8x8bi < a->i_best_satd )
1650             a->i_best_satd = a->i_cost8x8bi;
1651
1652         if( a->i_cost8x8bi < a->i_best_satd * 3/2 )
1653         {
1654             h->mb.i_type = B_8x8;
1655             h->mb.i_partition = D_8x8;
1656             a->i_cost8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
1657         }
1658         else
1659             a->i_cost8x8bi = COST_MAX;
1660     }
1661 }
1662
1663 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1664 {
1665     uint8_t **p_fref[2] =
1666         { h->mb.pic.p_fref[0][a->l0.i_ref],
1667           h->mb.pic.p_fref[1][a->l1.i_ref] };
1668     DECLARE_ALIGNED( uint8_t,  pix[2][16*8], 16 );
1669     int mvc[2][2];
1670     int i, l;
1671
1672     h->mb.i_partition = D_16x8;
1673     a->i_cost16x8bi = 0;
1674
1675     for( i = 0; i < 2; i++ )
1676     {
1677         int i_part_cost;
1678         int i_part_cost_bi = 0;
1679
1680         /* TODO: check only the list(s) that were used in b8x8? */
1681         for( l = 0; l < 2; l++ )
1682         {
1683             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1684             x264_me_t *m = &lX->me16x8[i];
1685
1686             m->i_pixel = PIXEL_16x8;
1687             m->p_cost_mv = a->p_cost_mv;
1688
1689             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1690             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1691
1692             mvc[0][0] = lX->me8x8[2*i].mv[0];
1693             mvc[0][1] = lX->me8x8[2*i].mv[1];
1694             mvc[1][0] = lX->me8x8[2*i+1].mv[0];
1695             mvc[1][1] = lX->me8x8[2*i+1].mv[1];
1696
1697             x264_mb_predict_mv( h, 0, 8*i, 2, m->mvp );
1698             x264_me_search( h, m, mvc, 2 );
1699
1700             /* BI mode */
1701             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
1702                             m->mv[0], m->mv[1], 16, 8 );
1703             /* FIXME: ref cost */
1704             i_part_cost_bi += m->cost_mv;
1705         }
1706
1707         WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
1708         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1709
1710         i_part_cost = a->l0.me16x8[i].cost;
1711         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1712         if( a->l1.me16x8[i].cost < i_part_cost )
1713         {
1714             i_part_cost = a->l1.me16x8[i].cost;
1715             a->i_mb_partition16x8[i] = D_L1_8x8;
1716         }
1717         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1718         {
1719             i_part_cost = i_part_cost_bi;
1720             a->i_mb_partition16x8[i] = D_BI_8x8;
1721         }
1722         a->i_cost16x8bi += i_part_cost;
1723
1724         x264_mb_cache_mv_b16x8( h, a, i, 0 );
1725     }
1726
1727     /* mb type cost */
1728     a->i_mb_type16x8 = B_L0_L0
1729         + (a->i_mb_partition16x8[0]>>2) * 3
1730         + (a->i_mb_partition16x8[1]>>2);
1731     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1732
1733     if( a->b_mbrd )
1734     {
1735         if( a->i_cost16x8bi < a->i_best_satd )
1736             a->i_best_satd = a->i_cost16x8bi;
1737
1738         if( a->i_cost16x8bi < a->i_best_satd * 3/2 )
1739         {
1740             h->mb.i_type = a->i_mb_type16x8;
1741             h->mb.i_partition = D_16x8;
1742             a->i_cost16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
1743         }
1744         else
1745             a->i_cost16x8bi = COST_MAX;
1746     }
1747 }
1748 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1749 {
1750     uint8_t **p_fref[2] =
1751         { h->mb.pic.p_fref[0][a->l0.i_ref],
1752           h->mb.pic.p_fref[1][a->l1.i_ref] };
1753     uint8_t pix[2][8*16];
1754     int mvc[2][2];
1755     int i, l;
1756
1757     h->mb.i_partition = D_8x16;
1758     a->i_cost8x16bi = 0;
1759
1760     for( i = 0; i < 2; i++ )
1761     {
1762         int i_part_cost;
1763         int i_part_cost_bi = 0;
1764
1765         for( l = 0; l < 2; l++ )
1766         {
1767             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1768             x264_me_t *m = &lX->me8x16[i];
1769
1770             m->i_pixel = PIXEL_8x16;
1771             m->p_cost_mv = a->p_cost_mv;
1772
1773             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1774             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
1775
1776             mvc[0][0] = lX->me8x8[i].mv[0];
1777             mvc[0][1] = lX->me8x8[i].mv[1];
1778             mvc[1][0] = lX->me8x8[i+2].mv[0];
1779             mvc[1][1] = lX->me8x8[i+2].mv[1];
1780
1781             x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1782             x264_me_search( h, m, mvc, 2 );
1783
1784             /* BI mode */
1785             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1786                             m->mv[0], m->mv[1], 8, 16 );
1787             /* FIXME: ref cost */
1788             i_part_cost_bi += m->cost_mv;
1789         }
1790
1791         WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
1792         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1793
1794         i_part_cost = a->l0.me8x16[i].cost;
1795         a->i_mb_partition8x16[i] = D_L0_8x8;
1796         if( a->l1.me8x16[i].cost < i_part_cost )
1797         {
1798             i_part_cost = a->l1.me8x16[i].cost;
1799             a->i_mb_partition8x16[i] = D_L1_8x8;
1800         }
1801         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1802         {
1803             i_part_cost = i_part_cost_bi;
1804             a->i_mb_partition8x16[i] = D_BI_8x8;
1805         }
1806         a->i_cost8x16bi += i_part_cost;
1807
1808         x264_mb_cache_mv_b8x16( h, a, i, 0 );
1809     }
1810
1811     /* mb type cost */
1812     a->i_mb_type8x16 = B_L0_L0
1813         + (a->i_mb_partition8x16[0]>>2) * 3
1814         + (a->i_mb_partition8x16[1]>>2);
1815     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
1816
1817     if( a->b_mbrd )
1818     {
1819         if( a->i_cost8x16bi < a->i_best_satd )
1820             a->i_best_satd = a->i_cost8x16bi;
1821
1822         if( a->i_cost8x16bi < a->i_best_satd * 3/2 )
1823         {
1824             h->mb.i_type = a->i_mb_type8x16;
1825             h->mb.i_partition = D_8x16;
1826             a->i_cost8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
1827         }
1828         else
1829             a->i_cost8x16bi = COST_MAX;
1830     }
1831 }
1832
1833 static void refine_bidir( x264_t *h, x264_mb_analysis_t *a )
1834 {
1835     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
1836     int i;
1837
1838     switch( h->mb.i_partition )
1839     {
1840     case D_16x16:
1841         if( h->mb.i_type == B_BI_BI )
1842             x264_me_refine_bidir( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
1843         break;
1844     case D_16x8:
1845         for( i=0; i<2; i++ )
1846             if( a->i_mb_partition16x8[i] == D_BI_8x8 )
1847                 x264_me_refine_bidir( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
1848         break;
1849     case D_8x16:
1850         for( i=0; i<2; i++ )
1851             if( a->i_mb_partition8x16[i] == D_BI_8x8 )
1852                 x264_me_refine_bidir( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
1853         break;
1854     case D_8x8:
1855         for( i=0; i<4; i++ )
1856             if( h->mb.i_sub_partition[i] == D_BI_8x8 )
1857                 x264_me_refine_bidir( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
1858         break;
1859     }
1860 }
1861
1862 static inline void x264_mb_analyse_transform( x264_t *h )
1863 {
1864     h->mb.cache.b_transform_8x8_allowed =
1865         h->param.analyse.b_transform_8x8
1866         && !IS_INTRA( h->mb.i_type ) && x264_mb_transform_8x8_allowed( h );
1867
1868     if( h->mb.cache.b_transform_8x8_allowed )
1869     {
1870         int i_cost4, i_cost8;
1871         /* FIXME only luma mc is needed */
1872         x264_mb_mc( h );
1873
1874         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
1875                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
1876         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
1877                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
1878
1879         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
1880     }
1881 }
1882
1883 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_cost )
1884 {
1885     h->mb.cache.b_transform_8x8_allowed =
1886         h->param.analyse.b_transform_8x8 && x264_mb_transform_8x8_allowed( h );
1887
1888     if( h->mb.cache.b_transform_8x8_allowed )
1889     {
1890         int i_cost8;
1891         x264_analyse_update_cache( h, a );
1892         h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
1893         /* FIXME only luma is needed, but the score for comparison already includes chroma */
1894         i_cost8 = x264_rd_cost_mb( h, a->i_lambda2 );
1895
1896         if( *i_cost >= i_cost8 )
1897         {
1898             if( *i_cost > 0 )
1899                 a->i_best_satd = (int64_t)a->i_best_satd * i_cost8 / *i_cost;
1900             /* prevent a rare division by zero in x264_mb_analyse_intra */
1901             if( a->i_best_satd == 0 )
1902                 a->i_best_satd = 1;
1903
1904             *i_cost = i_cost8;
1905         }
1906         else
1907             h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
1908     }
1909 }
1910
1911
1912 /*****************************************************************************
1913  * x264_macroblock_analyse:
1914  *****************************************************************************/
1915 void x264_macroblock_analyse( x264_t *h )
1916 {
1917     x264_mb_analysis_t analysis;
1918     int i_cost = COST_MAX;
1919     int i;
1920
1921     /* init analysis */
1922     x264_mb_analyse_init( h, &analysis, x264_ratecontrol_qp( h ) );
1923
1924     /*--------------------------- Do the analysis ---------------------------*/
1925     if( h->sh.i_type == SLICE_TYPE_I )
1926     {
1927         x264_mb_analyse_intra( h, &analysis, COST_MAX );
1928
1929         i_cost = analysis.i_sad_i16x16;
1930         h->mb.i_type = I_16x16;
1931         if( analysis.i_sad_i4x4 < i_cost )
1932         {
1933             i_cost = analysis.i_sad_i4x4;
1934             h->mb.i_type = I_4x4;
1935         }
1936         if( analysis.i_sad_i8x8 < i_cost )
1937             h->mb.i_type = I_8x8;
1938
1939         if( h->mb.i_subpel_refine >= 7 )
1940             x264_intra_rd_refine( h, &analysis );
1941     }
1942     else if( h->sh.i_type == SLICE_TYPE_P )
1943     {
1944         int b_skip = 0;
1945         int i_intra_cost, i_intra_type;
1946
1947         /* Fast P_SKIP detection */
1948         analysis.b_try_pskip = 0;
1949         if( h->param.analyse.b_fast_pskip )
1950         {
1951             if( h->param.analyse.i_subpel_refine >= 3 )
1952                 analysis.b_try_pskip = 1;
1953             else if( h->mb.i_mb_type_left == P_SKIP ||
1954                      h->mb.i_mb_type_top == P_SKIP ||
1955                      h->mb.i_mb_type_topleft == P_SKIP ||
1956                      h->mb.i_mb_type_topright == P_SKIP )
1957                 b_skip = x264_macroblock_probe_pskip( h );
1958         }
1959
1960         if( b_skip )
1961         {
1962             h->mb.i_type = P_SKIP;
1963             h->mb.i_partition = D_16x16;
1964         }
1965         else
1966         {
1967             const unsigned int flags = h->param.analyse.inter;
1968             int i_type;
1969             int i_partition;
1970             int i_thresh16x8;
1971
1972             x264_mb_analyse_load_costs( h, &analysis );
1973
1974             x264_mb_analyse_inter_p16x16( h, &analysis );
1975
1976             if( h->mb.i_type == P_SKIP )
1977                 return;
1978
1979             if( flags & X264_ANALYSE_PSUB16x16 )
1980             {
1981                 if( h->param.analyse.b_mixed_references )
1982                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
1983                 else
1984                     x264_mb_analyse_inter_p8x8( h, &analysis );
1985             }
1986
1987             /* Select best inter mode */
1988             i_type = P_L0;
1989             i_partition = D_16x16;
1990             i_cost = analysis.l0.me16x16.cost;
1991
1992             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1993                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
1994             {
1995                 i_type = P_8x8;
1996                 i_partition = D_8x8;
1997                 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1998                 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1999
2000                 i_cost = analysis.l0.i_cost8x8;
2001
2002                 /* Do sub 8x8 */
2003                 if( flags & X264_ANALYSE_PSUB8x8 )
2004                 {
2005                     int i_cost_bak = i_cost;
2006                     int b_sub8x8 = 0;
2007                     for( i = 0; i < 4; i++ )
2008                     {
2009                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2010                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2011                         {
2012                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2013                             h->mb.i_sub_partition[i] = D_L0_4x4;
2014
2015                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2016                             if( analysis.l0.i_cost8x4[i] < i_cost8x8 )
2017                             {
2018                                 h->mb.i_sub_partition[i] = D_L0_8x4;
2019                                 i_cost8x8 = analysis.l0.i_cost8x4[i];
2020                             }
2021
2022                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2023                             if( analysis.l0.i_cost4x8[i] < i_cost8x8 )
2024                             {
2025                                 h->mb.i_sub_partition[i] = D_L0_4x8;
2026                                 i_cost8x8 = analysis.l0.i_cost4x8[i];
2027                             }
2028
2029                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2030                             b_sub8x8 = 1;
2031                         }
2032                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2033                     }
2034                     /* TODO: RD per subpartition */
2035                     if( b_sub8x8 && analysis.b_mbrd )
2036                     {
2037                         i_cost = x264_rd_cost_mb( h, analysis.i_lambda2 );
2038                         if( i_cost > i_cost_bak )
2039                         {
2040                             i_cost = i_cost_bak;
2041                             h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2042                             h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2043                         }
2044                     }
2045                 }
2046             }
2047
2048             /* Now do 16x8/8x16 */
2049             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2050             if( analysis.b_mbrd )
2051                 i_thresh16x8 = i_thresh16x8 * analysis.i_lambda2 / analysis.i_lambda;
2052             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2053                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2054             {
2055                 x264_mb_analyse_inter_p16x8( h, &analysis );
2056                 if( analysis.l0.i_cost16x8 < i_cost )
2057                 {
2058                     i_type = P_L0;
2059                     i_partition = D_16x8;
2060                     i_cost = analysis.l0.i_cost16x8;
2061                 }
2062
2063                 x264_mb_analyse_inter_p8x16( h, &analysis );
2064                 if( analysis.l0.i_cost8x16 < i_cost )
2065                 {
2066                     i_type = P_L0;
2067                     i_partition = D_8x16;
2068                     i_cost = analysis.l0.i_cost8x16;
2069                 }
2070             }
2071
2072             h->mb.i_partition = i_partition;
2073
2074             /* refine qpel */
2075             //FIXME mb_type costs?
2076             if( analysis.b_mbrd )
2077             {
2078                 h->mb.i_type = i_type;
2079                 x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
2080             }
2081             else if( i_partition == D_16x16 )
2082             {
2083                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2084                 i_cost = analysis.l0.me16x16.cost;
2085             }
2086             else if( i_partition == D_16x8 )
2087             {
2088                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2089                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2090                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2091             }
2092             else if( i_partition == D_8x16 )
2093             {
2094                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2095                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2096                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2097             }
2098             else if( i_partition == D_8x8 )
2099             {
2100                 int i8x8;
2101                 i_cost = 0;
2102                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2103                 {
2104                     switch( h->mb.i_sub_partition[i8x8] )
2105                     {
2106                         case D_L0_8x8:
2107                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2108                             i_cost += analysis.l0.me8x8[i8x8].cost;
2109                             break;
2110                         case D_L0_8x4:
2111                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2112                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2113                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2114                                       analysis.l0.me8x4[i8x8][1].cost;
2115                             break;
2116                         case D_L0_4x8:
2117                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2118                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2119                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2120                                       analysis.l0.me4x8[i8x8][1].cost;
2121                             break;
2122
2123                         case D_L0_4x4:
2124                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2125                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2126                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2127                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2128                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2129                                       analysis.l0.me4x4[i8x8][1].cost +
2130                                       analysis.l0.me4x4[i8x8][2].cost +
2131                                       analysis.l0.me4x4[i8x8][3].cost;
2132                             break;
2133                         default:
2134                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2135                             break;
2136                     }
2137                 }
2138             }
2139
2140             x264_mb_analyse_intra( h, &analysis, i_cost );
2141             if( h->mb.b_chroma_me && !analysis.b_mbrd &&
2142                 ( analysis.i_sad_i16x16 < i_cost
2143                || analysis.i_sad_i8x8 < i_cost
2144                || analysis.i_sad_i4x4 < i_cost ))
2145             {
2146                 x264_mb_analyse_intra_chroma( h, &analysis );
2147                 analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma;
2148                 analysis.i_sad_i8x8 += analysis.i_sad_i8x8chroma;
2149                 analysis.i_sad_i4x4 += analysis.i_sad_i8x8chroma;
2150             }
2151
2152             i_intra_type = I_16x16;
2153             i_intra_cost = analysis.i_sad_i16x16;
2154
2155             if( analysis.i_sad_i8x8 < i_intra_cost )
2156             {
2157                 i_intra_type = I_8x8;
2158                 i_intra_cost = analysis.i_sad_i8x8;
2159             }
2160             if( analysis.i_sad_i4x4 < i_intra_cost )
2161             {
2162                 i_intra_type = I_4x4;
2163                 i_intra_cost = analysis.i_sad_i4x4;
2164             }
2165
2166             if( i_intra_cost < i_cost )
2167             {
2168                 i_type = i_intra_type;
2169                 i_cost = i_intra_cost;
2170             }
2171
2172             h->mb.i_type = i_type;
2173             h->stat.frame.i_intra_cost += i_intra_cost;
2174             h->stat.frame.i_inter_cost += i_cost;
2175
2176             if( h->mb.i_subpel_refine >= 7 )
2177             {
2178                 if( IS_INTRA( h->mb.i_type ) )
2179                 {
2180                     x264_intra_rd_refine( h, &analysis );
2181                 }
2182                 else if( i_partition == D_16x16 )
2183                 {
2184                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2185                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0 );
2186                 }
2187                 else if( i_partition == D_16x8 )
2188                 {
2189                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2190                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2191                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0 );
2192                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 2 );
2193                 }
2194                 else if( i_partition == D_8x16 )
2195                 {
2196                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2197                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2198                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0 );
2199                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 1 );
2200                 }
2201                 else if( i_partition == D_8x8 )
2202                 {
2203                     int i8x8;
2204                     x264_analyse_update_cache( h, &analysis );
2205                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2206                          if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2207                              x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8 );
2208                 }
2209             }
2210         }
2211     }
2212     else if( h->sh.i_type == SLICE_TYPE_B )
2213     {
2214         int i_bskip_cost = COST_MAX;
2215         int b_skip = 0;
2216
2217         h->mb.i_type = B_SKIP;
2218         if( h->mb.b_direct_auto_write )
2219         {
2220             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2221             for( i = 0; i < 2; i++ )
2222             {
2223                 int b_changed = 1;
2224                 h->sh.b_direct_spatial_mv_pred ^= 1;
2225                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2226                 if( analysis.b_direct_available )
2227                 {
2228                     if( b_changed )
2229                     {
2230                         x264_mb_mc( h );
2231                         b_skip = x264_macroblock_probe_bskip( h );
2232                     }
2233                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2234                 }
2235                 else
2236                     b_skip = 0;
2237             }
2238         }
2239         else
2240             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2241
2242         if( analysis.b_direct_available )
2243         {
2244             if( !h->mb.b_direct_auto_write )
2245                 x264_mb_mc( h );
2246             if( h->mb.b_lossless )
2247             {
2248                 /* chance of skip is too small to bother */
2249             }
2250             else if( analysis.b_mbrd )
2251             {
2252                 i_bskip_cost = ssd_mb( h );
2253
2254                 /* 6 = minimum cavlc cost of a non-skipped MB */
2255                 if( i_bskip_cost <= 6 * analysis.i_lambda2 )
2256                 {
2257                     h->mb.i_type = B_SKIP;
2258                     x264_analyse_update_cache( h, &analysis );
2259                     return;
2260                 }
2261             }
2262             else if( !h->mb.b_direct_auto_write )
2263             {
2264                 /* Conditioning the probe on neighboring block types
2265                  * doesn't seem to help speed or quality. */
2266                 b_skip = x264_macroblock_probe_bskip( h );
2267             }
2268         }
2269
2270         if( !b_skip )
2271         {
2272             const unsigned int flags = h->param.analyse.inter;
2273             int i_type;
2274             int i_partition;
2275
2276             x264_mb_analyse_load_costs( h, &analysis );
2277
2278             /* select best inter mode */
2279             /* direct must be first */
2280             if( analysis.b_direct_available )
2281                 x264_mb_analyse_inter_direct( h, &analysis );
2282
2283             x264_mb_analyse_inter_b16x16( h, &analysis );
2284
2285             i_type = B_L0_L0;
2286             i_partition = D_16x16;
2287             i_cost = analysis.l0.me16x16.cost;
2288             if( analysis.l1.me16x16.cost < i_cost )
2289             {
2290                 i_type = B_L1_L1;
2291                 i_cost = analysis.l1.me16x16.cost;
2292             }
2293             if( analysis.i_cost16x16bi < i_cost )
2294             {
2295                 i_type = B_BI_BI;
2296                 i_cost = analysis.i_cost16x16bi;
2297             }
2298             if( analysis.i_cost16x16direct < i_cost )
2299             {
2300                 i_type = B_DIRECT;
2301                 i_cost = analysis.i_cost16x16direct;
2302             }
2303
2304             if( i_bskip_cost <= i_cost )
2305             {
2306                 h->mb.i_type = B_SKIP;
2307                 x264_analyse_update_cache( h, &analysis );
2308                 return;
2309             }
2310
2311             if( flags & X264_ANALYSE_BSUB16x16 )
2312             {
2313                 x264_mb_analyse_inter_b8x8( h, &analysis );
2314                 if( analysis.i_cost8x8bi < i_cost )
2315                 {
2316                     i_type = B_8x8;
2317                     i_partition = D_8x8;
2318                     i_cost = analysis.i_cost8x8bi;
2319
2320                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2321                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2322                     {
2323                         x264_mb_analyse_inter_b16x8( h, &analysis );
2324                         if( analysis.i_cost16x8bi < i_cost )
2325                         {
2326                             i_partition = D_16x8;
2327                             i_cost = analysis.i_cost16x8bi;
2328                             i_type = analysis.i_mb_type16x8;
2329                         }
2330                     }
2331                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2332                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2333                     {
2334                         x264_mb_analyse_inter_b8x16( h, &analysis );
2335                         if( analysis.i_cost8x16bi < i_cost )
2336                         {
2337                             i_partition = D_8x16;
2338                             i_cost = analysis.i_cost8x16bi;
2339                             i_type = analysis.i_mb_type8x16;
2340                         }
2341                     }
2342                 }
2343             }
2344
2345             h->mb.i_partition = i_partition;
2346
2347             if( analysis.b_mbrd )
2348             {
2349                 h->mb.i_type = i_type;
2350                 x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
2351             }
2352             /* refine qpel */
2353             else if( i_partition == D_16x16 )
2354             {
2355                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2356                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2357                 if( i_type == B_L0_L0 )
2358                 {
2359                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2360                     i_cost = analysis.l0.me16x16.cost
2361                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2362                 }
2363                 else if( i_type == B_L1_L1 )
2364                 {
2365                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2366                     i_cost = analysis.l1.me16x16.cost
2367                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2368                 }
2369                 else if( i_type == B_BI_BI )
2370                 {
2371                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2372                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2373                 }
2374             }
2375             else if( i_partition == D_16x8 )
2376             {
2377                 for( i=0; i<2; i++ )
2378                 {
2379                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2380                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2381                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2382                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2383                 }
2384             }
2385             else if( i_partition == D_8x16 )
2386             {
2387                 for( i=0; i<2; i++ )
2388                 {
2389                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2390                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2391                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2392                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2393                 }
2394             }
2395             else if( i_partition == D_8x8 )
2396             {
2397                 for( i=0; i<4; i++ )
2398                 {
2399                     x264_me_t *m;
2400                     int i_part_cost_old;
2401                     int i_type_cost;
2402                     int i_part_type = h->mb.i_sub_partition[i];
2403                     int b_bidir = (i_part_type == D_BI_8x8);
2404
2405                     if( i_part_type == D_DIRECT_8x8 )
2406                         continue;
2407                     if( x264_mb_partition_listX_table[0][i_part_type] )
2408                     {
2409                         m = &analysis.l0.me8x8[i];
2410                         i_part_cost_old = m->cost;
2411                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2412                         m->cost -= i_type_cost;
2413                         x264_me_refine_qpel( h, m );
2414                         if( !b_bidir )
2415                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2416                     }
2417                     if( x264_mb_partition_listX_table[1][i_part_type] )
2418                     {
2419                         m = &analysis.l1.me8x8[i];
2420                         i_part_cost_old = m->cost;
2421                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2422                         m->cost -= i_type_cost;
2423                         x264_me_refine_qpel( h, m );
2424                         if( !b_bidir )
2425                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2426                     }
2427                     /* TODO: update mvp? */
2428                 }
2429             }
2430
2431             /* best intra mode */
2432             x264_mb_analyse_intra( h, &analysis, i_cost );
2433
2434             if( analysis.i_sad_i16x16 < i_cost )
2435             {
2436                 i_type = I_16x16;
2437                 i_cost = analysis.i_sad_i16x16;
2438             }
2439             if( analysis.i_sad_i8x8 < i_cost )
2440             {
2441                 i_type = I_8x8;
2442                 i_cost = analysis.i_sad_i8x8;
2443             }
2444             if( analysis.i_sad_i4x4 < i_cost )
2445             {
2446                 i_type = I_4x4;
2447                 i_cost = analysis.i_sad_i4x4;
2448             }
2449
2450             h->mb.i_type = i_type;
2451
2452             if( h->param.analyse.b_bidir_me )
2453                 refine_bidir( h, &analysis );
2454         }
2455     }
2456
2457     x264_analyse_update_cache( h, &analysis );
2458
2459     if( !analysis.b_mbrd )
2460         x264_mb_analyse_transform( h );
2461
2462     h->mb.b_trellis = h->param.analyse.i_trellis;
2463     h->mb.b_noise_reduction = h->param.analyse.i_noise_reduction;
2464 }
2465
2466 /*-------------------- Update MB from the analysis ----------------------*/
2467 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
2468 {
2469     int i;
2470
2471     switch( h->mb.i_type )
2472     {
2473         case I_4x4:
2474             for( i = 0; i < 16; i++ )
2475             {
2476                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
2477                     a->i_predict4x4[block_idx_x[i]][block_idx_y[i]];
2478             }
2479
2480             x264_mb_analyse_intra_chroma( h, a );
2481             break;
2482         case I_8x8:
2483             for( i = 0; i < 4; i++ )
2484                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1),
2485                     a->i_predict8x8[i&1][i>>1] );
2486
2487             x264_mb_analyse_intra_chroma( h, a );
2488             break;
2489         case I_16x16:
2490             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2491             x264_mb_analyse_intra_chroma( h, a );
2492             break;
2493
2494         case P_L0:
2495             switch( h->mb.i_partition )
2496             {
2497                 case D_16x16:
2498                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
2499                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
2500                     break;
2501
2502                 case D_16x8:
2503                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
2504                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
2505                     x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
2506                     x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
2507                     break;
2508
2509                 case D_8x16:
2510                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
2511                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
2512                     x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
2513                     x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
2514                     break;
2515
2516                 default:
2517                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
2518                     break;
2519             }
2520             break;
2521
2522         case P_8x8:
2523             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2524             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2525             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2526             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2527             for( i = 0; i < 4; i++ )
2528                 x264_mb_cache_mv_p8x8( h, a, i );
2529             break;
2530
2531         case P_SKIP:
2532         {
2533             int mvp[2];
2534             x264_mb_predict_mv_pskip( h, mvp );
2535             /* */
2536             h->mb.i_partition = D_16x16;
2537             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
2538             x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
2539             break;
2540         }
2541
2542         case B_SKIP:
2543         case B_DIRECT:
2544             x264_mb_load_mv_direct8x8( h, 0 );
2545             x264_mb_load_mv_direct8x8( h, 1 );
2546             x264_mb_load_mv_direct8x8( h, 2 );
2547             x264_mb_load_mv_direct8x8( h, 3 );
2548             break;
2549
2550         case B_8x8:
2551             /* optimize: cache might not need to be rewritten */
2552             for( i = 0; i < 4; i++ )
2553                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
2554             break;
2555
2556         default: /* the rest of the B types */
2557             switch( h->mb.i_partition )
2558             {
2559             case D_16x16:
2560                 switch( h->mb.i_type )
2561                 {
2562                 case B_L0_L0:
2563                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
2564                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
2565
2566                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
2567                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
2568                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
2569                     break;
2570                 case B_L1_L1:
2571                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
2572                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
2573                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
2574
2575                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
2576                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
2577                     break;
2578                 case B_BI_BI:
2579                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
2580                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
2581
2582                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
2583                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
2584                     break;
2585                 }
2586                 break;
2587             case D_16x8:
2588                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
2589                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
2590                 break;
2591             case D_8x16:
2592                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
2593                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
2594                 break;
2595             default:
2596                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
2597                 break;
2598             }
2599     }
2600 }
2601
2602 #include "slicetype_decision.c"
2603