git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 x264 project
   5  * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Loren Merritt <lorenm@u.washington.edu>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #include <stdlib.h>
  26 #include <stdio.h>
  27 #include <string.h>
  28 #include <math.h>
  29 #include <limits.h>
  30
  31 #include "common/common.h"
  32 #include "common/macroblock.h"
  33 #include "macroblock.h"
  34 #include "me.h"
  35 #include "ratecontrol.h"
  36 #include "analyse.h"
  37 #include "rdo.c"
  38
  39 typedef struct
  40 {
  41     /* 16x16 */
  42     int i_ref;
  43     x264_me_t me16x16;
  44
  45     /* 8x8 */
  46     int       i_cost8x8;
  47     int       mvc[16][5][2]; /* [ref][0] is 16x16 mv,
  48                                 [ref][1..4] are 8x8 mv from partition [0..3] */
  49     x264_me_t me8x8[4];
  50
  51     /* Sub 4x4 */
  52     int       i_cost4x4[4]; /* cost per 8x8 partition */
  53     x264_me_t me4x4[4][4];
  54
  55     /* Sub 8x4 */
  56     int       i_cost8x4[4]; /* cost per 8x8 partition */
  57     x264_me_t me8x4[4][2];
  58
  59     /* Sub 4x8 */
  60     int       i_cost4x8[4]; /* cost per 8x8 partition */
  61     x264_me_t me4x8[4][4];
  62
  63     /* 16x8 */
  64     int       i_cost16x8;
  65     x264_me_t me16x8[2];
  66
  67     /* 8x16 */
  68     int       i_cost8x16;
  69     x264_me_t me8x16[2];
  70
  71 } x264_mb_analysis_list_t;
  72
  73 typedef struct
  74 {
  75     /* conduct the analysis using this lamda and QP */
  76     int i_lambda;
  77     int i_lambda2;
  78     int i_qp;
  79     int16_t *p_cost_mv;
  80     int b_mbrd;
  81
  82
  83     /* I: Intra part */
  84     /* Take some shortcuts in intra search if intra is deemed unlikely */
  85     int b_fast_intra;
  86     int i_best_satd;
  87
  88     /* Luma part */
  89     int i_sad_i16x16;
  90     int i_predict16x16;
  91
  92     int i_sad_i8x8;
  93     int i_predict8x8[2][2];
  94
  95     int i_sad_i4x4;
  96     int i_predict4x4[4][4];
  97
  98     /* Chroma part */
  99     int i_sad_i8x8chroma;
 100     int i_predict8x8chroma;
 101
 102     /* II: Inter part P/B frame */
 103     x264_mb_analysis_list_t l0;
 104     x264_mb_analysis_list_t l1;
 105
 106     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
 107     int i_cost16x16direct;
 108     int i_cost8x8bi;
 109     int i_cost8x8direct[4];
 110     int i_cost16x8bi;
 111     int i_cost8x16bi;
 112
 113     int i_mb_partition16x8[2]; /* mb_partition_e */
 114     int i_mb_partition8x16[2];
 115     int i_mb_type16x8; /* mb_class_e */
 116     int i_mb_type8x16;
 117
 118     int b_direct_available;
 119
 120 } x264_mb_analysis_t;
 121
 122 /* lambda = pow(2,qp/6-2) */
 123 static const int i_qp0_cost_table[52] = {
 124    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 125    1, 1, 1, 1,              /*  8-11 */
 126    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 127    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 128    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 129   16,18,20,23,25,29,32,36,  /* 36-43 */
 130   40,45,51,57,64,72,81,91   /* 44-51 */
 131 };
 132
 133 /* pow(lambda,2) * .9 */
 134 static const int i_qp0_cost2_table[52] = {
 135    1,   1,   1,   1,   1,   1, /*  0-5  */
 136    1,   1,   1,   1,   1,   1, /*  6-11 */
 137    1,   1,   1,   2,   2,   3, /* 12-17 */
 138    4,   5,   6,   7,   9,  11, /* 18-23 */
 139   14,  18,  23,  29,  36,  46, /* 24-29 */
 140   58,  73,  91, 115, 145, 183, /* 30-35 */
 141  230, 290, 366, 461, 581, 731, /* 36-41 */
 142  922,1161,1463,1843,2322,2926, /* 42-47 */
 143 3686,4645,5852,7373
 144 };
 145
 146 static const uint8_t block_idx_x[16] = {
 147     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
 148 };
 149 static const uint8_t block_idx_y[16] = {
 150     0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
 151 };
 152
 153 /* TODO: calculate CABAC costs */
 154 static const int i_mb_b_cost_table[19] = {
 155     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 156 };
 157 static const int i_mb_b16x8_cost_table[17] = {
 158     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 159 };
 160 static const int i_sub_mb_b_cost_table[13] = {
 161     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 162 };
 163 static const int i_sub_mb_p_cost_table[4] = {
 164     5, 3, 3, 1
 165 };
 166
 167 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 168
 169 /* initialize an array of lambda*nbits for all possible mvs */
 170 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 171 {
 172     static int16_t *p_cost_mv[52];
 173
 174     if( !p_cost_mv[a->i_qp] )
 175     {
 176         /* could be faster, but isn't called many times */
 177         /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 178         int i;
 179         p_cost_mv[a->i_qp] = x264_malloc( (4*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) );
 180         p_cost_mv[a->i_qp] += 2*4*h->param.analyse.i_mv_range;
 181         for( i = 0; i <= 2*4*h->param.analyse.i_mv_range; i++ )
 182         {
 183             p_cost_mv[a->i_qp][-i] =
 184             p_cost_mv[a->i_qp][i]  = a->i_lambda * bs_size_se( i );
 185         }
 186     }
 187
 188     a->p_cost_mv = p_cost_mv[a->i_qp];
 189 }
 190
 191 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 192 {
 193     memset( a, 0, sizeof( x264_mb_analysis_t ) );
 194
 195     /* conduct the analysis using this lamda and QP */
 196     a->i_qp = h->mb.i_qp = i_qp;
 197     a->i_lambda = i_qp0_cost_table[i_qp];
 198     a->i_lambda2 = i_qp0_cost2_table[i_qp];
 199     a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 && h->sh.i_type != SLICE_TYPE_B;
 200
 201     h->mb.i_me_method = h->param.analyse.i_me_method;
 202     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 203     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 204                         && h->mb.i_subpel_refine >= 5;
 205
 206     h->mb.b_transform_8x8 = 0;
 207
 208     /* I: Intra part */
 209     a->i_sad_i16x16 =
 210     a->i_sad_i8x8   =
 211     a->i_sad_i4x4   =
 212     a->i_sad_i8x8chroma = COST_MAX;
 213
 214     a->b_fast_intra = 0;
 215     a->i_best_satd = COST_MAX;
 216
 217     /* II: Inter part P/B frame */
 218     if( h->sh.i_type != SLICE_TYPE_I )
 219     {
 220         int i;
 221         int i_fmv_range = h->param.analyse.i_mv_range - 16;
 222
 223         /* Calculate max allowed MV range */
 224 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
 225         h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
 226         h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 );
 227         h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
 228         h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
 229         if( h->mb.i_mb_x == 0)
 230         {
 231             h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
 232             h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 );
 233             h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
 234             h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
 235         }
 236 #undef CLIP_FMV
 237
 238         a->l0.me16x16.cost =
 239         a->l0.i_cost8x8    = COST_MAX;
 240
 241         for( i = 0; i < 4; i++ )
 242         {
 243             a->l0.i_cost4x4[i] =
 244             a->l0.i_cost8x4[i] =
 245             a->l0.i_cost4x8[i] = COST_MAX;
 246         }
 247
 248         a->l0.i_cost16x8   =
 249         a->l0.i_cost8x16   = COST_MAX;
 250         if( h->sh.i_type == SLICE_TYPE_B )
 251         {
 252             a->l1.me16x16.cost =
 253             a->l1.i_cost8x8    = COST_MAX;
 254
 255             for( i = 0; i < 4; i++ )
 256             {
 257                 a->l1.i_cost4x4[i] =
 258                 a->l1.i_cost8x4[i] =
 259                 a->l1.i_cost4x8[i] =
 260                 a->i_cost8x8direct[i] = COST_MAX;
 261             }
 262
 263             a->l1.i_cost16x8   =
 264             a->l1.i_cost8x16   =
 265
 266             a->i_cost16x16bi   =
 267             a->i_cost16x16direct =
 268             a->i_cost8x8bi     =
 269             a->i_cost16x8bi    =
 270             a->i_cost8x16bi    = COST_MAX;
 271         }
 272
 273         /* Fast intra decision */
 274         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 275         {
 276             if( a->b_mbrd
 277                || IS_INTRA( h->mb.i_mb_type_left )
 278                || IS_INTRA( h->mb.i_mb_type_top )
 279                || IS_INTRA( h->mb.i_mb_type_topleft )
 280                || IS_INTRA( h->mb.i_mb_type_topright )
 281                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 282                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
 283             { /* intra is likely */ }
 284             else
 285             {
 286                 a->b_fast_intra = 1;
 287             }
 288         }
 289     }
 290 }
 291
 292
 293
 294 /*
 295  * Handle intra mb
 296  */
 297 /* Max = 4 */
 298 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 299 {
 300     if( i_neighbour & MB_TOPLEFT )
 301     {
 302         /* top and left avaible */
 303         *mode++ = I_PRED_16x16_V;
 304         *mode++ = I_PRED_16x16_H;
 305         *mode++ = I_PRED_16x16_DC;
 306         *mode++ = I_PRED_16x16_P;
 307         *pi_count = 4;
 308     }
 309     else if( i_neighbour & MB_LEFT )
 310     {
 311         /* left available*/
 312         *mode++ = I_PRED_16x16_DC_LEFT;
 313         *mode++ = I_PRED_16x16_H;
 314         *pi_count = 2;
 315     }
 316     else if( i_neighbour & MB_TOP )
 317     {
 318         /* top available*/
 319         *mode++ = I_PRED_16x16_DC_TOP;
 320         *mode++ = I_PRED_16x16_V;
 321         *pi_count = 2;
 322     }
 323     else
 324     {
 325         /* none avaible */
 326         *mode = I_PRED_16x16_DC_128;
 327         *pi_count = 1;
 328     }
 329 }
 330
 331 /* Max = 4 */
 332 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 333 {
 334     if( i_neighbour & MB_TOPLEFT )
 335     {
 336         /* top and left avaible */
 337         *mode++ = I_PRED_CHROMA_V;
 338         *mode++ = I_PRED_CHROMA_H;
 339         *mode++ = I_PRED_CHROMA_DC;
 340         *mode++ = I_PRED_CHROMA_P;
 341         *pi_count = 4;
 342     }
 343     else if( i_neighbour & MB_LEFT )
 344     {
 345         /* left available*/
 346         *mode++ = I_PRED_CHROMA_DC_LEFT;
 347         *mode++ = I_PRED_CHROMA_H;
 348         *pi_count = 2;
 349     }
 350     else if( i_neighbour & MB_TOP )
 351     {
 352         /* top available*/
 353         *mode++ = I_PRED_CHROMA_DC_TOP;
 354         *mode++ = I_PRED_CHROMA_V;
 355         *pi_count = 2;
 356     }
 357     else
 358     {
 359         /* none avaible */
 360         *mode = I_PRED_CHROMA_DC_128;
 361         *pi_count = 1;
 362     }
 363 }
 364
 365 /* MAX = 9 */
 366 static void predict_4x4_mode_available( unsigned int i_neighbour,
 367                                         int *mode, int *pi_count )
 368 {
 369     int b_l = i_neighbour & MB_LEFT;
 370     int b_t = i_neighbour & MB_TOP;
 371
 372     if( b_l && b_t )
 373     {
 374         *mode++ = I_PRED_4x4_DC;
 375         *mode++ = I_PRED_4x4_H;
 376         *mode++ = I_PRED_4x4_V;
 377         *mode++ = I_PRED_4x4_DDL;
 378         *mode++ = I_PRED_4x4_DDR;
 379         *mode++ = I_PRED_4x4_VR;
 380         *mode++ = I_PRED_4x4_HD;
 381         *mode++ = I_PRED_4x4_VL;
 382         *mode++ = I_PRED_4x4_HU;
 383         *pi_count = 9;
 384     }
 385     else if( b_l )
 386     {
 387         *mode++ = I_PRED_4x4_DC_LEFT;
 388         *mode++ = I_PRED_4x4_H;
 389         *mode++ = I_PRED_4x4_HU;
 390         *pi_count = 3;
 391     }
 392     else if( b_t )
 393     {
 394         *mode++ = I_PRED_4x4_DC_TOP;
 395         *mode++ = I_PRED_4x4_V;
 396         *mode++ = I_PRED_4x4_DDL;
 397         *mode++ = I_PRED_4x4_VL;
 398         *pi_count = 4;
 399     }
 400     else
 401     {
 402         *mode++ = I_PRED_4x4_DC_128;
 403         *pi_count = 1;
 404     }
 405 }
 406
 407 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 408 {
 409     int i;
 410
 411     int i_max;
 412     int predict_mode[9];
 413
 414     uint8_t *p_dstc[2], *p_srcc[2];
 415     int      i_stride[2];
 416
 417     if( a->i_sad_i8x8chroma < COST_MAX )
 418         return;
 419
 420     /* 8x8 prediction selection for chroma */
 421     p_dstc[0] = h->mb.pic.p_fdec[1];
 422     p_dstc[1] = h->mb.pic.p_fdec[2];
 423     p_srcc[0] = h->mb.pic.p_fenc[1];
 424     p_srcc[1] = h->mb.pic.p_fenc[2];
 425
 426     i_stride[0] = h->mb.pic.i_stride[1];
 427     i_stride[1] = h->mb.pic.i_stride[2];
 428
 429     predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 430     a->i_sad_i8x8chroma = COST_MAX;
 431     for( i = 0; i < i_max; i++ )
 432     {
 433         int i_sad;
 434         int i_mode;
 435
 436         i_mode = predict_mode[i];
 437
 438         /* we do the prediction */
 439         h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
 440         h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
 441
 442         /* we calculate the cost */
 443         i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], i_stride[0],
 444                                           p_srcc[0], i_stride[0] ) +
 445                 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], i_stride[1],
 446                                           p_srcc[1], i_stride[1] ) +
 447                 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 448
 449         /* if i_score is lower it is better */
 450         if( a->i_sad_i8x8chroma > i_sad )
 451         {
 452             a->i_predict8x8chroma = i_mode;
 453             a->i_sad_i8x8chroma   = i_sad;
 454         }
 455     }
 456
 457     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 458 }
 459
 460 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_inter )
 461 {
 462     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 463     const int i_stride = h->mb.pic.i_stride[0];
 464     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 465     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 466     int      f8_satd_rd_ratio = 0;
 467
 468     int i, idx;
 469     int i_max;
 470     int predict_mode[9];
 471
 472     const int i_satd_thresh = a->i_best_satd * 5/4 + a->i_lambda * 10;
 473
 474     /*---------------- Try all mode and calculate their score ---------------*/
 475
 476     /* 16x16 prediction selection */
 477     predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 478     for( i = 0; i < i_max; i++ )
 479     {
 480         int i_sad;
 481         int i_mode;
 482
 483         i_mode = predict_mode[i];
 484         h->predict_16x16[i_mode]( p_dst, i_stride );
 485
 486         i_sad = h->pixf.mbcmp[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
 487                 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 488         if( a->i_sad_i16x16 > i_sad )
 489         {
 490             a->i_predict16x16 = i_mode;
 491             a->i_sad_i16x16   = i_sad;
 492         }
 493     }
 494
 495     if( a->b_mbrd )
 496     {
 497         f8_satd_rd_ratio = ((unsigned)i_cost_inter << 8) / a->i_best_satd + 1;
 498         x264_mb_analyse_intra_chroma( h, a );
 499         if( h->mb.b_chroma_me )
 500             a->i_sad_i16x16 += a->i_sad_i8x8chroma;
 501         if( a->i_sad_i16x16 < i_satd_thresh )
 502         {
 503             h->mb.i_type = I_16x16;
 504             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
 505             a->i_sad_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 506         }
 507         else
 508             a->i_sad_i16x16 = a->i_sad_i16x16 * f8_satd_rd_ratio >> 8;
 509     }
 510     else
 511     {
 512         if( h->sh.i_type == SLICE_TYPE_B )
 513             /* cavlc mb type prefix */
 514             a->i_sad_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 515         if( a->b_fast_intra && a->i_sad_i16x16 > 2*i_cost_inter )
 516             return;
 517     }
 518
 519     /* 4x4 prediction selection */
 520     if( flags & X264_ANALYSE_I4x4 )
 521     {
 522         a->i_sad_i4x4 = 0;
 523         for( idx = 0; idx < 16; idx++ )
 524         {
 525             uint8_t *p_src_by;
 526             uint8_t *p_dst_by;
 527             int     i_best;
 528             int x, y;
 529             int i_pred_mode;
 530
 531             i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
 532             x = block_idx_x[idx];
 533             y = block_idx_y[idx];
 534
 535             p_src_by = p_src + 4 * x + 4 * y * i_stride;
 536             p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
 537
 538             i_best = COST_MAX;
 539             predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
 540
 541             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 542                 /* emulate missing topright samples */
 543                 *(uint32_t*) &p_dst_by[4 - i_stride] = p_dst_by[3 - i_stride] * 0x01010101U;
 544
 545             for( i = 0; i < i_max; i++ )
 546             {
 547                 int i_sad;
 548                 int i_mode;
 549
 550                 i_mode = predict_mode[i];
 551                 h->predict_4x4[i_mode]( p_dst_by, i_stride );
 552
 553                 i_sad = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, i_stride,
 554                                                   p_src_by, i_stride )
 555                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 556
 557                 if( i_best > i_sad )
 558                 {
 559                     a->i_predict4x4[x][y] = i_mode;
 560                     i_best = i_sad;
 561                 }
 562             }
 563             a->i_sad_i4x4 += i_best;
 564
 565             /* we need to encode this block now (for next ones) */
 566             h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by, i_stride );
 567             x264_mb_encode_i4x4( h, idx, a->i_qp );
 568
 569             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
 570         }
 571
 572         a->i_sad_i4x4 += a->i_lambda * 24;    /* from JVT (SATD0) */
 573         if( a->b_mbrd )
 574         {
 575             if( h->mb.b_chroma_me )
 576                 a->i_sad_i4x4 += a->i_sad_i8x8chroma;
 577             if( a->i_sad_i4x4 < i_satd_thresh )
 578             {
 579                 h->mb.i_type = I_4x4;
 580                 a->i_sad_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 581             }
 582             else
 583                 a->i_sad_i4x4 = a->i_sad_i4x4 * f8_satd_rd_ratio >> 8;
 584         }
 585         else
 586         {
 587             if( h->sh.i_type == SLICE_TYPE_B )
 588                 a->i_sad_i4x4 += a->i_lambda * i_mb_b_cost_table[I_4x4];
 589         }
 590     }
 591
 592     /* 8x8 prediction selection */
 593     if( flags & X264_ANALYSE_I8x8 )
 594     {
 595         a->i_sad_i8x8 = 0;
 596         for( idx = 0; idx < 4; idx++ )
 597         {
 598             uint8_t *p_src_by;
 599             uint8_t *p_dst_by;
 600             int     i_best;
 601             int x, y;
 602             int i_pred_mode;
 603
 604             i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx );
 605             x = idx&1;
 606             y = idx>>1;
 607
 608             p_src_by = p_src + 8 * x + 8 * y * i_stride;
 609             p_dst_by = p_dst + 8 * x + 8 * y * i_stride;
 610
 611             i_best = COST_MAX;
 612             predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
 613             for( i = 0; i < i_max; i++ )
 614             {
 615                 int i_sad;
 616                 int i_mode;
 617
 618                 i_mode = predict_mode[i];
 619                 h->predict_8x8[i_mode]( p_dst_by, i_stride, h->mb.i_neighbour );
 620
 621                 /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
 622                 i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, i_stride,
 623                                                   p_src_by, i_stride )
 624                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 625
 626                 if( i_best > i_sad )
 627                 {
 628                     a->i_predict8x8[x][y] = i_mode;
 629                     i_best = i_sad;
 630                 }
 631             }
 632             a->i_sad_i8x8 += i_best;
 633
 634             /* we need to encode this block now (for next ones) */
 635             h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
 636             x264_mb_encode_i8x8( h, idx, a->i_qp );
 637
 638             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
 639         }
 640
 641         if( a->b_mbrd )
 642         {
 643             if( h->mb.b_chroma_me )
 644                 a->i_sad_i8x8 += a->i_sad_i8x8chroma;
 645             if( a->i_sad_i8x8 < i_satd_thresh )
 646             {
 647                 h->mb.i_type = I_8x8;
 648                 a->i_sad_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 649             }
 650             else
 651                 a->i_sad_i8x8 = a->i_sad_i8x8 * f8_satd_rd_ratio >> 8;
 652         }
 653         else
 654         {
 655             // FIXME some bias like in i4x4?
 656             if( h->sh.i_type == SLICE_TYPE_B )
 657                 a->i_sad_i8x8 += a->i_lambda * i_mb_b_cost_table[I_8x8];
 658         }
 659     }
 660 }
 661
 662 #define LOAD_FENC( m, src, xoff, yoff) \
 663     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
 664     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
 665     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 666     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 667     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 668
 669 #define LOAD_HPELS(m, src, xoff, yoff) \
 670     (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 671     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
 672     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
 673     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
 674     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 675     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 676
 677 #define REF_COST(list, ref) \
 678     (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref ))
 679
 680 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 681 {
 682     x264_me_t m;
 683     int i_ref;
 684     int mvc[7][2], i_mvc;
 685     int i_fullpel_thresh = INT_MAX;
 686     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 687
 688     /* 16x16 Search on all ref frame */
 689     m.i_pixel = PIXEL_16x16;
 690     m.p_cost_mv = a->p_cost_mv;
 691     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 692
 693     a->l0.me16x16.cost = INT_MAX;
 694     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 695     {
 696         const int i_ref_cost = REF_COST( 0, i_ref );
 697         i_fullpel_thresh -= i_ref_cost;
 698         m.i_ref_cost = i_ref_cost;
 699         m.i_ref = i_ref;
 700
 701         /* search with ref */
 702         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 703         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 704         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 705         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 706
 707         m.cost += i_ref_cost;
 708         i_fullpel_thresh += i_ref_cost;
 709
 710         if( m.cost < a->l0.me16x16.cost )
 711             a->l0.me16x16 = m;
 712
 713         /* save mv for predicting neighbors */
 714         a->l0.mvc[i_ref][0][0] =
 715         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 716         a->l0.mvc[i_ref][0][1] =
 717         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 718     }
 719
 720     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
 721
 722     if( a->b_mbrd )
 723     {
 724         a->i_best_satd = a->l0.me16x16.cost;
 725         h->mb.i_type = P_L0;
 726         h->mb.i_partition = D_16x16;
 727         x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
 728         a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
 729     }
 730 }
 731
 732 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
 733 {
 734     x264_me_t m;
 735     int i_ref;
 736     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 737     int i_fullpel_thresh = INT_MAX;
 738     int *p_fullpel_thresh = /*h->i_ref0>1 ? &i_fullpel_thresh : */NULL;
 739     int i;
 740     int i_maxref = h->i_ref0-1;
 741
 742     h->mb.i_partition = D_8x8;
 743
 744     /* early termination: if 16x16 chose ref 0, then evalute no refs older
 745      * than those used by the neighbors */
 746     if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
 747         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
 748     {
 749         i_maxref = 0;
 750         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
 751         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
 752         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
 753         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
 754         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
 755         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
 756     }
 757
 758     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
 759     {
 760          a->l0.mvc[i_ref][0][0] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0];
 761          a->l0.mvc[i_ref][0][1] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1];
 762     }
 763
 764     for( i = 0; i < 4; i++ )
 765     {
 766         x264_me_t *l0m = &a->l0.me8x8[i];
 767         const int x8 = i%2;
 768         const int y8 = i/2;
 769
 770         m.i_pixel = PIXEL_8x8;
 771         m.p_cost_mv = a->p_cost_mv;
 772
 773         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
 774         l0m->cost = INT_MAX;
 775         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
 776         {
 777              const int i_ref_cost = REF_COST( 0, i_ref );
 778              i_fullpel_thresh -= i_ref_cost;
 779              m.i_ref_cost = i_ref_cost;
 780              m.i_ref = i_ref;
 781
 782              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*x8, 8*y8 );
 783              x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
 784              x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
 785              x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_fullpel_thresh );
 786
 787              m.cost += i_ref_cost;
 788              i_fullpel_thresh += i_ref_cost;
 789              *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv;
 790
 791              if( m.cost < l0m->cost )
 792                  *l0m = m;
 793         }
 794         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv[0], l0m->mv[1] );
 795         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
 796
 797         /* mb type cost */
 798         l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 799     }
 800
 801     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
 802                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
 803     if( a->b_mbrd )
 804     {
 805         if( a->i_best_satd > a->l0.i_cost8x8 )
 806             a->i_best_satd = a->l0.i_cost8x8;
 807         h->mb.i_type = P_8x8;
 808         h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
 809         h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
 810         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 811     }
 812 }
 813
 814 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 815 {
 816     const int i_ref = a->l0.me16x16.i_ref;
 817     const int i_ref_cost = REF_COST( 0, i_ref );
 818     uint8_t  **p_fref = h->mb.pic.p_fref[0][i_ref];
 819     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 820     int i_mvc;
 821     int (*mvc)[2] = a->l0.mvc[i_ref];
 822     int i;
 823
 824     /* XXX Needed for x264_mb_predict_mv */
 825     h->mb.i_partition = D_8x8;
 826
 827     i_mvc = 1;
 828     *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.me16x16.mv;
 829
 830     for( i = 0; i < 4; i++ )
 831     {
 832         x264_me_t *m = &a->l0.me8x8[i];
 833         const int x8 = i%2;
 834         const int y8 = i/2;
 835
 836         m->i_pixel = PIXEL_8x8;
 837         m->p_cost_mv = a->p_cost_mv;
 838         m->i_ref_cost = i_ref_cost;
 839         m->i_ref = i_ref;
 840
 841         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
 842         LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
 843         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 844         x264_me_search( h, m, mvc, i_mvc );
 845
 846         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
 847
 848         *(uint64_t*)mvc[i_mvc] = *(uint64_t*)m->mv;
 849         i_mvc++;
 850
 851         /* mb type cost */
 852         m->cost += i_ref_cost;
 853         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 854     }
 855
 856     /* theoretically this should include 4*ref_cost,
 857      * but 3 seems a better approximation of cabac. */
 858     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
 859                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost -
 860                       REF_COST( 0, a->l0.me16x16.i_ref );
 861     if( a->b_mbrd )
 862     {
 863         if( a->i_best_satd > a->l0.i_cost8x8 )
 864             a->i_best_satd = a->l0.i_cost8x8;
 865         h->mb.i_type = P_8x8;
 866         h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
 867         h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
 868         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 869     }
 870 }
 871
 872 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 873 {
         /* Motion search for the two 16x8 halves of a P macroblock (list 0).
          * For each half the candidate references are the ones stored in the two
          * 8x8 searches covering that half (searched once if they agree, twice
          * otherwise); the cheaper result is kept in a->l0.me16x8[i].
          * a->l0.i_cost16x8 receives the sum of both halves, and is replaced by
          * the x264_rd_cost_mb() score when a->b_mbrd is set. */
 874     x264_me_t m;
 875     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 876     int mvc[3][2];
 877     int i, j;
 878
 879     /* XXX Needed for x264_mb_predict_mv */
 880     h->mb.i_partition = D_16x8;
 881
 882     for( i = 0; i < 2; i++ )
 883     {
 884         x264_me_t *l0m = &a->l0.me16x8[i];
             /* refs chosen by the left and right 8x8 blocks of this half */
 885         const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
 886         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
 887
 888         m.i_pixel = PIXEL_16x8;
 889         m.p_cost_mv = a->p_cost_mv;
 890
 891         LOAD_FENC( &m, p_fenc, 0, 8*i );
             /* sentinel so that the first candidate always wins */
 892         l0m->cost = INT_MAX;
 893         for( j = 0; j < i_ref8s; j++ )
 894         {
 895              const int i_ref = ref8[j];
 896              const int i_ref_cost = REF_COST( 0, i_ref );
 897              m.i_ref_cost = i_ref_cost;
 898              m.i_ref = i_ref;
 899
 900              /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
                  /* candidates: the 16x16 mv ([0]) and the mvs of the two 8x8
                   * partitions 2*i and 2*i+1 (stored at [2*i+1] and [2*i+2]) */
 901              *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
 902              *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][2*i+1];
 903              *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][2*i+2];
 904
 905              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 8*i );
                  /* the candidate ref is cached before the mv prediction call */
 906              x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
 907              x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
 908              x264_me_search( h, &m, mvc, 3 );
 909
 910              m.cost += i_ref_cost;
 911
                  /* strict '<': on a tie the first candidate is kept */
 912              if( m.cost < l0m->cost )
 913                  *l0m = m;
 914         }
             /* store the winning mv/ref of this half in the MB cache */
 915         x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, l0m->mv[0], l0m->mv[1] );
 916         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
 917     }
 918
 919     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
 920     if( a->b_mbrd )
 921     {
             /* keep the lowest pre-RD cost seen so far, then switch to the RD score */
 922         if( a->i_best_satd > a->l0.i_cost16x8 )
 923             a->i_best_satd = a->l0.i_cost16x8;
 924         h->mb.i_type = P_L0;
 925         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 926     }
 927 }
 928
 929 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 930 {
         /* Motion search for the two 8x16 halves of a P macroblock (list 0).
          * For each half the candidate references are the ones stored in the two
          * 8x8 searches covering that half (blocks i and i+2); the cheaper result
          * is kept in a->l0.me8x16[i].
          * a->l0.i_cost8x16 receives the sum of both halves, and is replaced by
          * the x264_rd_cost_mb() score when a->b_mbrd is set. */
 931     x264_me_t m;
 932     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 933     int mvc[3][2];
 934     int i, j;
 935
 936     /* XXX Needed for x264_mb_predict_mv */
 937     h->mb.i_partition = D_8x16;
 938
 939     for( i = 0; i < 2; i++ )
 940     {
 941         x264_me_t *l0m = &a->l0.me8x16[i];
             /* refs chosen by the upper and lower 8x8 blocks of this half */
 942         const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
 943         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
 944
 945         m.i_pixel = PIXEL_8x16;
 946         m.p_cost_mv = a->p_cost_mv;
 947
 948         LOAD_FENC( &m, p_fenc, 8*i, 0 );
             /* sentinel so that the first candidate always wins */
 949         l0m->cost = INT_MAX;
 950         for( j = 0; j < i_ref8s; j++ )
 951         {
 952              const int i_ref = ref8[j];
 953              const int i_ref_cost = REF_COST( 0, i_ref );
 954              m.i_ref_cost = i_ref_cost;
 955              m.i_ref = i_ref;
 956
                  /* candidates: the 16x16 mv ([0]) and the mvs of the two 8x8
                   * partitions i and i+2 (stored at [i+1] and [i+3]) */
 957              *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
 958              *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][i+1];
 959              *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][i+3];
 960
 961              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*i, 0 );
                  /* the candidate ref is cached before the mv prediction call */
 962              x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
 963              x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
 964              x264_me_search( h, &m, mvc, 3 );
 965
 966              m.cost += i_ref_cost;
 967
                  /* strict '<': on a tie the first candidate is kept */
 968              if( m.cost < l0m->cost )
 969                  *l0m = m;
 970         }
             /* store the winning mv/ref of this half in the MB cache */
 971         x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, l0m->mv[0], l0m->mv[1] );
 972         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
 973     }
 974
 975     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
 976     if( a->b_mbrd )
 977     {
             /* keep the lowest pre-RD cost seen so far, then switch to the RD score */
 978         if( a->i_best_satd > a->l0.i_cost8x16 )
 979             a->i_best_satd = a->l0.i_cost8x16;
 980         h->mb.i_type = P_L0;
 981         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 982     }
 983 }
 984
 985 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
 986 {
         /* Chroma cost of one 8x8 luma block that is split into sub-8x8 partitions.
          * Both chroma planes (p_fref[4] and p_fref[5]) are motion compensated
          * into pix1/pix2 with the mvs of the sub-partitions selected by `pixel`:
          * PIXEL_4x4 uses me4x4, PIXEL_8x4 uses me8x4, any other value uses me4x8.
          * Returns the sum of mbcmp[PIXEL_4x4] against p_fenc[1] and p_fenc[2]. */
 987     uint8_t pix1[8*8], pix2[8*8];
 988     const int i_stride = h->mb.pic.i_stride[1];
         /* offset of this block inside the chroma plane: 4 pixels right for odd
          * i8x8, 4 rows down for i8x8 >= 2 */
 989     const int off = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
 990
     /* Compensate a width x height chroma area at (x,y) of the block for both
      * planes; pix1/pix2 are written with a stride of 8. */
 991 #define CHROMA4x4MC( width, height, me, x, y ) \
 992     h->mc.mc_chroma( &p_fref[4][off+x+y*i_stride], i_stride, &pix1[x+y*8], 8, (me).mv[0], (me).mv[1], width, height ); \
 993     h->mc.mc_chroma( &p_fref[5][off+x+y*i_stride], i_stride, &pix2[x+y*8], 8, (me).mv[0], (me).mv[1], width, height );
 994
 995     if( pixel == PIXEL_4x4 )
 996     {
 997         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
 998         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 0,2 );
 999         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 2,0 );
1000         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
1001     }
1002     else if( pixel == PIXEL_8x4 )
1003     {
1004         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
1005         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
1006     }
1007     else
1008     {
1009         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
1010         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
1011     }
1012
         /* one 4x4 comparison per chroma plane */
1013     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
1014          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
1015 }
1016
1017 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1018 {
         /* Motion search for the four 4x4 blocks of 8x8 partition i8x8 (list 0),
          * using the reference already chosen for that 8x8 partition.
          * Results go to a->l0.me4x4[i8x8][0..3]; a->l0.i_cost4x4[i8x8] is the sum
          * of the four costs plus the ref cost, the sub-mb type cost and, when
          * h->mb.b_chroma_me is set, the chroma cost. */
1019     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1020     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1021
1022     int i4x4;
1023
1024     /* XXX Needed for x264_mb_predict_mv */
1025     h->mb.i_partition = D_8x8;
1026
1027     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1028     {
1029         const int idx = 4*i8x8 + i4x4;
1030         const int x4 = block_idx_x[idx];
1031         const int y4 = block_idx_y[idx];
             /* only the first 4x4 block gets the 8x8 mv as a search candidate */
1032         const int i_mvc = (i4x4 == 0);
1033
1034         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1035
1036         m->i_pixel = PIXEL_4x4;
1037         m->p_cost_mv = a->p_cost_mv;
1038
1039         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1040         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
1041
1042         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1043         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1044
             /* cache the result before the next block is predicted */
1045         x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
1046     }
1047     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1048                             a->l0.me4x4[i8x8][1].cost +
1049                             a->l0.me4x4[i8x8][2].cost +
1050                             a->l0.me4x4[i8x8][3].cost +
1051                             REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
1052                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1053     if( h->mb.b_chroma_me )
1054         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1055 }
1056
1057 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1058 {
         /* Motion search for the two 8x4 blocks of 8x8 partition i8x8 (list 0),
          * using the reference already chosen for that 8x8 partition.
          * Results go to a->l0.me8x4[i8x8][0..1]; a->l0.i_cost8x4[i8x8] is the sum
          * of both costs plus the ref cost, the sub-mb type cost and, when
          * h->mb.b_chroma_me is set, the chroma cost.
          * Reads a->l0.me4x4[i8x8][0].mv as a search candidate. */
1059     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1060     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1061
1062     int i8x4;
1063
1064     /* XXX Needed for x264_mb_predict_mv */
1065     h->mb.i_partition = D_8x8;
1066
1067     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1068     {
             /* 4x4 block index of the left-most block of this 8x4 row */
1069         const int idx = 4*i8x8 + 2*i8x4;
1070         const int x4 = block_idx_x[idx];
1071         const int y4 = block_idx_y[idx];
             /* only the first 8x4 block gets a search candidate */
1072         const int i_mvc = (i8x4 == 0);
1073
1074         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1075
1076         m->i_pixel = PIXEL_8x4;
1077         m->p_cost_mv = a->p_cost_mv;
1078
1079         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1080         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
1081
1082         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1083         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1084
             /* cache the result before the next block is predicted */
1085         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
1086     }
1087     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1088                             REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
1089                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1090     if( h->mb.b_chroma_me )
1091         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1092 }
1093
1094 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1095 {
         /* Motion search for the two 4x8 blocks of 8x8 partition i8x8 (list 0),
          * using the reference already chosen for that 8x8 partition.
          * Results go to a->l0.me4x8[i8x8][0..1]; a->l0.i_cost4x8[i8x8] is the sum
          * of both costs plus the ref cost, the sub-mb type cost and, when
          * h->mb.b_chroma_me is set, the chroma cost.
          * Reads a->l0.me4x4[i8x8][0].mv as a search candidate. */
1096     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1097     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1098
1099     int i4x8;
1100
1101     /* XXX Needed for x264_mb_predict_mv */
1102     h->mb.i_partition = D_8x8;
1103
1104     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1105     {
             /* 4x4 block index of the top block of this 4x8 column */
1106         const int idx = 4*i8x8 + i4x8;
1107         const int x4 = block_idx_x[idx];
1108         const int y4 = block_idx_y[idx];
             /* only the first 4x8 block gets a search candidate */
1109         const int i_mvc = (i4x8 == 0);
1110
1111         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1112
1113         m->i_pixel = PIXEL_4x8;
1114         m->p_cost_mv = a->p_cost_mv;
1115
1116         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1117         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
1118
1119         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1120         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1121
             /* cache the result before the next block is predicted */
1122         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
1123     }
1124     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1125                             REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
1126                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1127     if( h->mb.b_chroma_me )
1128         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1129 }
1130
1131 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1132 {
         /* Cost of direct prediction, measured on the luma plane only.
          * Fills a->i_cost8x8direct[0..3] (per 8x8 block, including the
          * D_DIRECT_8x8 sub-mb type cost) and a->i_cost16x16direct (sum of the
          * four raw 8x8 scores plus the B_DIRECT mb type cost). */
1133     /* Assumes that fdec still contains the results of
1134      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1135
1136     uint8_t **p_fenc = h->mb.pic.p_fenc;
1137     uint8_t **p_fdec = h->mb.pic.p_fdec;
1138     int i_stride= h->mb.pic.i_stride[0];
1139     int i;
1140
1141     a->i_cost16x16direct = 0;
1142     for( i = 0; i < 4; i++ )
1143     {
1144         const int x8 = i%2;
1145         const int y8 = i/2;
1146         const int off = 8 * x8 + 8 * i_stride * y8;
             /* the raw score is added to the 16x16 total before the sub-mb
              * type cost is added to the 8x8 entry below */
1147         a->i_cost16x16direct +=
1148         a->i_cost8x8direct[i] =
1149             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
1150
1151         /* mb type cost */
1152         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1153     }
1154
1155     a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1156 }
1157 /* Combine pix1 with src2 for bi-prediction: avg_weight with
      * bipred_weight[l0 ref][l1 ref] when b_weighted_bipred is set, plain avg
      * otherwise. Callers read the result from pix1 afterwards.
      * Expects `h` and `a` to be in scope at the expansion site. */
1158 #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
1159     { \
1160         if( h->param.analyse.b_weighted_bipred ) \
1161             h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
1162                     h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
1163         else \
1164             h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
1165     }
1166
1167 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1168 {
         /* 16x16 analysis for a B macroblock.
          * Runs a 16x16 motion search over every list-0 and list-1 reference and
          * keeps the cheapest result per list in a->l0.me16x16 / a->l1.me16x16
          * (reference index in a->lX.i_ref). The bi-predicted block is then built
          * from the two winners and its cost stored in a->i_cost16x16bi.
          * On return all three costs include their mb type cost; the per-list
          * costs do not include the reference cost (it is subtracted below),
          * while the bi cost includes both reference costs. */
1169     uint8_t pix1[16*16], pix2[16*16];
1170     uint8_t *src2;
1171     int stride2 = 16;
1172     int src2_ref, pix1_ref;
1173
1174     x264_me_t m;
1175     int i_ref;
1176     int mvc[8][2], i_mvc;
1177     int i_fullpel_thresh = INT_MAX;
         /* the threshold is only handed to the search when there is more than one ref */
1178     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
1179
1180     /* 16x16 Search on all ref frame */
1181     m.i_pixel = PIXEL_16x16;
1182     m.p_cost_mv = a->p_cost_mv;
1183     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1184
1185     /* ME for List 0 */
1186     a->l0.me16x16.cost = INT_MAX;
1187     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
1188     {
1189         /* search with ref */
1190         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
1191         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1192         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1193         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
1194
1195         /* add ref cost */
1196         m.cost += REF_COST( 0, i_ref );
1197
1198         if( m.cost < a->l0.me16x16.cost )
1199         {
1200             a->l0.i_ref = i_ref;
1201             a->l0.me16x16 = m;
1202         }
1203
1204         /* save mv for predicting neighbors */
1205         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
1206         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
1207     }
1208     /* subtract ref cost, so we don't have to add it for the other MB types */
1209     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
1210
1211     /* ME for list 1 */
1212     i_fullpel_thresh = INT_MAX;
1213     p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL;
1214     a->l1.me16x16.cost = INT_MAX;
1215     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
1216     {
1217         /* search with ref */
1218         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
1219         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1220         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1221         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
1222
1223         /* add ref cost */
1224         m.cost += REF_COST( 1, i_ref );
1225
1226         if( m.cost < a->l1.me16x16.cost )
1227         {
1228             a->l1.i_ref = i_ref;
1229             a->l1.me16x16 = m;
1230         }
1231
1232         /* save mv for predicting neighbors */
1233         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
1234         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
1235     }
1236     /* subtract ref cost, so we don't have to add it for the other MB types */
1237     a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1238
1239     /* Set global ref, needed for other modes? */
1240     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1241     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1242
1243     /* get cost of BI mode */
         /* taken when both components of the l0 mv are even */
1244     if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
1245     {
1246         /* l0 reference is halfpel, so get_ref on it will make it faster */
1247         src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1248                         pix2, &stride2,
1249                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
1250                         16, 16 );
1251         h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1252                         pix1, 16,
1253                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1254                         16, 16 );
1255         src2_ref = a->l0.i_ref;
1256         pix1_ref = a->l1.i_ref;
1257     }
1258     else
1259     {
1260         /* if l0 was qpel, we'll use get_ref on l1 instead */
1261         h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1262                         pix1, 16,
1263                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
1264                         16, 16 );
1265         src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1266                         pix2, &stride2,
1267                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1268                         16, 16 );
1269         src2_ref = a->l1.i_ref;
1270         pix1_ref = a->l0.i_ref;
1271     }
1272
         /* NOTE(review): in the first branch above pix1 holds the l1 prediction,
          * so the table is indexed [l1 ref][l0 ref] here, whereas WEIGHTED_AVG in
          * this file always indexes it [l0 ref][l1 ref] -- confirm against the
          * definition of bipred_weight / avg_weight that this is intended. */
1273     if( h->param.analyse.b_weighted_bipred )
1274         h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
1275                 h->mb.bipred_weight[pix1_ref][src2_ref] );
1276     else
1277         h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
1278
         /* bi cost = distortion of the averaged block + both ref costs + both mv costs */
1279     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
1280                      + REF_COST( 0, a->l0.i_ref )
1281                      + REF_COST( 1, a->l1.i_ref )
1282                      + a->l0.me16x16.cost_mv
1283                      + a->l1.me16x16.cost_mv;
1284
1285     /* mb type cost */
1286     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1287     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1288     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1289 }
1290
1291 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1292 {
         /* Copy the list-0 mvs chosen for 8x8 partition i into the MB cache,
          * according to h->mb.i_sub_partition[i] (8x8, 8x4, 4x8 or 4x4).
          * Any other sub-partition value is logged as an internal error and
          * nothing is cached. */
         /* top-left corner of the partition, in 4x4 block units */
1293     const int x = 2*(i%2);
1294     const int y = 2*(i/2);
1295
1296     switch( h->mb.i_sub_partition[i] )
1297     {
1298         case D_L0_8x8:
1299             x264_macroblock_cache_mv( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
1300             break;
1301         case D_L0_8x4:
1302             x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv[0], a->l0.me8x4[i][0].mv[1] );
1303             x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv[0], a->l0.me8x4[i][1].mv[1] );
1304             break;
1305         case D_L0_4x8:
1306             x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv[0], a->l0.me4x8[i][0].mv[1] );
1307             x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv[0], a->l0.me4x8[i][1].mv[1] );
1308             break;
1309         case D_L0_4x4:
1310             x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv[0], a->l0.me4x4[i][0].mv[1] );
1311             x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv[0], a->l0.me4x4[i][1].mv[1] );
1312             x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv[0], a->l0.me4x4[i][2].mv[1] );
1313             x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv[0], a->l0.me4x4[i][3].mv[1] );
1314             break;
1315         default:
1316             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1317             break;
1318     }
1319 }
1320 /* Write ref and mv of one B partition (x,y,dx,dy in 4x4 block units) into the
      * MB cache for both lists. For each list, a nonzero
      * x264_mb_partition_listX_table[list][part] caches a->lX.i_ref and the mv of
      * me0/me1; otherwise ref is set to -1, the mv to 0 and, if b_mvd, the mvd to 0.
      * Expects `h`, `a` and `b_mvd` to be in scope at the expansion site. */
1321 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1322     if( x264_mb_partition_listX_table[0][part] ) \
1323     { \
1324         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1325         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, me0.mv[0], me0.mv[1] ); \
1326     } \
1327     else \
1328     { \
1329         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1330         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0, 0 ); \
1331         if( b_mvd ) \
1332             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
1333     } \
1334     if( x264_mb_partition_listX_table[1][part] ) \
1335     { \
1336         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1337         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, me1.mv[0], me1.mv[1] ); \
1338     } \
1339     else \
1340     { \
1341         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1342         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0, 0 ); \
1343         if( b_mvd ) \
1344             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
1345     }
1346
1347 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1348 {
         /* Cache ref/mv of B 8x8 partition i according to h->mb.i_sub_partition[i].
          * Direct partitions load their mvs through x264_mb_load_mv_direct8x8();
          * with b_mvd set their mvd is zeroed for both lists and the skip flag of
          * the partition is set to 1. All other types go through CACHE_MV_BI. */
         /* top-left corner of the partition, in 4x4 block units */
1349     int x = (i%2)*2;
1350     int y = (i/2)*2;
1351     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1352     {
1353         x264_mb_load_mv_direct8x8( h, i );
1354         if( b_mvd )
1355         {
1356             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0, 0 );
1357             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0, 0 );
1358             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1359         }
1360     }
1361     else
1362     {
1363         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1364     }
1365 }
1366 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1367 {
         /* Cache ref/mv of B 16x8 half i (0 = top, 1 = bottom) for both lists;
          * the lists in use are selected by a->i_mb_partition16x8[i]. */
1368     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1369 }
1370 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1371 {
         /* Cache ref/mv of B 8x16 half i (0 = left, 1 = right) for both lists;
          * the lists in use are selected by a->i_mb_partition8x16[i]. */
1372     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1373 }
1374 #undef CACHE_MV_BI
1375
1376 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1377 {
         /* 8x8 analysis for a B macroblock.
          * For each of the four 8x8 blocks, searches list 0 and list 1 (using the
          * references picked by the 16x16 analysis, a->l0.i_ref / a->l1.i_ref),
          * builds the bi-predicted block, and picks the cheapest of L0, L1, BI and
          * direct into h->mb.i_sub_partition[i].
          * a->i_cost8x8bi receives the sum of the four winners plus the B_8x8 mb
          * type cost. Reads a->i_cost8x8direct[], which must already be filled. */
1378     uint8_t **p_fref[2] =
1379         { h->mb.pic.p_fref[0][a->l0.i_ref],
1380           h->mb.pic.p_fref[1][a->l1.i_ref] };
1381     uint8_t pix[2][8*8];
1382     int i, l;
1383
1384     /* XXX Needed for x264_mb_predict_mv */
1385     h->mb.i_partition = D_8x8;
1386
1387     a->i_cost8x8bi = 0;
1388
1389     for( i = 0; i < 4; i++ )
1390     {
1391         const int x8 = i%2;
1392         const int y8 = i/2;
1393         int i_part_cost;
1394         int i_part_cost_bi = 0;
1395
1396         for( l = 0; l < 2; l++ )
1397         {
1398             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1399             x264_me_t *m = &lX->me8x8[i];
1400
1401             m->i_pixel = PIXEL_8x8;
1402             m->p_cost_mv = a->p_cost_mv;
1403
1404             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1405             LOAD_HPELS( m, p_fref[l], 8*x8, 8*y8 );
1406
                 /* predict from list l; the 16x16 mv of that list is the only candidate */
1407             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1408             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1409
1410             x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
1411
1412             /* BI mode */
1413             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1414                             m->mv[0], m->mv[1], 8, 8 );
1415             i_part_cost_bi += m->cost_mv;
1416             /* FIXME: ref cost */
1417         }
1418
             /* pix[0] is used as the bi-predicted block from here on */
1419         WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
1420         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
1421                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1422         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1423         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1424
             /* pick the cheapest mode; on ties the earlier one (L0, L1, BI, direct) wins */
1425         i_part_cost = a->l0.me8x8[i].cost;
1426         h->mb.i_sub_partition[i] = D_L0_8x8;
1427         if( a->l1.me8x8[i].cost < i_part_cost )
1428         {
1429             i_part_cost = a->l1.me8x8[i].cost;
1430             h->mb.i_sub_partition[i] = D_L1_8x8;
1431         }
1432         if( i_part_cost_bi < i_part_cost )
1433         {
1434             i_part_cost = i_part_cost_bi;
1435             h->mb.i_sub_partition[i] = D_BI_8x8;
1436         }
1437         if( a->i_cost8x8direct[i] < i_part_cost )
1438         {
1439             i_part_cost = a->i_cost8x8direct[i];
1440             h->mb.i_sub_partition[i] = D_DIRECT_8x8;
1441         }
1442         a->i_cost8x8bi += i_part_cost;
1443
1444         /* XXX Needed for x264_mb_predict_mv */
1445         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1446     }
1447
1448     /* mb type cost */
1449     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1450 }
1451
1452 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1453 {
         /* 16x8 analysis for a B macroblock.
          * For each half, searches list 0 and list 1 (using the references picked
          * by the 16x16 analysis), builds the bi-predicted block and picks the
          * cheapest of L0, L1 and BI into a->i_mb_partition16x8[i].
          * a->i_cost16x8bi receives the sum of both winners plus the mb type
          * cost; the resulting mb type is stored in a->i_mb_type16x8.
          * Reads the 8x8 mvs (lX->me8x8[]) as search candidates. */
1454     uint8_t **p_fref[2] =
1455         { h->mb.pic.p_fref[0][a->l0.i_ref],
1456           h->mb.pic.p_fref[1][a->l1.i_ref] };
1457     uint8_t pix[2][16*8];
1458     int mvc[2][2];
1459     int i, l;
1460
1461     h->mb.i_partition = D_16x8;
1462     a->i_cost16x8bi = 0;
1463
1464     for( i = 0; i < 2; i++ )
1465     {
1466         int i_part_cost;
1467         int i_part_cost_bi = 0;
1468
1469         /* TODO: check only the list(s) that were used in b8x8? */
1470         for( l = 0; l < 2; l++ )
1471         {
1472             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1473             x264_me_t *m = &lX->me16x8[i];
1474
1475             m->i_pixel = PIXEL_16x8;
1476             m->p_cost_mv = a->p_cost_mv;
1477
1478             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1479             LOAD_HPELS( m, p_fref[l], 0, 8*i );
1480
                 /* candidates: the mvs of the two 8x8 blocks covered by this half */
1481             mvc[0][0] = lX->me8x8[2*i].mv[0];
1482             mvc[0][1] = lX->me8x8[2*i].mv[1];
1483             mvc[1][0] = lX->me8x8[2*i+1].mv[0];
1484             mvc[1][1] = lX->me8x8[2*i+1].mv[1];
1485
                 /* predict from the list being searched, over the full 16x8
                  * partition width (4 blocks), as the P 16x8 analysis does */
1486             x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1487             x264_me_search( h, m, mvc, 2 );
1488
1489             /* BI mode */
1490             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
1491                             m->mv[0], m->mv[1], 16, 8 );
1492             /* FIXME: ref cost */
1493             i_part_cost_bi += m->cost_mv;
1494         }
1495
             /* pix[0] is used as the bi-predicted block from here on */
1496         WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
1497         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
1498
1499         i_part_cost = a->l0.me16x8[i].cost;
1500         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1501         if( a->l1.me16x8[i].cost < i_part_cost )
1502         {
1503             i_part_cost = a->l1.me16x8[i].cost;
1504             a->i_mb_partition16x8[i] = D_L1_8x8;
1505         }
             /* BI has to win by more than one lambda to be selected */
1506         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1507         {
1508             i_part_cost = i_part_cost_bi;
1509             a->i_mb_partition16x8[i] = D_BI_8x8;
1510         }
1511         a->i_cost16x8bi += i_part_cost;
1512
             /* cache the first half before the second half is analysed */
1513         if( i == 0 )
1514             x264_mb_cache_mv_b16x8( h, a, i, 0 );
1515     }
1516
1517     /* mb type cost */
         /* NOTE(review): the arithmetic below relies on the numeric layout of the
          * D_*_8x8 and B_*_* enums, which is defined outside this chunk */
1518     a->i_mb_type16x8 = B_L0_L0
1519         + (a->i_mb_partition16x8[0]>>2) * 3
1520         + (a->i_mb_partition16x8[1]>>2);
1521     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1522 }
1523 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1524 {
         /* 8x16 analysis for a B macroblock.
          * For each half, searches list 0 and list 1 (using the references picked
          * by the 16x16 analysis), builds the bi-predicted block and picks the
          * cheapest of L0, L1 and BI into a->i_mb_partition8x16[i].
          * a->i_cost8x16bi receives the sum of both winners plus the mb type
          * cost; the resulting mb type is stored in a->i_mb_type8x16.
          * Reads the 8x8 mvs (lX->me8x8[]) as search candidates. */
1525     uint8_t **p_fref[2] =
1526         { h->mb.pic.p_fref[0][a->l0.i_ref],
1527           h->mb.pic.p_fref[1][a->l1.i_ref] };
1528     uint8_t pix[2][8*16];
1529     int mvc[2][2];
1530     int i, l;
1531
1532     h->mb.i_partition = D_8x16;
1533     a->i_cost8x16bi = 0;
1534
1535     for( i = 0; i < 2; i++ )
1536     {
1537         int i_part_cost;
1538         int i_part_cost_bi = 0;
1539
1540         for( l = 0; l < 2; l++ )
1541         {
1542             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1543             x264_me_t *m = &lX->me8x16[i];
1544
1545             m->i_pixel = PIXEL_8x16;
1546             m->p_cost_mv = a->p_cost_mv;
1547
1548             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1549             LOAD_HPELS( m, p_fref[l], 8*i, 0 );
1550
                 /* candidates: the mvs of the two 8x8 blocks covered by this half */
1551             mvc[0][0] = lX->me8x8[i].mv[0];
1552             mvc[0][1] = lX->me8x8[i].mv[1];
1553             mvc[1][0] = lX->me8x8[i+2].mv[0];
1554             mvc[1][1] = lX->me8x8[i+2].mv[1];
1555
                 /* predict from the list being searched, not always from list 0 */
1556             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1557             x264_me_search( h, m, mvc, 2 );
1558
1559             /* BI mode */
1560             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1561                             m->mv[0], m->mv[1], 8, 16 );
1562             /* FIXME: ref cost */
1563             i_part_cost_bi += m->cost_mv;
1564         }
1565
             /* pix[0] is used as the bi-predicted block from here on */
1566         WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
1567         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
1568
1569         i_part_cost = a->l0.me8x16[i].cost;
1570         a->i_mb_partition8x16[i] = D_L0_8x8;
1571         if( a->l1.me8x16[i].cost < i_part_cost )
1572         {
1573             i_part_cost = a->l1.me8x16[i].cost;
1574             a->i_mb_partition8x16[i] = D_L1_8x8;
1575         }
             /* BI has to win by more than one lambda to be selected */
1576         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1577         {
1578             i_part_cost = i_part_cost_bi;
1579             a->i_mb_partition8x16[i] = D_BI_8x8;
1580         }
1581         a->i_cost8x16bi += i_part_cost;
1582
             /* cache the first half before the second half is analysed */
1583         if( i == 0 )
1584             x264_mb_cache_mv_b8x16( h, a, i, 0 );
1585     }
1586
1587     /* mb type cost */
         /* NOTE(review): the arithmetic below relies on the numeric layout of the
          * D_*_8x8 and B_*_* enums, which is defined outside this chunk; the
          * 16x8 cost table is indexed with the 8x16 type here */
1588     a->i_mb_type8x16 = B_L0_L0
1589         + (a->i_mb_partition8x16[0]>>2) * 3
1590         + (a->i_mb_partition8x16[1]>>2);
1591     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
1592 }
1593
1594 static inline void x264_mb_analyse_transform( x264_t *h )
1595 {
         /* Choose between the 4x4 and the 8x8 transform for the current MB.
          * The 8x8 transform is considered only when it is enabled in the
          * parameters, the MB type is not intra and
          * x264_mb_transform_8x8_allowed() agrees. In that case the MB is motion
          * compensated and h->mb.b_transform_8x8 is set when the sa8d score of
          * fenc vs. fdec is lower than the satd score.
          * When the 8x8 transform is not allowed, h->mb.b_transform_8x8 is left
          * untouched by this function. */
1596     h->mb.cache.b_transform_8x8_allowed =
1597         h->param.analyse.b_transform_8x8
1598         && !IS_INTRA( h->mb.i_type ) && x264_mb_transform_8x8_allowed( h );
1599
1600     if( h->mb.cache.b_transform_8x8_allowed )
1601     {
1602         int i_cost4, i_cost8;
1603         /* FIXME only luma mc is needed */
1604         x264_mb_mc( h );
1605
1606         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
1607                                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
1608         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
1609                                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
1610
             /* strict '<': on a tie the 4x4 transform is kept */
1611         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
1612     }
1613 }
1614
1615 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_cost )
1616 {
         /* RD-based transform size decision.
          * When the 8x8 transform is allowed, flips h->mb.b_transform_8x8,
          * re-evaluates the MB with x264_rd_cost_mb() and keeps the flipped
          * setting if its cost is not higher than *i_cost (ties keep the flip).
          * On success *i_cost is updated and a->i_best_satd is scaled by the same
          * ratio; otherwise the flag is flipped back. */
1617     h->mb.cache.b_transform_8x8_allowed =
1618         h->param.analyse.b_transform_8x8 && x264_mb_transform_8x8_allowed( h );
1619
1620     if( h->mb.cache.b_transform_8x8_allowed )
1621     {
1622         int i_cost8;
1623         x264_analyse_update_cache( h, a );
1624         h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
1625         /* FIXME only luma is needed, but the score for comparison already includes chroma */
1626         i_cost8 = x264_rd_cost_mb( h, a->i_lambda2 );
1627
1628         if( *i_cost >= i_cost8 )
1629         {
                 /* guard against division by zero; the product is formed in 64 bits */
1630             if( *i_cost > 0 )
1631                 a->i_best_satd = (int64_t)a->i_best_satd * i_cost8 / *i_cost;
1632             *i_cost = i_cost8;
1633         }
1634         else
1635             h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
1636     }
1637 }
1638
1639
1640 /*****************************************************************************
1641  * x264_macroblock_analyse:
      *
      * Picks the coding mode of the current macroblock. Depending on the slice
      * type (I, P or B) it runs the intra and inter analysis helpers, compares
      * the costs they report, and leaves the decision in h->mb.i_type,
      * h->mb.i_partition and h->mb.i_sub_partition[]. The chosen prediction
      * data is then written to the macroblock cache by
      * x264_analyse_update_cache(), and, unless analysis.b_mbrd is set,
      * x264_mb_analyse_transform() is run afterwards.
      *
      * h: encoder context; the state of the current macroblock is read from
      *    and written to h->mb.
1642  *****************************************************************************/
1643 void x264_macroblock_analyse( x264_t *h )
1644 {
1645     x264_mb_analysis_t analysis;
1646     int i_cost = COST_MAX;
1647     int i;
1648
1649     /* init analysis, using the QP returned by x264_ratecontrol_qp() */
1650     x264_mb_analyse_init( h, &analysis, x264_ratecontrol_qp( h ) );
1651
1652     /*--------------------------- Do the analysis ---------------------------*/
1653     if( h->sh.i_type == SLICE_TYPE_I )
1654     {
              /* I slice: only intra modes are candidates. COST_MAX is passed as
               * the third argument (presumably a cost bound, i.e. no early
               * termination on this path -- TODO confirm). */
1655         x264_mb_analyse_intra( h, &analysis, COST_MAX );
1656
              /* Start from I_16x16 and switch to I_4x4, then I_8x8, whenever
               * the candidate is strictly cheaper than the best so far. */
1657         i_cost = analysis.i_sad_i16x16;
1658         h->mb.i_type = I_16x16;
1659         if( analysis.i_sad_i4x4 < i_cost )
1660         {
1661             i_cost = analysis.i_sad_i4x4;
1662             h->mb.i_type = I_4x4;
1663         }
              /* i_cost is not updated for I_8x8; it is not read again on this
               * path. */
1664         if( analysis.i_sad_i8x8 < i_cost )
1665             h->mb.i_type = I_8x8;
1666     }
1667     else if( h->sh.i_type == SLICE_TYPE_P )
1668     {
1669         int b_skip = 0;
1670         int i_intra_cost, i_intra_type;
1671
1672         /* Fast P_SKIP detection: the skip probe is only run when the
              * macroblock is not lossless and at least one of the left, top,
              * topleft or topright neighbours has type P_SKIP. */
1673         if( !h->mb.b_lossless &&
1674            (( h->mb.i_mb_type_left == P_SKIP ) ||
1675             ( h->mb.i_mb_type_top == P_SKIP ) ||
1676             ( h->mb.i_mb_type_topleft == P_SKIP ) ||
1677             ( h->mb.i_mb_type_topright == P_SKIP )))
1678         {
1679             b_skip = x264_macroblock_probe_pskip( h );
1680         }
1681
1682         if( b_skip )
1683         {
                  /* Skip accepted: no further inter or intra analysis is done. */
1684             h->mb.i_type = P_SKIP;
1685             h->mb.i_partition = D_16x16;
1686         }
1687         else
1688         {
1689             const unsigned int flags = h->param.analyse.inter;
1690             int i_type;
1691             int i_partition;
1692             int i_thresh16x8;
1693
1694             x264_mb_analyse_load_costs( h, &analysis );
1695
                  /* 16x16 is always analysed; 8x8 only when PSUB16x16 is
                   * enabled, using the mixed-reference variant if requested. */
1696             x264_mb_analyse_inter_p16x16( h, &analysis );
1697             if( flags & X264_ANALYSE_PSUB16x16 )
1698             {
1699                 if( h->param.analyse.b_mixed_references )
1700                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
1701                 else
1702                     x264_mb_analyse_inter_p8x8( h, &analysis );
1703             }
1704
1705             /* Select best inter mode, starting from P_L0 16x16 */
1706             i_type = P_L0;
1707             i_partition = D_16x16;
1708             i_cost = analysis.l0.me16x16.cost;
1709
1710             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1711                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
1712             {
1713                 int i; /* shadows the function-scope i */
1714
                      /* 8x8 beats 16x16: default every sub-partition to 8x8 */
1715                 i_type = P_8x8;
1716                 i_partition = D_8x8;
1717                 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1718                 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1719
1720                 i_cost = analysis.l0.i_cost8x8;
1721
1722                 /* Do sub 8x8 */
1723                 if( flags & X264_ANALYSE_PSUB8x8 )
1724                 {
                          /* i_cost_bak keeps the all-8x8 cost so the sub-8x8
                           * choice can be undone after the RD check below. */
1725                     int i_cost_bak = i_cost;
1726                     int b_sub8x8 = 0;
1727                     for( i = 0; i < 4; i++ )
1728                     {
                              /* 8x4 and 4x8 are only tried for an 8x8 block
                               * whose 4x4 split already beats its 8x8 cost. */
1729                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
1730                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
1731                         {
1732                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
1733                             h->mb.i_sub_partition[i] = D_L0_4x4;
1734
1735                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
1736                             if( analysis.l0.i_cost8x4[i] < i_cost8x8 )
1737                             {
1738                                 h->mb.i_sub_partition[i] = D_L0_8x4;
1739                                 i_cost8x8 = analysis.l0.i_cost8x4[i];
1740                             }
1741
1742                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
1743                             if( analysis.l0.i_cost4x8[i] < i_cost8x8 )
1744                             {
1745                                 h->mb.i_sub_partition[i] = D_L0_4x8;
1746                                 i_cost8x8 = analysis.l0.i_cost4x8[i];
1747                             }
1748
                                  /* replace the 8x8 cost of this block by the
                                   * cost of the chosen sub-partition */
1749                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
1750                             b_sub8x8 = 1;
1751                         }
                              /* the MVs of block i are cached on every
                               * iteration, before the next block is analysed */
1752                         x264_mb_cache_mv_p8x8( h, &analysis, i );
1753                     }
1754                     /* TODO: RD per subpartition */
                          /* With b_mbrd set and at least one block split below
                           * 8x8, the cost is recomputed by x264_rd_cost_mb();
                           * if it exceeds the all-8x8 cost, all sub-partitions
                           * are reset to D_L0_8x8. */
1755                     if( b_sub8x8 && analysis.b_mbrd )
1756                     {
1757                         i_cost = x264_rd_cost_mb( h, analysis.i_lambda2 );
1758                         if( i_cost > i_cost_bak )
1759                         {
1760                             i_cost = i_cost_bak;
1761                             h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1762                             h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1763                         }
1764                     }
1765                 }
1766             }
1767
1768             /* Now do 16x8/8x16. They are only tried when PSUB16x16 is set and
                  * the 8x8 cost is below the 16x16 cost plus a threshold built
                  * from the MV costs of 8x8 blocks 1 and 2 (rescaled by
                  * i_lambda2 / i_lambda when b_mbrd is set).
                  * NOTE(review): the threshold is computed before the flag is
                  * tested; when PSUB16x16 is off the 8x8 analysis above did not
                  * run in this call, so confirm these cost_mv fields are
                  * initialised elsewhere. */
1769             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
1770             if( analysis.b_mbrd )
1771                 i_thresh16x8 = i_thresh16x8 * analysis.i_lambda2 / analysis.i_lambda;
1772             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1773                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
1774             {
1775                 x264_mb_analyse_inter_p16x8( h, &analysis );
1776                 if( analysis.l0.i_cost16x8 < i_cost )
1777                 {
1778                     i_type = P_L0;
1779                     i_partition = D_16x8;
1780                     i_cost = analysis.l0.i_cost16x8;
1781                 }
1782
1783                 x264_mb_analyse_inter_p8x16( h, &analysis );
1784                 if( analysis.l0.i_cost8x16 < i_cost )
1785                 {
1786                     i_type = P_L0;
1787                     i_partition = D_8x16;
1788                     i_cost = analysis.l0.i_cost8x16;
1789                 }
1790             }
1791
1792             h->mb.i_partition = i_partition;
1793
1794             /* refine qpel: with b_mbrd the type is committed and
                  * x264_mb_analyse_transform_rd() gets i_cost by pointer;
                  * otherwise the motion of the winning partition is refined and
                  * i_cost is rebuilt from the refined costs. */
1795             //FIXME mb_type costs?
1796             if( analysis.b_mbrd )
1797             {
1798                 h->mb.i_type = i_type;
1799                 x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
1800             }
1801             else if( i_partition == D_16x16 )
1802             {
1803                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1804                 i_cost = analysis.l0.me16x16.cost;
1805             }
1806             else if( i_partition == D_16x8 )
1807             {
1808                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
1809                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
1810                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
1811             }
1812             else if( i_partition == D_8x16 )
1813             {
1814                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
1815                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
1816                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
1817             }
1818             else if( i_partition == D_8x8 )
1819             {
                      /* refine every motion search of each 8x8 block according
                       * to its sub-partition and sum the refined costs */
1820                 int i8x8;
1821                 i_cost = 0;
1822                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
1823                 {
1824                     switch( h->mb.i_sub_partition[i8x8] )
1825                     {
1826                         case D_L0_8x8:
1827                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
1828                             i_cost += analysis.l0.me8x8[i8x8].cost;
1829                             break;
1830                         case D_L0_8x4:
1831                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
1832                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
1833                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
1834                                       analysis.l0.me8x4[i8x8][1].cost;
1835                             break;
1836                         case D_L0_4x8:
1837                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
1838                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
1839                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
1840                                       analysis.l0.me4x8[i8x8][1].cost;
1841                             break;
1842
1843                         case D_L0_4x4:
1844                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
1845                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
1846                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
1847                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
1848                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
1849                                       analysis.l0.me4x4[i8x8][1].cost +
1850                                       analysis.l0.me4x4[i8x8][2].cost +
1851                                       analysis.l0.me4x4[i8x8][3].cost;
1852                             break;
1853                         default:
                                  /* any other sub-partition value is only
                                   * logged; nothing is added to i_cost */
1854                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
1855                             break;
1856                     }
1857                 }
1858             }
1859
                  /* Intra candidates; the best inter cost is passed as the third
                   * argument (presumably as a cost bound -- TODO confirm). */
1860             x264_mb_analyse_intra( h, &analysis, i_cost );
                  /* With chroma ME and without b_mbrd, the chroma intra cost is
                   * added to all three intra costs, but only if at least one of
                   * them currently beats the inter cost. */
1861             if( h->mb.b_chroma_me && !analysis.b_mbrd &&
1862                 ( analysis.i_sad_i16x16 < i_cost
1863                || analysis.i_sad_i8x8 < i_cost
1864                || analysis.i_sad_i4x4 < i_cost ))
1865             {
1866                 x264_mb_analyse_intra_chroma( h, &analysis );
1867                 analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma;
1868                 analysis.i_sad_i8x8 += analysis.i_sad_i8x8chroma;
1869                 analysis.i_sad_i4x4 += analysis.i_sad_i8x8chroma;
1870             }
1871
                  /* best intra mode: I_16x16, then I_8x8, then I_4x4, each
                   * taken only when strictly cheaper */
1872             i_intra_type = I_16x16;
1873             i_intra_cost = analysis.i_sad_i16x16;
1874
1875             if( analysis.i_sad_i8x8 < i_intra_cost )
1876             {
1877                 i_intra_type = I_8x8;
1878                 i_intra_cost = analysis.i_sad_i8x8;
1879             }
1880             if( analysis.i_sad_i4x4 < i_intra_cost )
1881             {
1882                 i_intra_type = I_4x4;
1883                 i_intra_cost = analysis.i_sad_i4x4;
1884             }
1885
                  /* intra wins only when strictly cheaper than the best inter */
1886             if( i_intra_cost < i_cost )
1887             {
1888                 i_type = i_intra_type;
1889                 i_cost = i_intra_cost;
1890             }
1891
                  /* Frame statistics: i_cost is the overall best cost at this
                   * point, i.e. the intra cost when intra was selected above. */
1892             h->mb.i_type = i_type;
1893             h->stat.frame.i_intra_cost += i_intra_cost;
1894             h->stat.frame.i_inter_cost += i_cost;
1895         }
1896     }
1897     else if( h->sh.i_type == SLICE_TYPE_B )
1898     {
1899         int b_skip = 0;
1900
              /* When the direct prediction is available the macroblock is set
               * up as B_SKIP and motion compensated before the skip probe. */
1901         analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h );
1902         if( analysis.b_direct_available )
1903         {
1904             h->mb.i_type = B_SKIP;
1905             x264_mb_mc( h );
1906
1907             /* Conditioning the probe on neighboring block types
1908              * doesn't seem to help speed or quality. */
1909             b_skip = !h->mb.b_lossless && x264_macroblock_probe_bskip( h );
1910         }
1911
1912         if( !b_skip )
1913         {
1914             const unsigned int flags = h->param.analyse.inter;
1915             int i_partition;
1916
1917             x264_mb_analyse_load_costs( h, &analysis );
1918
1919             /* select best inter mode */
1920             /* direct must be first */
1921             if( analysis.b_direct_available )
1922                 x264_mb_analyse_inter_direct( h, &analysis );
1923
1924             x264_mb_analyse_inter_b16x16( h, &analysis );
1925
                  /* 16x16 candidates in order L0, L1, BI, direct; a later
                   * candidate replaces the current one only when strictly
                   * cheaper.
                   * NOTE(review): i_cost16x16direct is compared even when direct
                   * is unavailable; presumably it is left at a large value in
                   * that case -- confirm in the analysis init. */
1926             h->mb.i_type = B_L0_L0;
1927             i_partition = D_16x16;
1928             i_cost = analysis.l0.me16x16.cost;
1929             if( analysis.l1.me16x16.cost < i_cost )
1930             {
1931                 h->mb.i_type = B_L1_L1;
1932                 i_cost = analysis.l1.me16x16.cost;
1933             }
1934             if( analysis.i_cost16x16bi < i_cost )
1935             {
1936                 h->mb.i_type = B_BI_BI;
1937                 i_cost = analysis.i_cost16x16bi;
1938             }
1939             if( analysis.i_cost16x16direct < i_cost )
1940             {
1941                 h->mb.i_type = B_DIRECT;
1942                 i_cost = analysis.i_cost16x16direct;
1943             }
1944
                  /* 16x8 and 8x16 are only examined after 8x8 has beaten the
                   * best 16x16 mode, and only when two 8x8 blocks that would
                   * share a partition have the same sub-partition type. */
1945             if( flags & X264_ANALYSE_BSUB16x16 )
1946             {
1947                 x264_mb_analyse_inter_b8x8( h, &analysis );
1948                 if( analysis.i_cost8x8bi < i_cost )
1949                 {
1950                     h->mb.i_type = B_8x8;
1951                     i_partition = D_8x8;
1952                     i_cost = analysis.i_cost8x8bi;
1953
1954                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
1955                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
1956                     {
1957                         x264_mb_analyse_inter_b16x8( h, &analysis );
1958                         if( analysis.i_cost16x8bi < i_cost )
1959                         {
1960                             i_partition = D_16x8;
1961                             i_cost = analysis.i_cost16x8bi;
1962                             h->mb.i_type = analysis.i_mb_type16x8;
1963                         }
1964                     }
1965                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
1966                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
1967                     {
1968                         x264_mb_analyse_inter_b8x16( h, &analysis );
1969                         if( analysis.i_cost8x16bi < i_cost )
1970                         {
1971                             i_partition = D_8x16;
1972                             i_cost = analysis.i_cost8x16bi;
1973                             h->mb.i_type = analysis.i_mb_type8x16;
1974                         }
1975                     }
1976                 }
1977             }
1978
1979             h->mb.i_partition = i_partition;
1980
1981             /* refine qpel */
1982             if( i_partition == D_16x16 )
1983             {
                      /* The macroblock type cost is removed from both 16x16
                       * costs before refinement and added back when i_cost is
                       * rebuilt. For B_BI_BI both lists are refined but i_cost
                       * is left unchanged; B_DIRECT is not refined. */
1984                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1985                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1986                 if( h->mb.i_type == B_L0_L0 )
1987                 {
1988                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1989                     i_cost = analysis.l0.me16x16.cost
1990                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1991                 }
1992                 else if( h->mb.i_type == B_L1_L1 )
1993                 {
1994                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1995                     i_cost = analysis.l1.me16x16.cost
1996                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1997                 }
1998                 else if( h->mb.i_type == B_BI_BI )
1999                 {
2000                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2001                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2002                 }
2003             }
2004             else if( i_partition == D_16x8 )
2005             {
                      /* list 0 is refined unless the half is L1-only, list 1
                       * unless it is L0-only; i_cost is not updated here */
2006                 for( i=0; i<2; i++ )
2007                 {
2008                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2009                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2010                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2011                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2012                 }
2013             }
2014             else if( i_partition == D_8x16 )
2015             {
                      /* same rule as 16x8, applied to the two 8x16 halves */
2016                 for( i=0; i<2; i++ )
2017                 {
2018                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2019                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2020                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2021                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2022                 }
2023             }
2024             else if( i_partition == D_8x8 )
2025             {
                      /* Per 8x8 block: direct blocks are skipped; for each list
                       * the sub-partition uses, the sub-type cost is removed,
                       * the search is refined, and (for non-bidir blocks only)
                       * the change in cost is applied to analysis.i_cost8x8bi.
                       * The local i_cost is not updated in this branch. */
2026                 for( i=0; i<4; i++ )
2027                 {
2028                     x264_me_t *m;
2029                     int i_part_cost_old;
2030                     int i_type_cost;
2031                     int i_part_type = h->mb.i_sub_partition[i];
2032                     int b_bidir = (i_part_type == D_BI_8x8);
2033
2034                     if( i_part_type == D_DIRECT_8x8 )
2035                         continue;
2036                     if( x264_mb_partition_listX_table[0][i_part_type] )
2037                     {
2038                         m = &analysis.l0.me8x8[i];
2039                         i_part_cost_old = m->cost;
2040                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2041                         m->cost -= i_type_cost;
2042                         x264_me_refine_qpel( h, m );
2043                         if( !b_bidir )
2044                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2045                     }
2046                     if( x264_mb_partition_listX_table[1][i_part_type] )
2047                     {
2048                         m = &analysis.l1.me8x8[i];
2049                         i_part_cost_old = m->cost;
2050                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2051                         m->cost -= i_type_cost;
2052                         x264_me_refine_qpel( h, m );
2053                         if( !b_bidir )
2054                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2055                     }
2056                     /* TODO: update mvp? */
2057                 }
2058             }
2059
2060             /* best intra mode: each intra type replaces the current choice
                  * only when strictly cheaper than the running best cost */
2061             x264_mb_analyse_intra( h, &analysis, i_cost );
2062
2063             if( analysis.i_sad_i16x16 < i_cost )
2064             {
2065                 h->mb.i_type = I_16x16;
2066                 i_cost = analysis.i_sad_i16x16;
2067             }
2068             if( analysis.i_sad_i8x8 < i_cost )
2069             {
2070                 h->mb.i_type = I_8x8;
2071                 i_cost = analysis.i_sad_i8x8;
2072             }
2073             if( analysis.i_sad_i4x4 < i_cost )
2074             {
2075                 h->mb.i_type = I_4x4;
2076                 i_cost = analysis.i_sad_i4x4;
2077             }
2078         }
2079     }
2080
          /* Commit the decision to the macroblock cache. */
2081     x264_analyse_update_cache( h, &analysis );
2082
          /* Without b_mbrd the transform analysis runs here; with b_mbrd the
           * only transform decision visible in this function is the P-slice
           * call to x264_mb_analyse_transform_rd() above. */
2083     if( !analysis.b_mbrd )
2084         x264_mb_analyse_transform( h );
2085 }
2086
2087 /*-------------------- Update MB from the analysis ----------------------*/
      /* Writes the prediction data selected by the analysis into the macroblock
       * cache, dispatching on h->mb.i_type (and on h->mb.i_partition for the
       * partitioned inter types).
       *
       * h: encoder context whose h->mb cache is updated.
       * a: analysis results (intra prediction modes, motion search results).
       *
       * NOTE(review): the cache helpers appear to take x, y, width, height
       * (the whole macroblock being 0, 0, 4, 4), then the list index and the
       * value(s) to store -- confirm against their definitions. */
2088 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
2089 {
2090     int i;
2091
2092     switch( h->mb.i_type )
2093     {
2094         case I_4x4:
                  /* copy the 16 chosen 4x4 modes, then analyse chroma */
2095             for( i = 0; i < 16; i++ )
2096             {
2097                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
2098                     a->i_predict4x4[block_idx_x[i]][block_idx_y[i]];
2099             }
2100
2101             x264_mb_analyse_intra_chroma( h, a );
2102             break;
2103         case I_8x8:
                  /* block i sits at x = 2*(i&1), y = 2*(i>>1) */
2104             for( i = 0; i < 4; i++ )
2105                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1),
2106                     a->i_predict8x8[i&1][i>>1] );
2107
2108             x264_mb_analyse_intra_chroma( h, a );
2109             break;
2110         case I_16x16:
2111             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2112             x264_mb_analyse_intra_chroma( h, a );
2113             break;
2114
2115         case P_L0:
                  /* list 0 reference and MV of each partition */
2116             switch( h->mb.i_partition )
2117             {
2118                 case D_16x16:
2119                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
2120                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
2121                     break;
2122
2123                 case D_16x8:
2124                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
2125                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
2126                     x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
2127                     x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
2128                     break;
2129
2130                 case D_8x16:
2131                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
2132                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
2133                     x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
2134                     x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
2135                     break;
2136
2137                 default:
                          /* any other partition is only logged */
2138                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
2139                     break;
2140             }
2141             break;
2142
2143         case P_8x8:
                  /* one reference per 8x8 block; the MVs are written by
                   * x264_mb_cache_mv_p8x8() for each block */
2144             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2145             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2146             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2147             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2148             for( i = 0; i < 4; i++ )
2149                 x264_mb_cache_mv_p8x8( h, a, i );
2150             break;
2151
2152         case P_SKIP:
2153         {
                  /* the skip MV comes from x264_mb_predict_mv_pskip() */
2154             int mvp[2];
2155             x264_mb_predict_mv_pskip( h, mvp );
2156             /* force a 16x16 partition with reference 0 and that MV */
2157             h->mb.i_partition = D_16x16;
2158             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
2159             x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
2160             break;
2161         }
2162
2163         case B_SKIP:
2164             /* nothing has changed since x264_macroblock_probe_bskip */
2165             break;
2166         case B_DIRECT:
                  /* load the direct MVs of the four 8x8 blocks */
2167             x264_mb_load_mv_direct8x8( h, 0 );
2168             x264_mb_load_mv_direct8x8( h, 1 );
2169             x264_mb_load_mv_direct8x8( h, 2 );
2170             x264_mb_load_mv_direct8x8( h, 3 );
2171             break;
2172
2173         case B_8x8:
2174             /* optimize: cache might not need to be rewritten */
2175             for( i = 0; i < 4; i++ )
2176                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
2177             break;
2178
2179         default: /* the rest of the B types */
2180             switch( h->mb.i_partition )
2181             {
2182             case D_16x16:
                      /* The list that is not used gets reference -1 and zeroed
                       * mv and mvd. This inner switch has no default: any type
                       * other than the three below writes nothing. */
2183                 switch( h->mb.i_type )
2184                 {
2185                 case B_L0_L0:
2186                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
2187                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
2188
2189                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
2190                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
2191                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
2192                     break;
2193                 case B_L1_L1:
2194                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
2195                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
2196                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
2197
2198                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
2199                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
2200                     break;
2201                 case B_BI_BI:
2202                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
2203                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
2204
2205                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
2206                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
2207                     break;
2208                 }
2209                 break;
2210             case D_16x8:
                      /* both halves are delegated to the b16x8 cache helper */
2211                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
2212                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
2213                 break;
2214             case D_8x16:
                      /* both halves are delegated to the b8x16 cache helper */
2215                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
2216                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
2217                 break;
2218             default:
                      /* any other partition is only logged */
2219                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
2220                 break;
2221             }
2222     }
2223 }
2224
2225 #include "slicetype_decision.c"
2226