git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27 #include <math.h>
  28 #include <limits.h>
  29
  30 #include "common/common.h"
  31 #include "common/macroblock.h"
  32 #include "macroblock.h"
  33 #include "me.h"
  34 #include "ratecontrol.h"
  35
/* Motion-estimation results for one reference list of the macroblock being
 * analysed: one x264_me_t per partition, plus the summed cost of each way of
 * partitioning the macroblock. */
typedef struct
{
    /* 16x16 */
    int i_ref;              /* reference index kept by the 16x16 search */
    x264_me_t me16x16;

    /* 8x8 */
    int       i_cost8x8;    /* sum of the four me8x8[] costs */
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];  /* [8x8 partition][4x4 sub-block] */

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];  /* [8x8 partition][8x4 sub-block] */

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    /* NOTE(review): the code visible in this part of the file only touches
     * me4x8[i][0] and me4x8[i][1]; confirm whether [4][2] would be enough. */
    x264_me_t me4x8[4][4];

    /* 16x8 */
    int       i_cost16x8;   /* sum of the two me16x8[] costs */
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;   /* sum of the two me8x16[] costs */
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;
  67
/* Working state of the mode decision for one macroblock: the QP/lambda the
 * analysis runs with, the best intra predictions found so far, and the inter
 * search results for both reference lists. */
typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;       /* taken from i_qp0_cost_table[i_qp] */
    int i_qp;
    int16_t *p_cost_mv; /* mv cost table for i_qp, set by x264_mb_analyse_load_costs */


    /* I: Intra part */
    /* Luma part 16x16 and 4x4 modes stats */
    int i_sad_i16x16;   /* best 16x16 cost (SATD + lambda * mode bits) */
    int i_predict16x16; /* prediction mode that gave i_sad_i16x16 */

    int i_sad_i4x4;     /* sum of the best cost of each of the 16 4x4 blocks */
    int i_predict4x4[4][4]; /* best mode per 4x4 block, indexed [x][y] */

    /* Chroma part */
    int i_sad_i8x8;     /* best chroma cost, both chroma planes summed */
    int i_predict8x8;   /* chroma prediction mode that gave i_sad_i8x8 */

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4]; /* per 8x8 partition */
    int i_cost16x8bi;
    int i_cost8x16bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;

} x264_mb_analysis_t;
 107
 108 static const int i_qp0_cost_table[52] = {
 109    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 110    1, 1, 1, 1,              /*  8-11 */
 111    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 112    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 113    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 114   16,18,20,23,25,29,32,36,  /* 36-43 */
 115   40,45,51,57,64,72,81,91   /* 44-51 */
 116 };
 117
 118 static const uint8_t block_idx_x[16] = {
 119     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
 120 };
 121 static const uint8_t block_idx_y[16] = {
 122     0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
 123 };
 124
 125 /* TODO: calculate CABAC costs */
 126 static const int i_mb_b_cost_table[18] = {
 127     9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 128 };
 129 static const int i_mb_b16x8_cost_table[16] = {
 130     0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 131 };
 132 static const int i_sub_mb_b_cost_table[13] = {
 133     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 134 };
 135 static const int i_sub_mb_p_cost_table[4] = {
 136     5, 3, 3, 1
 137 };
 138
 139 /* initialize an array of lambda*nbits for all possible mvs */
 140 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 141 {
 142     static int16_t *p_cost_mv[52];
 143
 144     if( !p_cost_mv[a->i_qp] )
 145     {
 146         /* could be faster, but isn't called many times */
 147         /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 148         int i;
 149         p_cost_mv[a->i_qp] = x264_malloc( (4*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) );
 150         p_cost_mv[a->i_qp] += 2*4*h->param.analyse.i_mv_range;
 151         for( i = 0; i <= 2*4*h->param.analyse.i_mv_range; i++ )
 152         {
 153             p_cost_mv[a->i_qp][-i] =
 154             p_cost_mv[a->i_qp][i]  = a->i_lambda * bs_size_se( i );
 155         }
 156     }
 157
 158     a->p_cost_mv = p_cost_mv[a->i_qp];
 159 }
 160
 161 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 162 {
 163     memset( a, 0, sizeof( x264_mb_analysis_t ) );
 164
 165     /* conduct the analysis using this lamda and QP */
 166     a->i_qp = i_qp;
 167     a->i_lambda = i_qp0_cost_table[i_qp];
 168
 169     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 170     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 171                         && h->mb.i_subpel_refine >= 5;
 172
 173     /* I: Intra part */
 174     a->i_sad_i16x16 =
 175     a->i_sad_i4x4   =
 176     a->i_sad_i8x8   = COST_MAX;
 177
 178     /* II: Inter part P/B frame */
 179     if( h->sh.i_type != SLICE_TYPE_I )
 180     {
 181         int i;
 182         int i_fmv_range = h->param.analyse.i_mv_range - 16;
 183
 184         /* Calculate max allowed MV range */
 185 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
 186         h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
 187         h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 );
 188         h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
 189         h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
 190         if( h->mb.i_mb_x == 0)
 191         {
 192             h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
 193             h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 );
 194             h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
 195             h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
 196         }
 197 #undef CLIP_FMV
 198
 199         a->l0.me16x16.cost =
 200         a->l0.i_cost8x8    = COST_MAX;
 201
 202         for( i = 0; i < 4; i++ )
 203         {
 204             a->l0.i_cost4x4[i] =
 205             a->l0.i_cost8x4[i] =
 206             a->l0.i_cost4x8[i] = COST_MAX;
 207         }
 208
 209         a->l0.i_cost16x8   =
 210         a->l0.i_cost8x16   = COST_MAX;
 211         if( h->sh.i_type == SLICE_TYPE_B )
 212         {
 213             a->l1.me16x16.cost =
 214             a->l1.i_cost8x8    = COST_MAX;
 215
 216             for( i = 0; i < 4; i++ )
 217             {
 218                 a->l1.i_cost4x4[i] =
 219                 a->l1.i_cost8x4[i] =
 220                 a->l1.i_cost4x8[i] =
 221                 a->i_cost8x8direct[i] = COST_MAX;
 222             }
 223
 224             a->l1.i_cost16x8   =
 225             a->l1.i_cost8x16   =
 226
 227             a->i_cost16x16bi   =
 228             a->i_cost16x16direct =
 229             a->i_cost8x8bi     =
 230             a->i_cost16x8bi    =
 231             a->i_cost8x16bi    = COST_MAX;
 232         }
 233     }
 234 }
 235
 236
 237
 238 /*
 239  * Handle intra mb
 240  */
 241 /* Max = 4 */
 242 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 243 {
 244     if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
 245     {
 246         /* top and left avaible */
 247         *mode++ = I_PRED_16x16_V;
 248         *mode++ = I_PRED_16x16_H;
 249         *mode++ = I_PRED_16x16_DC;
 250         *mode++ = I_PRED_16x16_P;
 251         *pi_count = 4;
 252     }
 253     else if( ( i_neighbour & MB_LEFT ) )
 254     {
 255         /* left available*/
 256         *mode++ = I_PRED_16x16_DC_LEFT;
 257         *mode++ = I_PRED_16x16_H;
 258         *pi_count = 2;
 259     }
 260     else if( ( i_neighbour & MB_TOP ) )
 261     {
 262         /* top available*/
 263         *mode++ = I_PRED_16x16_DC_TOP;
 264         *mode++ = I_PRED_16x16_V;
 265         *pi_count = 2;
 266     }
 267     else
 268     {
 269         /* none avaible */
 270         *mode = I_PRED_16x16_DC_128;
 271         *pi_count = 1;
 272     }
 273 }
 274
 275 /* Max = 4 */
 276 static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 277 {
 278     if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
 279     {
 280         /* top and left avaible */
 281         *mode++ = I_PRED_CHROMA_V;
 282         *mode++ = I_PRED_CHROMA_H;
 283         *mode++ = I_PRED_CHROMA_DC;
 284         *mode++ = I_PRED_CHROMA_P;
 285         *pi_count = 4;
 286     }
 287     else if( ( i_neighbour & MB_LEFT ) )
 288     {
 289         /* left available*/
 290         *mode++ = I_PRED_CHROMA_DC_LEFT;
 291         *mode++ = I_PRED_CHROMA_H;
 292         *pi_count = 2;
 293     }
 294     else if( ( i_neighbour & MB_TOP ) )
 295     {
 296         /* top available*/
 297         *mode++ = I_PRED_CHROMA_DC_TOP;
 298         *mode++ = I_PRED_CHROMA_V;
 299         *pi_count = 2;
 300     }
 301     else
 302     {
 303         /* none avaible */
 304         *mode = I_PRED_CHROMA_DC_128;
 305         *pi_count = 1;
 306     }
 307 }
 308
 309 /* MAX = 8 */
 310 static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
 311 {
 312     int b_a, b_b, b_c;
 313     static const unsigned int needmb[16] =
 314     {
 315         MB_LEFT|MB_TOP, MB_TOP,
 316         MB_LEFT,        MB_PRIVATE,
 317         MB_TOP,         MB_TOP|MB_TOPRIGHT,
 318         0,              MB_PRIVATE,
 319         MB_LEFT,        0,
 320         MB_LEFT,        MB_PRIVATE,
 321         0,              MB_PRIVATE,
 322         0,              MB_PRIVATE
 323     };
 324
 325     /* FIXME even when b_c == 0 there is some case where missing pixels
 326      * are emulated and thus more mode are available TODO
 327      * analysis and encode should be fixed too */
 328     b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
 329     b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
 330     b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
 331
 332     if( b_a && b_b )
 333     {
 334         *mode++ = I_PRED_4x4_DC;
 335         *mode++ = I_PRED_4x4_H;
 336         *mode++ = I_PRED_4x4_V;
 337         *mode++ = I_PRED_4x4_DDR;
 338         *mode++ = I_PRED_4x4_VR;
 339         *mode++ = I_PRED_4x4_HD;
 340         *mode++ = I_PRED_4x4_HU;
 341
 342         *pi_count = 7;
 343
 344         if( b_c )
 345         {
 346             *mode++ = I_PRED_4x4_DDL;
 347             *mode++ = I_PRED_4x4_VL;
 348             (*pi_count) += 2;
 349         }
 350     }
 351     else if( b_a && !b_b )
 352     {
 353         *mode++ = I_PRED_4x4_DC_LEFT;
 354         *mode++ = I_PRED_4x4_H;
 355         *mode++ = I_PRED_4x4_HU;
 356         *pi_count = 3;
 357     }
 358     else if( !b_a && b_b )
 359     {
 360         *mode++ = I_PRED_4x4_DC_TOP;
 361         *mode++ = I_PRED_4x4_V;
 362         *pi_count = 2;
 363     }
 364     else
 365     {
 366         *mode++ = I_PRED_4x4_DC_128;
 367         *pi_count = 1;
 368     }
 369 }
 370
 371 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res )
 372 {
 373     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 374     const int i_stride = h->mb.pic.i_stride[0];
 375     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 376     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 377
 378     int i, idx;
 379
 380     int i_max;
 381     int predict_mode[9];
 382
 383     /*---------------- Try all mode and calculate their score ---------------*/
 384
 385     /* 16x16 prediction selection */
 386     predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 387     for( i = 0; i < i_max; i++ )
 388     {
 389         int i_sad;
 390         int i_mode;
 391
 392         i_mode = predict_mode[i];
 393
 394         /* we do the prediction */
 395         h->predict_16x16[i_mode]( p_dst, i_stride );
 396
 397         /* we calculate the diff and get the square sum of the diff */
 398         i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
 399                 res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 400         /* if i_score is lower it is better */
 401         if( res->i_sad_i16x16 > i_sad )
 402         {
 403             res->i_predict16x16 = i_mode;
 404             res->i_sad_i16x16     = i_sad;
 405         }
 406     }
 407     /* cavlc mb type prefix */
 408     if( h->sh.i_type == SLICE_TYPE_B )
 409         res->i_sad_i16x16 += res->i_lambda * i_mb_b_cost_table[I_16x16];
 410
 411     /* 4x4 prediction selection */
 412     if( flags & X264_ANALYSE_I4x4 )
 413     {
 414         res->i_sad_i4x4 = 0;
 415         for( idx = 0; idx < 16; idx++ )
 416         {
 417             uint8_t *p_src_by;
 418             uint8_t *p_dst_by;
 419             int     i_best;
 420             int x, y;
 421             int i_pred_mode;
 422
 423             i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
 424             x = block_idx_x[idx];
 425             y = block_idx_y[idx];
 426
 427             p_src_by = p_src + 4 * x + 4 * y * i_stride;
 428             p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
 429
 430             i_best = COST_MAX;
 431             predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
 432             for( i = 0; i < i_max; i++ )
 433             {
 434                 int i_sad;
 435                 int i_mode;
 436
 437                 i_mode = predict_mode[i];
 438
 439                 /* we do the prediction */
 440                 h->predict_4x4[i_mode]( p_dst_by, i_stride );
 441
 442                 /* we calculate diff and get the square sum of the diff */
 443                 i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
 444                                                  p_src_by, i_stride );
 445
 446                 i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
 447
 448                 /* if i_score is lower it is better */
 449                 if( i_best > i_sad )
 450                 {
 451                     res->i_predict4x4[x][y] = i_mode;
 452                     i_best = i_sad;
 453                 }
 454             }
 455             res->i_sad_i4x4 += i_best;
 456
 457             /* we need to encode this mb now (for next ones) */
 458             h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
 459             x264_mb_encode_i4x4( h, idx, res->i_qp );
 460
 461             /* we need to store the 'fixed' version */
 462             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
 463                 x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
 464         }
 465         res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
 466         if( h->sh.i_type == SLICE_TYPE_B )
 467             res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
 468     }
 469 }
 470
 471 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
 472 {
 473     int i;
 474
 475     int i_max;
 476     int predict_mode[9];
 477
 478     uint8_t *p_dstc[2], *p_srcc[2];
 479     int      i_stride[2];
 480
 481     if( res->i_sad_i8x8 < COST_MAX )
 482         return;
 483
 484     /* 8x8 prediction selection for chroma */
 485     p_dstc[0] = h->mb.pic.p_fdec[1];
 486     p_dstc[1] = h->mb.pic.p_fdec[2];
 487     p_srcc[0] = h->mb.pic.p_fenc[1];
 488     p_srcc[1] = h->mb.pic.p_fenc[2];
 489
 490     i_stride[0] = h->mb.pic.i_stride[1];
 491     i_stride[1] = h->mb.pic.i_stride[2];
 492
 493     predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 494     res->i_sad_i8x8 = COST_MAX;
 495     for( i = 0; i < i_max; i++ )
 496     {
 497         int i_sad;
 498         int i_mode;
 499
 500         i_mode = predict_mode[i];
 501
 502         /* we do the prediction */
 503         h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
 504         h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
 505
 506         /* we calculate the cost */
 507         i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
 508                                          p_srcc[0], i_stride[0] ) +
 509                 h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
 510                                          p_srcc[1], i_stride[1] ) +
 511                 res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
 512
 513         /* if i_score is lower it is better */
 514         if( res->i_sad_i8x8 > i_sad )
 515         {
 516             res->i_predict8x8 = i_mode;
 517             res->i_sad_i8x8     = i_sad;
 518         }
 519     }
 520 }
 521
 522 #define LOAD_FENC( m, src, xoff, yoff) \
 523     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
 524     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
 525     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 526     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 527     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 528 #define LOAD_HPELS(m, src, xoff, yoff) \
 529     (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 530     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
 531     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
 532     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
 533     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 534     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 535
 536 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 537 {
 538     x264_me_t m;
 539     int i_ref;
 540     int mvc[4][2], i_mvc;
 541     int i_fullpel_thresh = INT_MAX;
 542     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 543
 544     /* 16x16 Search on all ref frame */
 545     m.i_pixel = PIXEL_16x16;
 546     m.p_cost_mv = a->p_cost_mv;
 547     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 548
 549     a->l0.me16x16.cost = INT_MAX;
 550     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 551     {
 552         const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
 553         i_fullpel_thresh -= i_ref_cost;
 554
 555         /* search with ref */
 556         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 557         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 558         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 559         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 560
 561         m.cost += i_ref_cost;
 562         i_fullpel_thresh += i_ref_cost;
 563
 564         if( m.cost < a->l0.me16x16.cost )
 565         {
 566             a->l0.i_ref = i_ref;
 567             a->l0.me16x16 = m;
 568         }
 569
 570         /* save mv for predicting neighbors */
 571         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 572         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 573     }
 574
 575     /* subtract ref cost, so we don't have to add it for the other P types */
 576     a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
 577
 578     /* Set global ref, needed for all others modes */
 579     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
 580 }
 581
 582 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 583 {
 584     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 585     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 586     int mvc[5][2], i_mvc;
 587     int i;
 588
 589     /* XXX Needed for x264_mb_predict_mv */
 590     h->mb.i_partition = D_8x8;
 591
 592     i_mvc = 1;
 593     mvc[0][0] = a->l0.me16x16.mv[0];
 594     mvc[0][1] = a->l0.me16x16.mv[1];
 595
 596     for( i = 0; i < 4; i++ )
 597     {
 598         x264_me_t *m = &a->l0.me8x8[i];
 599         const int x8 = i%2;
 600         const int y8 = i/2;
 601
 602         m->i_pixel = PIXEL_8x8;
 603         m->p_cost_mv = a->p_cost_mv;
 604
 605         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
 606         LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
 607
 608         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 609         x264_me_search( h, m, mvc, i_mvc );
 610
 611         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
 612
 613         mvc[i_mvc][0] = m->mv[0];
 614         mvc[i_mvc][1] = m->mv[1];
 615         i_mvc++;
 616
 617         /* mb type cost */
 618         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 619     }
 620
 621     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
 622                    a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
 623 }
 624
 625 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 626 {
 627     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 628     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 629     int mvc[2][2];
 630     int i;
 631
 632     /* XXX Needed for x264_mb_predict_mv */
 633     h->mb.i_partition = D_16x8;
 634
 635     for( i = 0; i < 2; i++ )
 636     {
 637         x264_me_t *m = &a->l0.me16x8[i];
 638
 639         m->i_pixel = PIXEL_16x8;
 640         m->p_cost_mv = a->p_cost_mv;
 641
 642         LOAD_FENC( m, p_fenc, 0, 8*i );
 643         LOAD_HPELS( m, p_fref, 0, 8*i );
 644
 645         mvc[0][0] = a->l0.me8x8[2*i].mv[0];
 646         mvc[0][1] = a->l0.me8x8[2*i].mv[1];
 647         mvc[1][0] = a->l0.me8x8[2*i+1].mv[0];
 648         mvc[1][1] = a->l0.me8x8[2*i+1].mv[1];
 649
 650         x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
 651         x264_me_search( h, m, mvc, 2 );
 652
 653         x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] );
 654     }
 655
 656     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
 657 }
 658
 659 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 660 {
 661     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 662     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 663     int mvc[2][2];
 664     int i;
 665
 666     /* XXX Needed for x264_mb_predict_mv */
 667     h->mb.i_partition = D_8x16;
 668
 669     for( i = 0; i < 2; i++ )
 670     {
 671         x264_me_t *m = &a->l0.me8x16[i];
 672
 673         m->i_pixel = PIXEL_8x16;
 674         m->p_cost_mv = a->p_cost_mv;
 675
 676         LOAD_FENC( m, p_fenc, 8*i, 0 );
 677         LOAD_HPELS( m, p_fref, 8*i, 0 );
 678
 679         mvc[0][0] = a->l0.me8x8[i].mv[0];
 680         mvc[0][1] = a->l0.me8x8[i].mv[1];
 681         mvc[1][0] = a->l0.me8x8[i+2].mv[0];
 682         mvc[1][1] = a->l0.me8x8[i+2].mv[1];
 683
 684         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 685         x264_me_search( h, m, mvc, 2 );
 686
 687         x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] );
 688     }
 689
 690     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
 691 }
 692
 693 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
 694 {
 695     uint8_t pix1[8*8], pix2[8*8];
 696     const int i_stride = h->mb.pic.i_stride[1];
 697     const int off = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
 698
 699 #define CHROMA4x4MC( width, height, me, x, y ) \
 700     h->mc.mc_chroma( &p_fref[4][off+x+y*i_stride], i_stride, &pix1[x+y*8], 8, (me).mv[0], (me).mv[1], width, height ); \
 701     h->mc.mc_chroma( &p_fref[5][off+x+y*i_stride], i_stride, &pix2[x+y*8], 8, (me).mv[0], (me).mv[1], width, height );
 702
 703     if( pixel == PIXEL_4x4 )
 704     {
 705         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
 706         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 0,2 );
 707         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 2,0 );
 708         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
 709     }
 710     else if( pixel == PIXEL_8x4 )
 711     {
 712         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
 713         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
 714     }
 715     else
 716     {
 717         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
 718         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
 719     }
 720
 721     return h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
 722          + h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
 723 }
 724
 725 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 726 {
 727     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 728     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 729
 730     int i4x4;
 731
 732     /* XXX Needed for x264_mb_predict_mv */
 733     h->mb.i_partition = D_8x8;
 734
 735     for( i4x4 = 0; i4x4 < 4; i4x4++ )
 736     {
 737         const int idx = 4*i8x8 + i4x4;
 738         const int x4 = block_idx_x[idx];
 739         const int y4 = block_idx_y[idx];
 740         const int i_mvc = (i4x4 == 0);
 741
 742         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
 743
 744         m->i_pixel = PIXEL_4x4;
 745         m->p_cost_mv = a->p_cost_mv;
 746
 747         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 748         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 749
 750         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
 751         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
 752
 753         x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
 754     }
 755
 756     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
 757                          a->l0.me4x4[i8x8][1].cost +
 758                          a->l0.me4x4[i8x8][2].cost +
 759                          a->l0.me4x4[i8x8][3].cost +
 760                          a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
 761     if( h->mb.b_chroma_me )
 762         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
 763 }
 764
 765 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 766 {
 767     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 768     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 769
 770     int i8x4;
 771
 772     /* XXX Needed for x264_mb_predict_mv */
 773     h->mb.i_partition = D_8x8;
 774
 775     for( i8x4 = 0; i8x4 < 2; i8x4++ )
 776     {
 777         const int idx = 4*i8x8 + 2*i8x4;
 778         const int x4 = block_idx_x[idx];
 779         const int y4 = block_idx_y[idx];
 780         const int i_mvc = (i8x4 == 0);
 781
 782         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
 783
 784         m->i_pixel = PIXEL_8x4;
 785         m->p_cost_mv = a->p_cost_mv;
 786
 787         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 788         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 789
 790         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
 791         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 792
 793         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
 794     }
 795
 796     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
 797                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
 798     if( h->mb.b_chroma_me )
 799         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
 800 }
 801
 802 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 803 {
 804     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 805     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 806
 807     int i4x8;
 808
 809     /* XXX Needed for x264_mb_predict_mv */
 810     h->mb.i_partition = D_8x8;
 811
 812     for( i4x8 = 0; i4x8 < 2; i4x8++ )
 813     {
 814         const int idx = 4*i8x8 + i4x8;
 815         const int x4 = block_idx_x[idx];
 816         const int y4 = block_idx_y[idx];
 817         const int i_mvc = (i4x8 == 0);
 818
 819         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
 820
 821         m->i_pixel = PIXEL_4x8;
 822         m->p_cost_mv = a->p_cost_mv;
 823
 824         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 825         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 826
 827         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
 828         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 829
 830         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
 831     }
 832
 833     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
 834                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
 835     if( h->mb.b_chroma_me )
 836         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
 837 }
 838
 839 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 840 {
 841     /* Assumes that fdec still contains the results of
 842      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
 843
 844     uint8_t **p_fenc = h->mb.pic.p_fenc;
 845     uint8_t **p_fdec = h->mb.pic.p_fdec;
 846     int i_stride= h->mb.pic.i_stride[0];
 847     int i;
 848
 849     a->i_cost16x16direct = 0;
 850     for( i = 0; i < 4; i++ )
 851     {
 852         const int x8 = i%2;
 853         const int y8 = i/2;
 854         const int off = 8 * x8 + 8 * i_stride * y8;
 855         a->i_cost16x16direct +=
 856         a->i_cost8x8direct[i] =
 857             h->pixf.satd[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
 858
 859         /* mb type cost */
 860         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
 861     }
 862
 863     a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
 864 }
 865
 866 #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
 867     { \
 868         if( h->param.analyse.b_weighted_bipred ) \
 869             h->pixf.avg_weight[size]( pix1, stride1, src2, stride2, \
 870                     h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
 871         else \
 872             h->pixf.avg[size]( pix1, stride1, src2, stride2 ); \
 873     }
 874
 875 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 876 {
 877     uint8_t pix1[16*16], pix2[16*16];
 878     uint8_t *src2;
 879     int stride2 = 16;
 880     int src2_ref, pix1_ref;
 881
 882     x264_me_t m;
 883     int i_ref;
 884     int mvc[5][2], i_mvc;
 885     int i_fullpel_thresh = INT_MAX;
 886     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 887
 888     /* 16x16 Search on all ref frame */
 889     m.i_pixel = PIXEL_16x16;
 890     m.p_cost_mv = a->p_cost_mv;
 891     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 892
 893     /* ME for List 0 */
 894     a->l0.me16x16.cost = INT_MAX;
 895     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 896     {
 897         /* search with ref */
 898         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 899         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 900         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 901         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 902
 903         /* add ref cost */
 904         m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
 905
 906         if( m.cost < a->l0.me16x16.cost )
 907         {
 908             a->l0.i_ref = i_ref;
 909             a->l0.me16x16 = m;
 910         }
 911
 912         /* save mv for predicting neighbors */
 913         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 914         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 915     }
 916     /* subtract ref cost, so we don't have to add it for the other MB types */
 917     a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
 918
 919     /* ME for list 1 */
 920     i_fullpel_thresh = INT_MAX;
 921     p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL;
 922     a->l1.me16x16.cost = INT_MAX;
 923     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
 924     {
 925         /* search with ref */
 926         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
 927         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
 928         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
 929         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 930
 931         /* add ref cost */
 932         m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
 933
 934         if( m.cost < a->l1.me16x16.cost )
 935         {
 936             a->l1.i_ref = i_ref;
 937             a->l1.me16x16 = m;
 938         }
 939
 940         /* save mv for predicting neighbors */
 941         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 942         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 943     }
 944     /* subtract ref cost, so we don't have to add it for the other MB types */
 945     a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
 946
 947     /* Set global ref, needed for other modes? */
 948     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
 949     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
 950
 951     /* get cost of BI mode */
 952     if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
 953     {
 954         /* l0 reference is halfpel, so get_ref on it will make it faster */
 955         src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
 956                         pix2, &stride2,
 957                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
 958                         16, 16 );
 959         h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
 960                         pix1, 16,
 961                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
 962                         16, 16 );
 963         src2_ref = a->l0.i_ref;
 964         pix1_ref = a->l1.i_ref;
 965     }
 966     else
 967     {
 968         /* if l0 was qpel, we'll use get_ref on l1 instead */
 969         h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
 970                         pix1, 16,
 971                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
 972                         16, 16 );
 973         src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
 974                         pix2, &stride2,
 975                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
 976                         16, 16 );
 977         src2_ref = a->l1.i_ref;
 978         pix1_ref = a->l0.i_ref;
 979     }
 980
 981     if( h->param.analyse.b_weighted_bipred )
 982         h->pixf.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
 983                 h->mb.bipred_weight[pix1_ref][src2_ref] );
 984     else
 985         h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
 986
 987     a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
 988                      + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
 989                                      + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
 990                      + a->l0.me16x16.cost_mv
 991                      + a->l1.me16x16.cost_mv;
 992
 993     /* mb type cost */
 994     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
 995     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
 996     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
 997 }
 998
 999 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1000     if( x264_mb_partition_listX_table[0][part] ) \
1001     { \
1002         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1003         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, me0.mv[0], me0.mv[1] ); \
1004     } \
1005     else \
1006     { \
1007         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1008         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0, 0 ); \
1009         if( b_mvd ) \
1010             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
1011     } \
1012     if( x264_mb_partition_listX_table[1][part] ) \
1013     { \
1014         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1015         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, me1.mv[0], me1.mv[1] ); \
1016     } \
1017     else \
1018     { \
1019         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1020         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0, 0 ); \
1021         if( b_mvd ) \
1022             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
1023     }
1024
1025 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1026 {
1027     int x = (i%2)*2;
1028     int y = (i/2)*2;
1029     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1030     {
1031         x264_mb_load_mv_direct8x8( h, i );
1032         if( b_mvd )
1033         {
1034             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0, 0 );
1035             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0, 0 );
1036             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1037         }
1038     }
1039     else
1040     {
1041         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1042     }
1043 }
1044 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1045 {
1046     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1047 }
1048 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1049 {
1050     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1051 }
1052 #undef CACHE_MV_BI
1053
1054 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1055 {
1056     uint8_t **p_fref[2] =
1057         { h->mb.pic.p_fref[0][a->l0.i_ref],
1058           h->mb.pic.p_fref[1][a->l1.i_ref] };
1059     uint8_t pix[2][8*8];
1060     int i, l;
1061
1062     /* XXX Needed for x264_mb_predict_mv */
1063     h->mb.i_partition = D_8x8;
1064
1065     a->i_cost8x8bi = 0;
1066
1067     for( i = 0; i < 4; i++ )
1068     {
1069         const int x8 = i%2;
1070         const int y8 = i/2;
1071         int i_part_cost;
1072         int i_part_cost_bi = 0;
1073
1074         for( l = 0; l < 2; l++ )
1075         {
1076             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1077             x264_me_t *m = &lX->me8x8[i];
1078
1079             m->i_pixel = PIXEL_8x8;
1080             m->p_cost_mv = a->p_cost_mv;
1081
1082             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1083             LOAD_HPELS( m, p_fref[l], 8*x8, 8*y8 );
1084
1085             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1086             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1087
1088             x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
1089
1090             /* BI mode */
1091             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1092                             m->mv[0], m->mv[1], 8, 8 );
1093             i_part_cost_bi += m->cost_mv;
1094             /* FIXME: ref cost */
1095         }
1096
1097         WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
1098         i_part_cost_bi += h->pixf.satd[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
1099                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1100         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1101         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1102
1103         i_part_cost = a->l0.me8x8[i].cost;
1104         h->mb.i_sub_partition[i] = D_L0_8x8;
1105         if( a->l1.me8x8[i].cost < i_part_cost )
1106         {
1107             i_part_cost = a->l1.me8x8[i].cost;
1108             h->mb.i_sub_partition[i] = D_L1_8x8;
1109         }
1110         if( i_part_cost_bi < i_part_cost )
1111         {
1112             i_part_cost = i_part_cost_bi;
1113             h->mb.i_sub_partition[i] = D_BI_8x8;
1114         }
1115         if( a->i_cost8x8direct[i] < i_part_cost )
1116         {
1117             i_part_cost = a->i_cost8x8direct[i];
1118             h->mb.i_sub_partition[i] = D_DIRECT_8x8;
1119         }
1120         a->i_cost8x8bi += i_part_cost;
1121
1122         /* XXX Needed for x264_mb_predict_mv */
1123         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1124     }
1125
1126     /* mb type cost */
1127     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1128 }
1129
1130 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1131 {
1132     uint8_t **p_fref[2] =
1133         { h->mb.pic.p_fref[0][a->l0.i_ref],
1134           h->mb.pic.p_fref[1][a->l1.i_ref] };
1135     uint8_t pix[2][16*8];
1136     int mvc[2][2];
1137     int i, l;
1138
1139     h->mb.i_partition = D_16x8;
1140     a->i_cost16x8bi = 0;
1141
1142     for( i = 0; i < 2; i++ )
1143     {
1144         int i_part_cost;
1145         int i_part_cost_bi = 0;
1146
1147         /* TODO: check only the list(s) that were used in b8x8? */
1148         for( l = 0; l < 2; l++ )
1149         {
1150             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1151             x264_me_t *m = &lX->me16x8[i];
1152
1153             m->i_pixel = PIXEL_16x8;
1154             m->p_cost_mv = a->p_cost_mv;
1155
1156             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1157             LOAD_HPELS( m, p_fref[l], 0, 8*i );
1158
1159             mvc[0][0] = lX->me8x8[2*i].mv[0];
1160             mvc[0][1] = lX->me8x8[2*i].mv[1];
1161             mvc[1][0] = lX->me8x8[2*i+1].mv[0];
1162             mvc[1][1] = lX->me8x8[2*i+1].mv[1];
1163
1164             x264_mb_predict_mv( h, 0, 8*i, 2, m->mvp );
1165             x264_me_search( h, m, mvc, 2 );
1166
1167             /* BI mode */
1168             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
1169                             m->mv[0], m->mv[1], 16, 8 );
1170             /* FIXME: ref cost */
1171             i_part_cost_bi += m->cost_mv;
1172         }
1173
1174         WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
1175         i_part_cost_bi += h->pixf.satd[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
1176
1177         i_part_cost = a->l0.me16x8[i].cost;
1178         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1179         if( a->l1.me16x8[i].cost < i_part_cost )
1180         {
1181             i_part_cost = a->l1.me16x8[i].cost;
1182             a->i_mb_partition16x8[i] = D_L1_8x8;
1183         }
1184         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1185         {
1186             i_part_cost = i_part_cost_bi;
1187             a->i_mb_partition16x8[i] = D_BI_8x8;
1188         }
1189         a->i_cost16x8bi += i_part_cost;
1190
1191         if( i == 0 )
1192             x264_mb_cache_mv_b16x8( h, a, i, 0 );
1193     }
1194
1195     /* mb type cost */
1196     a->i_mb_type16x8 = B_L0_L0
1197         + (a->i_mb_partition16x8[0]>>2) * 3
1198         + (a->i_mb_partition16x8[1]>>2);
1199     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1200 }
1201 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1202 {
1203     uint8_t **p_fref[2] =
1204         { h->mb.pic.p_fref[0][a->l0.i_ref],
1205           h->mb.pic.p_fref[1][a->l1.i_ref] };
1206     uint8_t pix[2][8*16];
1207     int mvc[2][2];
1208     int i, l;
1209
1210     h->mb.i_partition = D_8x16;
1211     a->i_cost8x16bi = 0;
1212
1213     for( i = 0; i < 2; i++ )
1214     {
1215         int i_part_cost;
1216         int i_part_cost_bi = 0;
1217
1218         for( l = 0; l < 2; l++ )
1219         {
1220             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1221             x264_me_t *m = &lX->me8x16[i];
1222
1223             m->i_pixel = PIXEL_8x16;
1224             m->p_cost_mv = a->p_cost_mv;
1225
1226             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1227             LOAD_HPELS( m, p_fref[l], 8*i, 0 );
1228
1229             mvc[0][0] = lX->me8x8[i].mv[0];
1230             mvc[0][1] = lX->me8x8[i].mv[1];
1231             mvc[1][0] = lX->me8x8[i+2].mv[0];
1232             mvc[1][1] = lX->me8x8[i+2].mv[1];
1233
1234             x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1235             x264_me_search( h, m, mvc, 2 );
1236
1237             /* BI mode */
1238             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1239                             m->mv[0], m->mv[1], 8, 16 );
1240             /* FIXME: ref cost */
1241             i_part_cost_bi += m->cost_mv;
1242         }
1243
1244         WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
1245         i_part_cost_bi += h->pixf.satd[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
1246
1247         i_part_cost = a->l0.me8x16[i].cost;
1248         a->i_mb_partition8x16[i] = D_L0_8x8;
1249         if( a->l1.me8x16[i].cost < i_part_cost )
1250         {
1251             i_part_cost = a->l1.me8x16[i].cost;
1252             a->i_mb_partition8x16[i] = D_L1_8x8;
1253         }
1254         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1255         {
1256             i_part_cost = i_part_cost_bi;
1257             a->i_mb_partition8x16[i] = D_BI_8x8;
1258         }
1259         a->i_cost8x16bi += i_part_cost;
1260
1261         if( i == 0 )
1262             x264_mb_cache_mv_b8x16( h, a, i, 0 );
1263     }
1264
1265     /* mb type cost */
1266     a->i_mb_type8x16 = B_L0_L0
1267         + (a->i_mb_partition8x16[0]>>2) * 3
1268         + (a->i_mb_partition8x16[1]>>2);
1269     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
1270 }
1271
1272 /*****************************************************************************
1273  * x264_macroblock_analyse:
1274  *****************************************************************************/
1275 void x264_macroblock_analyse( x264_t *h )
1276 {
1277     x264_mb_analysis_t analysis;
1278     int i;
1279
1280     h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp(h);
1281
1282     /* prevent QP from varying too fast. FIXME what's a sane limit? */
1283     h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->mb.qp[h->mb.i_mb_xy],
1284                                           h->mb.i_last_qp - 12, h->mb.i_last_qp + 12 );
1285
1286     /* init analysis */
1287     x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
1288
1289     /*--------------------------- Do the analysis ---------------------------*/
1290     if( h->sh.i_type == SLICE_TYPE_I )
1291     {
1292         x264_mb_analyse_intra( h, &analysis );
1293
1294         if( analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
1295             h->mb.i_type = I_4x4;
1296         else
1297             h->mb.i_type = I_16x16;
1298     }
1299     else if( h->sh.i_type == SLICE_TYPE_P )
1300     {
1301         const unsigned int i_neighbour = h->mb.i_neighbour;
1302
1303         int b_skip = 0;
1304         int i_cost;
1305         int i_intra_cost, i_intra_type;
1306
1307         /* Fast P_SKIP detection */
1308         if( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
1309             ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
1310             ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
1311             ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) )
1312         {
1313             b_skip = x264_macroblock_probe_pskip( h );
1314         }
1315
1316         if( b_skip )
1317         {
1318             h->mb.i_type = P_SKIP;
1319             h->mb.i_partition = D_16x16;
1320         }
1321         else
1322         {
1323             const unsigned int flags = h->param.analyse.inter;
1324             int i_type;
1325             int i_partition;
1326
1327             x264_mb_analyse_load_costs( h, &analysis );
1328
1329             x264_mb_analyse_inter_p16x16( h, &analysis );
1330             if( flags & X264_ANALYSE_PSUB16x16 )
1331                 x264_mb_analyse_inter_p8x8( h, &analysis );
1332
1333             /* Select best inter mode */
1334             i_type = P_L0;
1335             i_partition = D_16x16;
1336             i_cost = analysis.l0.me16x16.cost;
1337
1338             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1339                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
1340             {
1341                 int i;
1342
1343                 i_type = P_8x8;
1344                 i_partition = D_8x8;
1345                 h->mb.i_sub_partition[0] = D_L0_8x8;
1346                 h->mb.i_sub_partition[1] = D_L0_8x8;
1347                 h->mb.i_sub_partition[2] = D_L0_8x8;
1348                 h->mb.i_sub_partition[3] = D_L0_8x8;
1349
1350                 i_cost = analysis.l0.i_cost8x8;
1351
1352                 /* Do sub 8x8 */
1353                 if( flags & X264_ANALYSE_PSUB8x8 )
1354                 {
1355                     for( i = 0; i < 4; i++ )
1356                     {
1357                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
1358                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
1359                         {
1360                             int i_cost8x8;
1361
1362                             h->mb.i_sub_partition[i] = D_L0_4x4;
1363                             i_cost8x8 = analysis.l0.i_cost4x4[i];
1364
1365                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
1366                             if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
1367                             {
1368                                 h->mb.i_sub_partition[i] = D_L0_8x4;
1369                                 i_cost8x8 = analysis.l0.i_cost8x4[i];
1370                             }
1371
1372                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
1373                             if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
1374                             {
1375                                 h->mb.i_sub_partition[i] = D_L0_4x8;
1376                                 i_cost8x8 = analysis.l0.i_cost4x8[i];
1377                             }
1378
1379                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
1380                         }
1381                     }
1382                 }
1383
1384                 /* Now do sub 16x8/8x16 */
1385                 x264_mb_analyse_inter_p16x8( h, &analysis );
1386                 if( analysis.l0.i_cost16x8 < i_cost )
1387                 {
1388                     i_type = P_L0;
1389                     i_partition = D_16x8;
1390                     i_cost = analysis.l0.i_cost16x8;
1391                 }
1392
1393                 x264_mb_analyse_inter_p8x16( h, &analysis );
1394                 if( analysis.l0.i_cost8x16 < i_cost )
1395                 {
1396                     i_type = P_L0;
1397                     i_partition = D_8x16;
1398                     i_cost = analysis.l0.i_cost8x16;
1399                 }
1400             }
1401
1402             h->mb.i_type = i_type;
1403             h->mb.i_partition = i_partition;
1404
1405             /* refine qpel */
1406             if( h->mb.i_partition == D_16x16 )
1407             {
1408                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1409                 i_cost = analysis.l0.me16x16.cost;
1410             }
1411             else if( h->mb.i_partition == D_16x8 )
1412             {
1413                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
1414                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
1415                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
1416             }
1417             else if( h->mb.i_partition == D_8x16 )
1418             {
1419                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
1420                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
1421                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
1422             }
1423             else if( h->mb.i_partition == D_8x8 )
1424             {
1425                 int i8x8;
1426                 i_cost = 0;
1427                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
1428                 {
1429                     switch( h->mb.i_sub_partition[i8x8] )
1430                     {
1431                         case D_L0_8x8:
1432                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
1433                             i_cost += analysis.l0.me8x8[i8x8].cost;
1434                             break;
1435                         case D_L0_8x4:
1436                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
1437                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
1438                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
1439                                       analysis.l0.me8x4[i8x8][1].cost;
1440                             break;
1441                         case D_L0_4x8:
1442                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
1443                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
1444                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
1445                                       analysis.l0.me4x8[i8x8][1].cost;
1446                             break;
1447
1448                         case D_L0_4x4:
1449                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
1450                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
1451                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
1452                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
1453                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
1454                                       analysis.l0.me4x4[i8x8][1].cost +
1455                                       analysis.l0.me4x4[i8x8][2].cost +
1456                                       analysis.l0.me4x4[i8x8][3].cost;
1457                             break;
1458                         default:
1459                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
1460                             break;
1461                     }
1462                 }
1463             }
1464
1465             x264_mb_analyse_intra( h, &analysis );
1466             if( h->mb.b_chroma_me &&
1467                 ( analysis.i_sad_i16x16 < i_cost
1468              || ( analysis.i_sad_i4x4 < i_cost )))
1469             {
1470                 x264_mb_analyse_intra_chroma( h, &analysis );
1471                 analysis.i_sad_i16x16 += analysis.i_sad_i8x8;
1472                 analysis.i_sad_i4x4 += analysis.i_sad_i8x8;
1473             }
1474
1475             i_intra_type = I_16x16;
1476             i_intra_cost = analysis.i_sad_i16x16;
1477
1478             if( analysis.i_sad_i4x4 < i_intra_cost )
1479             {
1480                 i_intra_type = I_4x4;
1481                 i_intra_cost = analysis.i_sad_i4x4;
1482             }
1483
1484             if( i_intra_cost < i_cost )
1485             {
1486                 h->mb.i_type = i_intra_type;
1487                 i_cost = i_intra_cost;
1488             }
1489
1490             h->stat.frame.i_intra_cost += i_intra_cost;
1491             h->stat.frame.i_inter_cost += i_cost;
1492         }
1493     }
1494     else if( h->sh.i_type == SLICE_TYPE_B )
1495     {
1496         int b_skip = 0;
1497
1498         analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h );
1499         if( analysis.b_direct_available )
1500         {
1501             h->mb.i_type = B_SKIP;
1502             x264_mb_mc( h );
1503
1504             /* Conditioning the probe on neighboring block types
1505              * doesn't seem to help speed or quality. */
1506             b_skip = x264_macroblock_probe_bskip( h );
1507         }
1508
1509         if( !b_skip )
1510         {
1511             const unsigned int flags = h->param.analyse.inter;
1512             int i_partition;
1513             int i_cost;
1514
1515             x264_mb_analyse_load_costs( h, &analysis );
1516
1517             /* select best inter mode */
1518             /* direct must be first */
1519             if( analysis.b_direct_available )
1520                 x264_mb_analyse_inter_direct( h, &analysis );
1521
1522             x264_mb_analyse_inter_b16x16( h, &analysis );
1523
1524             h->mb.i_type = B_L0_L0;
1525             i_partition = D_16x16;
1526             i_cost = analysis.l0.me16x16.cost;
1527             if( analysis.l1.me16x16.cost < i_cost )
1528             {
1529                 h->mb.i_type = B_L1_L1;
1530                 i_cost = analysis.l1.me16x16.cost;
1531             }
1532             if( analysis.i_cost16x16bi < i_cost )
1533             {
1534                 h->mb.i_type = B_BI_BI;
1535                 i_cost = analysis.i_cost16x16bi;
1536             }
1537             if( analysis.i_cost16x16direct < i_cost )
1538             {
1539                 h->mb.i_type = B_DIRECT;
1540                 i_cost = analysis.i_cost16x16direct;
1541             }
1542
1543             if( flags & X264_ANALYSE_BSUB16x16 )
1544             {
1545                 x264_mb_analyse_inter_b8x8( h, &analysis );
1546                 if( analysis.i_cost8x8bi < i_cost )
1547                 {
1548                     h->mb.i_type = B_8x8;
1549                     i_partition = D_8x8;
1550                     i_cost = analysis.i_cost8x8bi;
1551
1552                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
1553                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
1554                     {
1555                         x264_mb_analyse_inter_b16x8( h, &analysis );
1556                         if( analysis.i_cost16x8bi < i_cost )
1557                         {
1558                             i_partition = D_16x8;
1559                             i_cost = analysis.i_cost16x8bi;
1560                             h->mb.i_type = analysis.i_mb_type16x8;
1561                         }
1562                     }
1563                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
1564                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
1565                     {
1566                         x264_mb_analyse_inter_b8x16( h, &analysis );
1567                         if( analysis.i_cost8x16bi < i_cost )
1568                         {
1569                             i_partition = D_8x16;
1570                             i_cost = analysis.i_cost8x16bi;
1571                             h->mb.i_type = analysis.i_mb_type8x16;
1572                         }
1573                     }
1574                 }
1575             }
1576
1577             h->mb.i_partition = i_partition;
1578
1579             /* refine qpel */
1580             if( i_partition == D_16x16 )
1581             {
1582                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1583                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1584                 if( h->mb.i_type == B_L0_L0 )
1585                 {
1586                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1587                     i_cost = analysis.l0.me16x16.cost
1588                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1589                 }
1590                 else if( h->mb.i_type == B_L1_L1 )
1591                 {
1592                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1593                     i_cost = analysis.l1.me16x16.cost
1594                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1595                 }
1596                 else if( h->mb.i_type == B_BI_BI )
1597                 {
1598                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1599                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1600                 }
1601             }
1602             else if( i_partition == D_16x8 )
1603             {
1604                 for( i=0; i<2; i++ )
1605                 {
1606                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
1607                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
1608                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
1609                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
1610                 }
1611             }
1612             else if( i_partition == D_8x16 )
1613             {
1614                 for( i=0; i<2; i++ )
1615                 {
1616                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
1617                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
1618                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
1619                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
1620                 }
1621             }
1622             else if( i_partition == D_8x8 )
1623             {
1624                 for( i=0; i<4; i++ )
1625                 {
1626                     x264_me_t *m;
1627                     int i_part_cost_old;
1628                     int i_type_cost;
1629                     int i_part_type = h->mb.i_sub_partition[i];
1630                     int b_bidir = (i_part_type == D_BI_8x8);
1631
1632                     if( i_part_type == D_DIRECT_8x8 )
1633                         continue;
1634                     if( x264_mb_partition_listX_table[0][i_part_type] )
1635                     {
1636                         m = &analysis.l0.me8x8[i];
1637                         i_part_cost_old = m->cost;
1638                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1639                         m->cost -= i_type_cost;
1640                         x264_me_refine_qpel( h, m );
1641                         if( !b_bidir )
1642                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1643                     }
1644                     if( x264_mb_partition_listX_table[1][i_part_type] )
1645                     {
1646                         m = &analysis.l1.me8x8[i];
1647                         i_part_cost_old = m->cost;
1648                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1649                         m->cost -= i_type_cost;
1650                         x264_me_refine_qpel( h, m );
1651                         if( !b_bidir )
1652                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1653                     }
1654                     /* TODO: update mvp? */
1655                 }
1656             }
1657
1658             /* best intra mode */
1659             x264_mb_analyse_intra( h, &analysis );
1660
1661             if( analysis.i_sad_i16x16 < i_cost )
1662             {
1663                 h->mb.i_type = I_16x16;
1664                 i_cost = analysis.i_sad_i16x16;
1665             }
1666             if( analysis.i_sad_i4x4 < i_cost )
1667             {
1668                 h->mb.i_type = I_4x4;
1669                 i_cost = analysis.i_sad_i4x4;
1670             }
1671         }
1672     }
1673
1674     /*-------------------- Update MB from the analysis ----------------------*/
1675     h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
1676     switch( h->mb.i_type )
1677     {
1678         case I_4x4:
1679             for( i = 0; i < 16; i++ )
1680             {
1681                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
1682                     analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
1683             }
1684
1685             x264_mb_analyse_intra_chroma( h, &analysis );
1686             h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
1687             break;
1688         case I_16x16:
1689             h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
1690
1691             x264_mb_analyse_intra_chroma( h, &analysis );
1692             h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
1693             break;
1694
1695         case P_L0:
1696             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1697             switch( h->mb.i_partition )
1698             {
1699                 case D_16x16:
1700                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1701                     break;
1702
1703                 case D_16x8:
1704                     x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
1705                     x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
1706                     break;
1707
1708                 case D_8x16:
1709                     x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
1710                     x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
1711                     break;
1712
1713                 default:
1714                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
1715                     break;
1716             }
1717             break;
1718
1719         case P_8x8:
1720             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1721             for( i = 0; i < 4; i++ )
1722             {
1723                 const int x = 2*(i%2);
1724                 const int y = 2*(i/2);
1725
1726                 switch( h->mb.i_sub_partition[i] )
1727                 {
1728                     case D_L0_8x8:
1729                         x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
1730                         break;
1731                     case D_L0_8x4:
1732                         x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
1733                         x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
1734                         break;
1735                     case D_L0_4x8:
1736                         x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
1737                         x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
1738                         break;
1739                     case D_L0_4x4:
1740                         x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
1741                         x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
1742                         x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
1743                         x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
1744                         break;
1745                     default:
1746                         x264_log( h, X264_LOG_ERROR, "internal error\n" );
1747                         break;
1748                 }
1749             }
1750             break;
1751
1752         case P_SKIP:
1753         {
1754             int mvp[2];
1755             x264_mb_predict_mv_pskip( h, mvp );
1756             /* cache P_SKIP as a single 16x16 partition: list0 ref 0 with the predicted skip mv */
1757             h->mb.i_partition = D_16x16;
1758             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
1759             x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
1760             break;
1761         }
1762
1763         case B_SKIP:
1764             /* nothing has changed since x264_macroblock_probe_bskip */
1765             break;
1766         case B_DIRECT:
1767             x264_mb_load_mv_direct8x8( h, 0 );
1768             x264_mb_load_mv_direct8x8( h, 1 );
1769             x264_mb_load_mv_direct8x8( h, 2 );
1770             x264_mb_load_mv_direct8x8( h, 3 );
1771             break;
1772
1773         case B_8x8:
1774             /* optimize: cache might not need to be rewritten */
1775             for( i = 0; i < 4; i++ )
1776                 x264_mb_cache_mv_b8x8( h, &analysis, i, 1 );
1777             break;
1778
1779         default: /* the rest of the B types */
1780             switch( h->mb.i_partition )
1781             {
1782             case D_16x16:
1783                 switch( h->mb.i_type )
1784                 {
1785                 case B_L0_L0:
1786                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1787                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1788
1789                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
1790                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
1791                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
1792                     break;
1793                 case B_L1_L1:
1794                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
1795                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
1796                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
1797
1798                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1799                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
1800                     break;
1801                 case B_BI_BI:
1802                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1803                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1804
1805                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1806                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
1807                     break;
1808                 }
1809                 break;
1810             case D_16x8:
1811                 x264_mb_cache_mv_b16x8( h, &analysis, 0, 1 );
1812                 x264_mb_cache_mv_b16x8( h, &analysis, 1, 1 );
1813                 break;
1814             case D_8x16:
1815                 x264_mb_cache_mv_b8x16( h, &analysis, 0, 1 );
1816                 x264_mb_cache_mv_b8x16( h, &analysis, 1, 1 );
1817                 break;
1818             default:
1819                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
1820                 break;
1821             }
1822     }
1823 }
1824
1825 #include "slicetype_decision.c"
1826