git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27 #include <math.h>
  28 #include <limits.h>
  29
  30 #include "common/common.h"
  31 #include "common/macroblock.h"
  32 #include "macroblock.h"
  33 #include "me.h"
  34 #include "ratecontrol.h"
  35
  36 typedef struct
  37 {
  38     /* 16x16 */
  39     int i_ref;
  40     x264_me_t me16x16;
  41
  42     /* 8x8 */
  43     int       i_cost8x8;
  44     x264_me_t me8x8[4];
  45
  46     /* Sub 4x4 */
  47     int       i_cost4x4[4]; /* cost per 8x8 partition */
  48     x264_me_t me4x4[4][4];
  49
  50     /* Sub 8x4 */
  51     int       i_cost8x4[4]; /* cost per 8x8 partition */
  52     x264_me_t me8x4[4][2];
  53
  54     /* Sub 4x8 */
  55     int       i_cost4x8[4]; /* cost per 8x8 partition */
  56     x264_me_t me4x8[4][4];
  57
  58     /* 16x8 */
  59     int       i_cost16x8;
  60     x264_me_t me16x8[2];
  61
  62     /* 8x16 */
  63     int       i_cost8x16;
  64     x264_me_t me8x16[2];
  65
  66 } x264_mb_analysis_list_t;
  67
  68 typedef struct
  69 {
  70     /* conduct the analysis using this lamda and QP */
  71     int i_lambda;
  72     int i_qp;
  73     int16_t *p_cost_mv;
  74
  75
  76     /* I: Intra part */
  77     /* Take some shortcuts in intra search if intra is deemed unlikely */
  78     int b_fast_intra;
  79
  80     /* Luma part 16x16 and 4x4 modes stats */
  81     int i_sad_i16x16;
  82     int i_predict16x16;
  83
  84     int i_sad_i4x4;
  85     int i_predict4x4[4][4];
  86
  87     /* Chroma part */
  88     int i_sad_i8x8;
  89     int i_predict8x8;
  90
  91     /* II: Inter part P/B frame */
  92     x264_mb_analysis_list_t l0;
  93     x264_mb_analysis_list_t l1;
  94
  95     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
  96     int i_cost16x16direct;
  97     int i_cost8x8bi;
  98     int i_cost8x8direct[4];
  99     int i_cost16x8bi;
 100     int i_cost8x16bi;
 101
 102     int i_mb_partition16x8[2]; /* mb_partition_e */
 103     int i_mb_partition8x16[2];
 104     int i_mb_type16x8; /* mb_class_e */
 105     int i_mb_type8x16;
 106
 107     int b_direct_available;
 108
 109 } x264_mb_analysis_t;
 110
 111 static const int i_qp0_cost_table[52] = {
 112    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 113    1, 1, 1, 1,              /*  8-11 */
 114    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 115    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 116    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 117   16,18,20,23,25,29,32,36,  /* 36-43 */
 118   40,45,51,57,64,72,81,91   /* 44-51 */
 119 };
 120
 121 static const uint8_t block_idx_x[16] = {
 122     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
 123 };
 124 static const uint8_t block_idx_y[16] = {
 125     0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
 126 };
 127
 128 /* TODO: calculate CABAC costs */
 129 static const int i_mb_b_cost_table[18] = {
 130     9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 131 };
 132 static const int i_mb_b16x8_cost_table[16] = {
 133     0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 134 };
 135 static const int i_sub_mb_b_cost_table[13] = {
 136     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 137 };
 138 static const int i_sub_mb_p_cost_table[4] = {
 139     5, 3, 3, 1
 140 };
 141
 142 /* initialize an array of lambda*nbits for all possible mvs */
 143 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 144 {
 145     static int16_t *p_cost_mv[52];
 146
 147     if( !p_cost_mv[a->i_qp] )
 148     {
 149         /* could be faster, but isn't called many times */
 150         /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 151         int i;
 152         p_cost_mv[a->i_qp] = x264_malloc( (4*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) );
 153         p_cost_mv[a->i_qp] += 2*4*h->param.analyse.i_mv_range;
 154         for( i = 0; i <= 2*4*h->param.analyse.i_mv_range; i++ )
 155         {
 156             p_cost_mv[a->i_qp][-i] =
 157             p_cost_mv[a->i_qp][i]  = a->i_lambda * bs_size_se( i );
 158         }
 159     }
 160
 161     a->p_cost_mv = p_cost_mv[a->i_qp];
 162 }
 163
 164 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 165 {
 166     memset( a, 0, sizeof( x264_mb_analysis_t ) );
 167
 168     /* conduct the analysis using this lamda and QP */
 169     a->i_qp = i_qp;
 170     a->i_lambda = i_qp0_cost_table[i_qp];
 171
 172     h->mb.i_me_method = h->param.analyse.i_me_method;
 173     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 174     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 175                         && h->mb.i_subpel_refine >= 5;
 176     a->b_fast_intra = 0;
 177
 178     /* I: Intra part */
 179     a->i_sad_i16x16 =
 180     a->i_sad_i4x4   =
 181     a->i_sad_i8x8   = COST_MAX;
 182
 183     /* II: Inter part P/B frame */
 184     if( h->sh.i_type != SLICE_TYPE_I )
 185     {
 186         int i;
 187         int i_fmv_range = h->param.analyse.i_mv_range - 16;
 188
 189         /* Calculate max allowed MV range */
 190 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
 191         h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
 192         h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 );
 193         h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
 194         h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
 195         if( h->mb.i_mb_x == 0)
 196         {
 197             h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
 198             h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 );
 199             h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
 200             h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
 201         }
 202 #undef CLIP_FMV
 203
 204         a->l0.me16x16.cost =
 205         a->l0.i_cost8x8    = COST_MAX;
 206
 207         for( i = 0; i < 4; i++ )
 208         {
 209             a->l0.i_cost4x4[i] =
 210             a->l0.i_cost8x4[i] =
 211             a->l0.i_cost4x8[i] = COST_MAX;
 212         }
 213
 214         a->l0.i_cost16x8   =
 215         a->l0.i_cost8x16   = COST_MAX;
 216         if( h->sh.i_type == SLICE_TYPE_B )
 217         {
 218             a->l1.me16x16.cost =
 219             a->l1.i_cost8x8    = COST_MAX;
 220
 221             for( i = 0; i < 4; i++ )
 222             {
 223                 a->l1.i_cost4x4[i] =
 224                 a->l1.i_cost8x4[i] =
 225                 a->l1.i_cost4x8[i] =
 226                 a->i_cost8x8direct[i] = COST_MAX;
 227             }
 228
 229             a->l1.i_cost16x8   =
 230             a->l1.i_cost8x16   =
 231
 232             a->i_cost16x16bi   =
 233             a->i_cost16x16direct =
 234             a->i_cost8x8bi     =
 235             a->i_cost16x8bi    =
 236             a->i_cost8x16bi    = COST_MAX;
 237         }
 238
 239         /* Fast intra decision */
 240         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 241         {
 242             const unsigned int i_neighbour = h->mb.i_neighbour;
 243             if(   ((i_neighbour&MB_LEFT) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - 1] ))
 244                || ((i_neighbour&MB_TOP) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] ))
 245                || (((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT)) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] ))
 246                || ((i_neighbour&MB_TOPRIGHT) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] ))
 247                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 248                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_16x16])) )
 249             { /* intra is likely */ }
 250             else
 251             {
 252                 a->b_fast_intra = 1;
 253             }
 254         }
 255     }
 256 }
 257
 258
 259
 260 /*
 261  * Handle intra mb
 262  */
 263 /* Max = 4 */
 264 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 265 {
 266     if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
 267     {
 268         /* top and left avaible */
 269         *mode++ = I_PRED_16x16_V;
 270         *mode++ = I_PRED_16x16_H;
 271         *mode++ = I_PRED_16x16_DC;
 272         *mode++ = I_PRED_16x16_P;
 273         *pi_count = 4;
 274     }
 275     else if( ( i_neighbour & MB_LEFT ) )
 276     {
 277         /* left available*/
 278         *mode++ = I_PRED_16x16_DC_LEFT;
 279         *mode++ = I_PRED_16x16_H;
 280         *pi_count = 2;
 281     }
 282     else if( ( i_neighbour & MB_TOP ) )
 283     {
 284         /* top available*/
 285         *mode++ = I_PRED_16x16_DC_TOP;
 286         *mode++ = I_PRED_16x16_V;
 287         *pi_count = 2;
 288     }
 289     else
 290     {
 291         /* none avaible */
 292         *mode = I_PRED_16x16_DC_128;
 293         *pi_count = 1;
 294     }
 295 }
 296
 297 /* Max = 4 */
 298 static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 299 {
 300     if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
 301     {
 302         /* top and left avaible */
 303         *mode++ = I_PRED_CHROMA_V;
 304         *mode++ = I_PRED_CHROMA_H;
 305         *mode++ = I_PRED_CHROMA_DC;
 306         *mode++ = I_PRED_CHROMA_P;
 307         *pi_count = 4;
 308     }
 309     else if( ( i_neighbour & MB_LEFT ) )
 310     {
 311         /* left available*/
 312         *mode++ = I_PRED_CHROMA_DC_LEFT;
 313         *mode++ = I_PRED_CHROMA_H;
 314         *pi_count = 2;
 315     }
 316     else if( ( i_neighbour & MB_TOP ) )
 317     {
 318         /* top available*/
 319         *mode++ = I_PRED_CHROMA_DC_TOP;
 320         *mode++ = I_PRED_CHROMA_V;
 321         *pi_count = 2;
 322     }
 323     else
 324     {
 325         /* none avaible */
 326         *mode = I_PRED_CHROMA_DC_128;
 327         *pi_count = 1;
 328     }
 329 }
 330
/* Max = 9 (7 modes, plus DDL and VL when the top-right samples are usable) */
 332 static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
 333 {
 334     int b_a, b_b, b_c;
 335     static const unsigned int needmb[16] =
 336     {
 337         MB_LEFT|MB_TOP, MB_TOP,
 338         MB_LEFT,        MB_PRIVATE,
 339         MB_TOP,         MB_TOP|MB_TOPRIGHT,
 340         0,              MB_PRIVATE,
 341         MB_LEFT,        0,
 342         MB_LEFT,        MB_PRIVATE,
 343         0,              MB_PRIVATE,
 344         0,              MB_PRIVATE
 345     };
 346
 347     /* FIXME even when b_c == 0 there is some case where missing pixels
 348      * are emulated and thus more mode are available TODO
 349      * analysis and encode should be fixed too */
 350     b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
 351     b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
 352     b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
 353
 354     if( b_a && b_b )
 355     {
 356         *mode++ = I_PRED_4x4_DC;
 357         *mode++ = I_PRED_4x4_H;
 358         *mode++ = I_PRED_4x4_V;
 359         *mode++ = I_PRED_4x4_DDR;
 360         *mode++ = I_PRED_4x4_VR;
 361         *mode++ = I_PRED_4x4_HD;
 362         *mode++ = I_PRED_4x4_HU;
 363
 364         *pi_count = 7;
 365
 366         if( b_c )
 367         {
 368             *mode++ = I_PRED_4x4_DDL;
 369             *mode++ = I_PRED_4x4_VL;
 370             (*pi_count) += 2;
 371         }
 372     }
 373     else if( b_a && !b_b )
 374     {
 375         *mode++ = I_PRED_4x4_DC_LEFT;
 376         *mode++ = I_PRED_4x4_H;
 377         *mode++ = I_PRED_4x4_HU;
 378         *pi_count = 3;
 379     }
 380     else if( !b_a && b_b )
 381     {
 382         *mode++ = I_PRED_4x4_DC_TOP;
 383         *mode++ = I_PRED_4x4_V;
 384         *pi_count = 2;
 385     }
 386     else
 387     {
 388         *mode++ = I_PRED_4x4_DC_128;
 389         *pi_count = 1;
 390     }
 391 }
 392
 393 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter )
 394 {
 395     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 396     const int i_stride = h->mb.pic.i_stride[0];
 397     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 398     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 399
 400     int i, idx;
 401
 402     int i_max;
 403     int predict_mode[9];
 404
 405     /*---------------- Try all mode and calculate their score ---------------*/
 406
 407     /* 16x16 prediction selection */
 408     predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 409     for( i = 0; i < i_max; i++ )
 410     {
 411         int i_sad;
 412         int i_mode;
 413
 414         i_mode = predict_mode[i];
 415
 416         /* we do the prediction */
 417         h->predict_16x16[i_mode]( p_dst, i_stride );
 418
 419         /* we calculate the diff and get the square sum of the diff */
 420         i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
 421                 res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 422         /* if i_score is lower it is better */
 423         if( res->i_sad_i16x16 > i_sad )
 424         {
 425             res->i_predict16x16 = i_mode;
 426             res->i_sad_i16x16     = i_sad;
 427         }
 428     }
 429     /* cavlc mb type prefix */
 430     if( h->sh.i_type == SLICE_TYPE_B )
 431         res->i_sad_i16x16 += res->i_lambda * i_mb_b_cost_table[I_16x16];
 432
 433     if( res->b_fast_intra )
 434     {
 435         if( res->i_sad_i16x16 > 2*i_cost_inter )
 436             return;
 437     }
 438
 439     /* 4x4 prediction selection */
 440     if( flags & X264_ANALYSE_I4x4 )
 441     {
 442         res->i_sad_i4x4 = 0;
 443         for( idx = 0; idx < 16; idx++ )
 444         {
 445             uint8_t *p_src_by;
 446             uint8_t *p_dst_by;
 447             int     i_best;
 448             int x, y;
 449             int i_pred_mode;
 450
 451             i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
 452             x = block_idx_x[idx];
 453             y = block_idx_y[idx];
 454
 455             p_src_by = p_src + 4 * x + 4 * y * i_stride;
 456             p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
 457
 458             i_best = COST_MAX;
 459             predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
 460             for( i = 0; i < i_max; i++ )
 461             {
 462                 int i_sad;
 463                 int i_mode;
 464
 465                 i_mode = predict_mode[i];
 466
 467                 /* we do the prediction */
 468                 h->predict_4x4[i_mode]( p_dst_by, i_stride );
 469
 470                 /* we calculate diff and get the square sum of the diff */
 471                 i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
 472                                                  p_src_by, i_stride );
 473
 474                 i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
 475
 476                 /* if i_score is lower it is better */
 477                 if( i_best > i_sad )
 478                 {
 479                     res->i_predict4x4[x][y] = i_mode;
 480                     i_best = i_sad;
 481                 }
 482             }
 483             res->i_sad_i4x4 += i_best;
 484
 485             /* we need to encode this mb now (for next ones) */
 486             h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
 487             x264_mb_encode_i4x4( h, idx, res->i_qp );
 488
 489             /* we need to store the 'fixed' version */
 490             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
 491                 x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
 492         }
 493         res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
 494         if( h->sh.i_type == SLICE_TYPE_B )
 495             res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
 496     }
 497 }
 498
 499 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
 500 {
 501     int i;
 502
 503     int i_max;
 504     int predict_mode[9];
 505
 506     uint8_t *p_dstc[2], *p_srcc[2];
 507     int      i_stride[2];
 508
 509     if( res->i_sad_i8x8 < COST_MAX )
 510         return;
 511
 512     /* 8x8 prediction selection for chroma */
 513     p_dstc[0] = h->mb.pic.p_fdec[1];
 514     p_dstc[1] = h->mb.pic.p_fdec[2];
 515     p_srcc[0] = h->mb.pic.p_fenc[1];
 516     p_srcc[1] = h->mb.pic.p_fenc[2];
 517
 518     i_stride[0] = h->mb.pic.i_stride[1];
 519     i_stride[1] = h->mb.pic.i_stride[2];
 520
 521     predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 522     res->i_sad_i8x8 = COST_MAX;
 523     for( i = 0; i < i_max; i++ )
 524     {
 525         int i_sad;
 526         int i_mode;
 527
 528         i_mode = predict_mode[i];
 529
 530         /* we do the prediction */
 531         h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
 532         h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
 533
 534         /* we calculate the cost */
 535         i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
 536                                          p_srcc[0], i_stride[0] ) +
 537                 h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
 538                                          p_srcc[1], i_stride[1] ) +
 539                 res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
 540
 541         /* if i_score is lower it is better */
 542         if( res->i_sad_i8x8 > i_sad )
 543         {
 544             res->i_predict8x8 = i_mode;
 545             res->i_sad_i8x8     = i_sad;
 546         }
 547     }
 548 }
 549
 550 #define LOAD_FENC( m, src, xoff, yoff) \
 551     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
 552     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
 553     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 554     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 555     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 556 #define LOAD_HPELS(m, src, xoff, yoff) \
 557     (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 558     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
 559     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
 560     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
 561     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 562     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 563
 564 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 565 {
 566     x264_me_t m;
 567     int i_ref;
 568     int mvc[4][2], i_mvc;
 569     int i_fullpel_thresh = INT_MAX;
 570     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 571
 572     /* 16x16 Search on all ref frame */
 573     m.i_pixel = PIXEL_16x16;
 574     m.p_cost_mv = a->p_cost_mv;
 575     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 576
 577     a->l0.me16x16.cost = INT_MAX;
 578     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 579     {
 580         const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
 581         i_fullpel_thresh -= i_ref_cost;
 582
 583         /* search with ref */
 584         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 585         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 586         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 587         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 588
 589         m.cost += i_ref_cost;
 590         i_fullpel_thresh += i_ref_cost;
 591
 592         if( m.cost < a->l0.me16x16.cost )
 593         {
 594             a->l0.i_ref = i_ref;
 595             a->l0.me16x16 = m;
 596         }
 597
 598         /* save mv for predicting neighbors */
 599         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 600         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 601     }
 602
 603     /* subtract ref cost, so we don't have to add it for the other P types */
 604     a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
 605
 606     /* Set global ref, needed for all others modes */
 607     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
 608 }
 609
 610 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 611 {
 612     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 613     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 614     int mvc[5][2], i_mvc;
 615     int i;
 616
 617     /* XXX Needed for x264_mb_predict_mv */
 618     h->mb.i_partition = D_8x8;
 619
 620     i_mvc = 1;
 621     mvc[0][0] = a->l0.me16x16.mv[0];
 622     mvc[0][1] = a->l0.me16x16.mv[1];
 623
 624     for( i = 0; i < 4; i++ )
 625     {
 626         x264_me_t *m = &a->l0.me8x8[i];
 627         const int x8 = i%2;
 628         const int y8 = i/2;
 629
 630         m->i_pixel = PIXEL_8x8;
 631         m->p_cost_mv = a->p_cost_mv;
 632
 633         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
 634         LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
 635
 636         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 637         x264_me_search( h, m, mvc, i_mvc );
 638
 639         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
 640
 641         mvc[i_mvc][0] = m->mv[0];
 642         mvc[i_mvc][1] = m->mv[1];
 643         i_mvc++;
 644
 645         /* mb type cost */
 646         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 647     }
 648
 649     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
 650                    a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
 651 }
 652
 653 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 654 {
 655     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 656     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 657     int mvc[2][2];
 658     int i;
 659
 660     /* XXX Needed for x264_mb_predict_mv */
 661     h->mb.i_partition = D_16x8;
 662
 663     for( i = 0; i < 2; i++ )
 664     {
 665         x264_me_t *m = &a->l0.me16x8[i];
 666
 667         m->i_pixel = PIXEL_16x8;
 668         m->p_cost_mv = a->p_cost_mv;
 669
 670         LOAD_FENC( m, p_fenc, 0, 8*i );
 671         LOAD_HPELS( m, p_fref, 0, 8*i );
 672
 673         mvc[0][0] = a->l0.me8x8[2*i].mv[0];
 674         mvc[0][1] = a->l0.me8x8[2*i].mv[1];
 675         mvc[1][0] = a->l0.me8x8[2*i+1].mv[0];
 676         mvc[1][1] = a->l0.me8x8[2*i+1].mv[1];
 677
 678         x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
 679         x264_me_search( h, m, mvc, 2 );
 680
 681         x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] );
 682     }
 683
 684     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
 685 }
 686
 687 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 688 {
 689     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 690     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 691     int mvc[2][2];
 692     int i;
 693
 694     /* XXX Needed for x264_mb_predict_mv */
 695     h->mb.i_partition = D_8x16;
 696
 697     for( i = 0; i < 2; i++ )
 698     {
 699         x264_me_t *m = &a->l0.me8x16[i];
 700
 701         m->i_pixel = PIXEL_8x16;
 702         m->p_cost_mv = a->p_cost_mv;
 703
 704         LOAD_FENC( m, p_fenc, 8*i, 0 );
 705         LOAD_HPELS( m, p_fref, 8*i, 0 );
 706
 707         mvc[0][0] = a->l0.me8x8[i].mv[0];
 708         mvc[0][1] = a->l0.me8x8[i].mv[1];
 709         mvc[1][0] = a->l0.me8x8[i+2].mv[0];
 710         mvc[1][1] = a->l0.me8x8[i+2].mv[1];
 711
 712         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 713         x264_me_search( h, m, mvc, 2 );
 714
 715         x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] );
 716     }
 717
 718     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
 719 }
 720
 721 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
 722 {
 723     uint8_t pix1[8*8], pix2[8*8];
 724     const int i_stride = h->mb.pic.i_stride[1];
 725     const int off = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
 726
 727 #define CHROMA4x4MC( width, height, me, x, y ) \
 728     h->mc.mc_chroma( &p_fref[4][off+x+y*i_stride], i_stride, &pix1[x+y*8], 8, (me).mv[0], (me).mv[1], width, height ); \
 729     h->mc.mc_chroma( &p_fref[5][off+x+y*i_stride], i_stride, &pix2[x+y*8], 8, (me).mv[0], (me).mv[1], width, height );
 730
 731     if( pixel == PIXEL_4x4 )
 732     {
 733         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
 734         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 0,2 );
 735         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 2,0 );
 736         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
 737     }
 738     else if( pixel == PIXEL_8x4 )
 739     {
 740         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
 741         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
 742     }
 743     else
 744     {
 745         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
 746         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
 747     }
 748
 749     return h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
 750          + h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
 751 }
 752
 753 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 754 {
 755     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 756     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 757
 758     int i4x4;
 759
 760     /* XXX Needed for x264_mb_predict_mv */
 761     h->mb.i_partition = D_8x8;
 762
 763     for( i4x4 = 0; i4x4 < 4; i4x4++ )
 764     {
 765         const int idx = 4*i8x8 + i4x4;
 766         const int x4 = block_idx_x[idx];
 767         const int y4 = block_idx_y[idx];
 768         const int i_mvc = (i4x4 == 0);
 769
 770         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
 771
 772         m->i_pixel = PIXEL_4x4;
 773         m->p_cost_mv = a->p_cost_mv;
 774
 775         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 776         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 777
 778         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
 779         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
 780
 781         x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
 782     }
 783
 784     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
 785                          a->l0.me4x4[i8x8][1].cost +
 786                          a->l0.me4x4[i8x8][2].cost +
 787                          a->l0.me4x4[i8x8][3].cost +
 788                          a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
 789     if( h->mb.b_chroma_me )
 790         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
 791 }
 792
 793 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 794 {
 795     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 796     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 797
 798     int i8x4;
 799
 800     /* XXX Needed for x264_mb_predict_mv */
 801     h->mb.i_partition = D_8x8;
 802
 803     for( i8x4 = 0; i8x4 < 2; i8x4++ )
 804     {
 805         const int idx = 4*i8x8 + 2*i8x4;
 806         const int x4 = block_idx_x[idx];
 807         const int y4 = block_idx_y[idx];
 808         const int i_mvc = (i8x4 == 0);
 809
 810         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
 811
 812         m->i_pixel = PIXEL_8x4;
 813         m->p_cost_mv = a->p_cost_mv;
 814
 815         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 816         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 817
 818         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
 819         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 820
 821         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
 822     }
 823
 824     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
 825                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
 826     if( h->mb.b_chroma_me )
 827         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
 828 }
 829
 830 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 831 {
 832     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 833     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 834
 835     int i4x8;
 836
 837     /* XXX Needed for x264_mb_predict_mv */
 838     h->mb.i_partition = D_8x8;
 839
 840     for( i4x8 = 0; i4x8 < 2; i4x8++ )
 841     {
 842         const int idx = 4*i8x8 + i4x8;
 843         const int x4 = block_idx_x[idx];
 844         const int y4 = block_idx_y[idx];
 845         const int i_mvc = (i4x8 == 0);
 846
 847         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
 848
 849         m->i_pixel = PIXEL_4x8;
 850         m->p_cost_mv = a->p_cost_mv;
 851
 852         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 853         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 854
 855         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
 856         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 857
 858         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
 859     }
 860
 861     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
 862                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
 863     if( h->mb.b_chroma_me )
 864         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
 865 }
 866
 867 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 868 {
 869     /* Assumes that fdec still contains the results of
 870      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
 871
 872     uint8_t **p_fenc = h->mb.pic.p_fenc;
 873     uint8_t **p_fdec = h->mb.pic.p_fdec;
 874     int i_stride= h->mb.pic.i_stride[0];
 875     int i;
 876
 877     a->i_cost16x16direct = 0;
 878     for( i = 0; i < 4; i++ )
 879     {
 880         const int x8 = i%2;
 881         const int y8 = i/2;
 882         const int off = 8 * x8 + 8 * i_stride * y8;
 883         a->i_cost16x16direct +=
 884         a->i_cost8x8direct[i] =
 885             h->pixf.satd[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
 886
 887         /* mb type cost */
 888         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
 889     }
 890
 891     a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
 892 }
 893
 894 #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
 895     { \
 896         if( h->param.analyse.b_weighted_bipred ) \
 897             h->pixf.avg_weight[size]( pix1, stride1, src2, stride2, \
 898                     h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
 899         else \
 900             h->pixf.avg[size]( pix1, stride1, src2, stride2 ); \
 901     }
 902
 903 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 904 {
 905     uint8_t pix1[16*16], pix2[16*16];
 906     uint8_t *src2;
 907     int stride2 = 16;
 908     int src2_ref, pix1_ref;
 909
 910     x264_me_t m;
 911     int i_ref;
 912     int mvc[5][2], i_mvc;
 913     int i_fullpel_thresh = INT_MAX;
 914     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 915
 916     /* 16x16 Search on all ref frame */
 917     m.i_pixel = PIXEL_16x16;
 918     m.p_cost_mv = a->p_cost_mv;
 919     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 920
 921     /* ME for List 0 */
 922     a->l0.me16x16.cost = INT_MAX;
 923     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 924     {
 925         /* search with ref */
 926         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 927         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 928         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 929         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 930
 931         /* add ref cost */
 932         m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
 933
 934         if( m.cost < a->l0.me16x16.cost )
 935         {
 936             a->l0.i_ref = i_ref;
 937             a->l0.me16x16 = m;
 938         }
 939
 940         /* save mv for predicting neighbors */
 941         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 942         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 943     }
 944     /* subtract ref cost, so we don't have to add it for the other MB types */
 945     a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
 946
 947     /* ME for list 1 */
 948     i_fullpel_thresh = INT_MAX;
 949     p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL;
 950     a->l1.me16x16.cost = INT_MAX;
 951     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
 952     {
 953         /* search with ref */
 954         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
 955         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
 956         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
 957         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 958
 959         /* add ref cost */
 960         m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
 961
 962         if( m.cost < a->l1.me16x16.cost )
 963         {
 964             a->l1.i_ref = i_ref;
 965             a->l1.me16x16 = m;
 966         }
 967
 968         /* save mv for predicting neighbors */
 969         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 970         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 971     }
 972     /* subtract ref cost, so we don't have to add it for the other MB types */
 973     a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
 974
 975     /* Set global ref, needed for other modes? */
 976     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
 977     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
 978
 979     /* get cost of BI mode */
 980     if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
 981     {
 982         /* l0 reference is halfpel, so get_ref on it will make it faster */
 983         src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
 984                         pix2, &stride2,
 985                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
 986                         16, 16 );
 987         h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
 988                         pix1, 16,
 989                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
 990                         16, 16 );
 991         src2_ref = a->l0.i_ref;
 992         pix1_ref = a->l1.i_ref;
 993     }
 994     else
 995     {
 996         /* if l0 was qpel, we'll use get_ref on l1 instead */
 997         h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
 998                         pix1, 16,
 999                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
1000                         16, 16 );
1001         src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1002                         pix2, &stride2,
1003                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1004                         16, 16 );
1005         src2_ref = a->l1.i_ref;
1006         pix1_ref = a->l0.i_ref;
1007     }
1008
1009     if( h->param.analyse.b_weighted_bipred )
1010         h->pixf.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
1011                 h->mb.bipred_weight[pix1_ref][src2_ref] );
1012     else
1013         h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
1014
1015     a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
1016                      + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
1017                                      + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
1018                      + a->l0.me16x16.cost_mv
1019                      + a->l1.me16x16.cost_mv;
1020
1021     /* mb type cost */
1022     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1023     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1024     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1025 }
1026
/* Write the ref/mv cache entries of one B partition for both lists.
 *
 *  x,y,dx,dy: position and size of the partition, passed straight through
 *             to the x264_macroblock_cache_* helpers
 *  me0, me1:  list 0 / list 1 motion estimation results (x264_me_t lvalues)
 *  part:      partition type, looked up in x264_mb_partition_listX_table
 *
 * For a list the partition type uses, the list's i_ref and the mv from meN
 * are cached; for an unused list, ref -1 and a zero mv are cached, plus a
 * zero mvd when b_mvd is set.
 *
 * Relies on `h`, `a` and `b_mvd` being in scope at the expansion site.
 * Wrapped in do/while(0) so that it expands to exactly one statement. */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    do { \
        if( x264_mb_partition_listX_table[0][part] ) \
        { \
            x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 0, a->l0.i_ref ); \
            x264_macroblock_cache_mv(  h, (x),(y),(dx),(dy), 0, (me0).mv[0], (me0).mv[1] ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 0, -1 ); \
            x264_macroblock_cache_mv(  h, (x),(y),(dx),(dy), 0, 0, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, (x),(y),(dx),(dy), 0, 0, 0 ); \
        } \
        if( x264_mb_partition_listX_table[1][part] ) \
        { \
            x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 1, a->l1.i_ref ); \
            x264_macroblock_cache_mv(  h, (x),(y),(dx),(dy), 1, (me1).mv[0], (me1).mv[1] ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 1, -1 ); \
            x264_macroblock_cache_mv(  h, (x),(y),(dx),(dy), 1, 0, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, (x),(y),(dx),(dy), 1, 0, 0 ); \
        } \
    } while( 0 )
1052
1053 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1054 {
1055     int x = (i%2)*2;
1056     int y = (i/2)*2;
1057     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1058     {
1059         x264_mb_load_mv_direct8x8( h, i );
1060         if( b_mvd )
1061         {
1062             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0, 0 );
1063             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0, 0 );
1064             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1065         }
1066     }
1067     else
1068     {
1069         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1070     }
1071 }
1072 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1073 {
1074     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1075 }
1076 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1077 {
1078     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1079 }
1080 #undef CACHE_MV_BI
1081
1082 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1083 {
1084     uint8_t **p_fref[2] =
1085         { h->mb.pic.p_fref[0][a->l0.i_ref],
1086           h->mb.pic.p_fref[1][a->l1.i_ref] };
1087     uint8_t pix[2][8*8];
1088     int i, l;
1089
1090     /* XXX Needed for x264_mb_predict_mv */
1091     h->mb.i_partition = D_8x8;
1092
1093     a->i_cost8x8bi = 0;
1094
1095     for( i = 0; i < 4; i++ )
1096     {
1097         const int x8 = i%2;
1098         const int y8 = i/2;
1099         int i_part_cost;
1100         int i_part_cost_bi = 0;
1101
1102         for( l = 0; l < 2; l++ )
1103         {
1104             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1105             x264_me_t *m = &lX->me8x8[i];
1106
1107             m->i_pixel = PIXEL_8x8;
1108             m->p_cost_mv = a->p_cost_mv;
1109
1110             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1111             LOAD_HPELS( m, p_fref[l], 8*x8, 8*y8 );
1112
1113             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1114             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1115
1116             x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
1117
1118             /* BI mode */
1119             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1120                             m->mv[0], m->mv[1], 8, 8 );
1121             i_part_cost_bi += m->cost_mv;
1122             /* FIXME: ref cost */
1123         }
1124
1125         WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
1126         i_part_cost_bi += h->pixf.satd[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
1127                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1128         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1129         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1130
1131         i_part_cost = a->l0.me8x8[i].cost;
1132         h->mb.i_sub_partition[i] = D_L0_8x8;
1133         if( a->l1.me8x8[i].cost < i_part_cost )
1134         {
1135             i_part_cost = a->l1.me8x8[i].cost;
1136             h->mb.i_sub_partition[i] = D_L1_8x8;
1137         }
1138         if( i_part_cost_bi < i_part_cost )
1139         {
1140             i_part_cost = i_part_cost_bi;
1141             h->mb.i_sub_partition[i] = D_BI_8x8;
1142         }
1143         if( a->i_cost8x8direct[i] < i_part_cost )
1144         {
1145             i_part_cost = a->i_cost8x8direct[i];
1146             h->mb.i_sub_partition[i] = D_DIRECT_8x8;
1147         }
1148         a->i_cost8x8bi += i_part_cost;
1149
1150         /* XXX Needed for x264_mb_predict_mv */
1151         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1152     }
1153
1154     /* mb type cost */
1155     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1156 }
1157
1158 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1159 {
1160     uint8_t **p_fref[2] =
1161         { h->mb.pic.p_fref[0][a->l0.i_ref],
1162           h->mb.pic.p_fref[1][a->l1.i_ref] };
1163     uint8_t pix[2][16*8];
1164     int mvc[2][2];
1165     int i, l;
1166
1167     h->mb.i_partition = D_16x8;
1168     a->i_cost16x8bi = 0;
1169
1170     for( i = 0; i < 2; i++ )
1171     {
1172         int i_part_cost;
1173         int i_part_cost_bi = 0;
1174
1175         /* TODO: check only the list(s) that were used in b8x8? */
1176         for( l = 0; l < 2; l++ )
1177         {
1178             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1179             x264_me_t *m = &lX->me16x8[i];
1180
1181             m->i_pixel = PIXEL_16x8;
1182             m->p_cost_mv = a->p_cost_mv;
1183
1184             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1185             LOAD_HPELS( m, p_fref[l], 0, 8*i );
1186
1187             mvc[0][0] = lX->me8x8[2*i].mv[0];
1188             mvc[0][1] = lX->me8x8[2*i].mv[1];
1189             mvc[1][0] = lX->me8x8[2*i+1].mv[0];
1190             mvc[1][1] = lX->me8x8[2*i+1].mv[1];
1191
1192             x264_mb_predict_mv( h, 0, 8*i, 2, m->mvp );
1193             x264_me_search( h, m, mvc, 2 );
1194
1195             /* BI mode */
1196             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
1197                             m->mv[0], m->mv[1], 16, 8 );
1198             /* FIXME: ref cost */
1199             i_part_cost_bi += m->cost_mv;
1200         }
1201
1202         WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
1203         i_part_cost_bi += h->pixf.satd[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
1204
1205         i_part_cost = a->l0.me16x8[i].cost;
1206         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1207         if( a->l1.me16x8[i].cost < i_part_cost )
1208         {
1209             i_part_cost = a->l1.me16x8[i].cost;
1210             a->i_mb_partition16x8[i] = D_L1_8x8;
1211         }
1212         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1213         {
1214             i_part_cost = i_part_cost_bi;
1215             a->i_mb_partition16x8[i] = D_BI_8x8;
1216         }
1217         a->i_cost16x8bi += i_part_cost;
1218
1219         if( i == 0 )
1220             x264_mb_cache_mv_b16x8( h, a, i, 0 );
1221     }
1222
1223     /* mb type cost */
1224     a->i_mb_type16x8 = B_L0_L0
1225         + (a->i_mb_partition16x8[0]>>2) * 3
1226         + (a->i_mb_partition16x8[1]>>2);
1227     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1228 }
1229 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1230 {
1231     uint8_t **p_fref[2] =
1232         { h->mb.pic.p_fref[0][a->l0.i_ref],
1233           h->mb.pic.p_fref[1][a->l1.i_ref] };
1234     uint8_t pix[2][8*16];
1235     int mvc[2][2];
1236     int i, l;
1237
1238     h->mb.i_partition = D_8x16;
1239     a->i_cost8x16bi = 0;
1240
1241     for( i = 0; i < 2; i++ )
1242     {
1243         int i_part_cost;
1244         int i_part_cost_bi = 0;
1245
1246         for( l = 0; l < 2; l++ )
1247         {
1248             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1249             x264_me_t *m = &lX->me8x16[i];
1250
1251             m->i_pixel = PIXEL_8x16;
1252             m->p_cost_mv = a->p_cost_mv;
1253
1254             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1255             LOAD_HPELS( m, p_fref[l], 8*i, 0 );
1256
1257             mvc[0][0] = lX->me8x8[i].mv[0];
1258             mvc[0][1] = lX->me8x8[i].mv[1];
1259             mvc[1][0] = lX->me8x8[i+2].mv[0];
1260             mvc[1][1] = lX->me8x8[i+2].mv[1];
1261
1262             x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1263             x264_me_search( h, m, mvc, 2 );
1264
1265             /* BI mode */
1266             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1267                             m->mv[0], m->mv[1], 8, 16 );
1268             /* FIXME: ref cost */
1269             i_part_cost_bi += m->cost_mv;
1270         }
1271
1272         WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
1273         i_part_cost_bi += h->pixf.satd[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
1274
1275         i_part_cost = a->l0.me8x16[i].cost;
1276         a->i_mb_partition8x16[i] = D_L0_8x8;
1277         if( a->l1.me8x16[i].cost < i_part_cost )
1278         {
1279             i_part_cost = a->l1.me8x16[i].cost;
1280             a->i_mb_partition8x16[i] = D_L1_8x8;
1281         }
1282         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1283         {
1284             i_part_cost = i_part_cost_bi;
1285             a->i_mb_partition8x16[i] = D_BI_8x8;
1286         }
1287         a->i_cost8x16bi += i_part_cost;
1288
1289         if( i == 0 )
1290             x264_mb_cache_mv_b8x16( h, a, i, 0 );
1291     }
1292
1293     /* mb type cost */
1294     a->i_mb_type8x16 = B_L0_L0
1295         + (a->i_mb_partition8x16[0]>>2) * 3
1296         + (a->i_mb_partition8x16[1]>>2);
1297     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
1298 }
1299
1300 /*****************************************************************************
1301  * x264_macroblock_analyse:
1302  *****************************************************************************/
1303 void x264_macroblock_analyse( x264_t *h )
1304 {
1305     x264_mb_analysis_t analysis;
1306     int i;
1307
1308     h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp(h);
1309
1310     /* prevent QP from varying too fast. FIXME what's a sane limit? */
1311     h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->mb.qp[h->mb.i_mb_xy],
1312                                           h->mb.i_last_qp - 12, h->mb.i_last_qp + 12 );
1313
1314     /* init analysis */
1315     x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
1316
1317     /*--------------------------- Do the analysis ---------------------------*/
1318     if( h->sh.i_type == SLICE_TYPE_I )
1319     {
1320         x264_mb_analyse_intra( h, &analysis, COST_MAX );
1321
1322         if( analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
1323             h->mb.i_type = I_4x4;
1324         else
1325             h->mb.i_type = I_16x16;
1326     }
1327     else if( h->sh.i_type == SLICE_TYPE_P )
1328     {
1329         const unsigned int i_neighbour = h->mb.i_neighbour;
1330
1331         int b_skip = 0;
1332         int i_cost;
1333         int i_intra_cost, i_intra_type;
1334
1335         /* Fast P_SKIP detection */
1336         if( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
1337             ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
1338             ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
1339             ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) )
1340         {
1341             b_skip = x264_macroblock_probe_pskip( h );
1342         }
1343
1344         if( b_skip )
1345         {
1346             h->mb.i_type = P_SKIP;
1347             h->mb.i_partition = D_16x16;
1348         }
1349         else
1350         {
1351             const unsigned int flags = h->param.analyse.inter;
1352             int i_type;
1353             int i_partition;
1354
1355             x264_mb_analyse_load_costs( h, &analysis );
1356
1357             x264_mb_analyse_inter_p16x16( h, &analysis );
1358             if( flags & X264_ANALYSE_PSUB16x16 )
1359                 x264_mb_analyse_inter_p8x8( h, &analysis );
1360
1361             /* Select best inter mode */
1362             i_type = P_L0;
1363             i_partition = D_16x16;
1364             i_cost = analysis.l0.me16x16.cost;
1365
1366             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1367                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
1368             {
1369                 int i;
1370
1371                 i_type = P_8x8;
1372                 i_partition = D_8x8;
1373                 h->mb.i_sub_partition[0] = D_L0_8x8;
1374                 h->mb.i_sub_partition[1] = D_L0_8x8;
1375                 h->mb.i_sub_partition[2] = D_L0_8x8;
1376                 h->mb.i_sub_partition[3] = D_L0_8x8;
1377
1378                 i_cost = analysis.l0.i_cost8x8;
1379
1380                 /* Do sub 8x8 */
1381                 if( flags & X264_ANALYSE_PSUB8x8 )
1382                 {
1383                     for( i = 0; i < 4; i++ )
1384                     {
1385                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
1386                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
1387                         {
1388                             int i_cost8x8;
1389
1390                             h->mb.i_sub_partition[i] = D_L0_4x4;
1391                             i_cost8x8 = analysis.l0.i_cost4x4[i];
1392
1393                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
1394                             if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
1395                             {
1396                                 h->mb.i_sub_partition[i] = D_L0_8x4;
1397                                 i_cost8x8 = analysis.l0.i_cost8x4[i];
1398                             }
1399
1400                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
1401                             if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
1402                             {
1403                                 h->mb.i_sub_partition[i] = D_L0_4x8;
1404                                 i_cost8x8 = analysis.l0.i_cost4x8[i];
1405                             }
1406
1407                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
1408                         }
1409                     }
1410                 }
1411
1412                 /* Now do sub 16x8/8x16 */
1413                 x264_mb_analyse_inter_p16x8( h, &analysis );
1414                 if( analysis.l0.i_cost16x8 < i_cost )
1415                 {
1416                     i_type = P_L0;
1417                     i_partition = D_16x8;
1418                     i_cost = analysis.l0.i_cost16x8;
1419                 }
1420
1421                 x264_mb_analyse_inter_p8x16( h, &analysis );
1422                 if( analysis.l0.i_cost8x16 < i_cost )
1423                 {
1424                     i_type = P_L0;
1425                     i_partition = D_8x16;
1426                     i_cost = analysis.l0.i_cost8x16;
1427                 }
1428             }
1429
1430             h->mb.i_type = i_type;
1431             h->mb.i_partition = i_partition;
1432
1433             /* refine qpel */
1434             if( h->mb.i_partition == D_16x16 )
1435             {
1436                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1437                 i_cost = analysis.l0.me16x16.cost;
1438             }
1439             else if( h->mb.i_partition == D_16x8 )
1440             {
1441                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
1442                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
1443                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
1444             }
1445             else if( h->mb.i_partition == D_8x16 )
1446             {
1447                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
1448                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
1449                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
1450             }
1451             else if( h->mb.i_partition == D_8x8 )
1452             {
1453                 int i8x8;
1454                 i_cost = 0;
1455                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
1456                 {
1457                     switch( h->mb.i_sub_partition[i8x8] )
1458                     {
1459                         case D_L0_8x8:
1460                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
1461                             i_cost += analysis.l0.me8x8[i8x8].cost;
1462                             break;
1463                         case D_L0_8x4:
1464                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
1465                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
1466                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
1467                                       analysis.l0.me8x4[i8x8][1].cost;
1468                             break;
1469                         case D_L0_4x8:
1470                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
1471                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
1472                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
1473                                       analysis.l0.me4x8[i8x8][1].cost;
1474                             break;
1475
1476                         case D_L0_4x4:
1477                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
1478                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
1479                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
1480                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
1481                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
1482                                       analysis.l0.me4x4[i8x8][1].cost +
1483                                       analysis.l0.me4x4[i8x8][2].cost +
1484                                       analysis.l0.me4x4[i8x8][3].cost;
1485                             break;
1486                         default:
1487                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
1488                             break;
1489                     }
1490                 }
1491             }
1492
1493             x264_mb_analyse_intra( h, &analysis, i_cost );
1494             if( h->mb.b_chroma_me &&
1495                 ( analysis.i_sad_i16x16 < i_cost
1496              || ( analysis.i_sad_i4x4 < i_cost )))
1497             {
1498                 x264_mb_analyse_intra_chroma( h, &analysis );
1499                 analysis.i_sad_i16x16 += analysis.i_sad_i8x8;
1500                 analysis.i_sad_i4x4 += analysis.i_sad_i8x8;
1501             }
1502
1503             i_intra_type = I_16x16;
1504             i_intra_cost = analysis.i_sad_i16x16;
1505
1506             if( analysis.i_sad_i4x4 < i_intra_cost )
1507             {
1508                 i_intra_type = I_4x4;
1509                 i_intra_cost = analysis.i_sad_i4x4;
1510             }
1511
1512             if( i_intra_cost < i_cost )
1513             {
1514                 h->mb.i_type = i_intra_type;
1515                 i_cost = i_intra_cost;
1516             }
1517
1518             h->stat.frame.i_intra_cost += i_intra_cost;
1519             h->stat.frame.i_inter_cost += i_cost;
1520         }
1521     }
1522     else if( h->sh.i_type == SLICE_TYPE_B )
1523     {
1524         int b_skip = 0;
1525
1526         analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h );
1527         if( analysis.b_direct_available )
1528         {
1529             h->mb.i_type = B_SKIP;
1530             x264_mb_mc( h );
1531
1532             /* Conditioning the probe on neighboring block types
1533              * doesn't seem to help speed or quality. */
1534             b_skip = x264_macroblock_probe_bskip( h );
1535         }
1536
1537         if( !b_skip )
1538         {
1539             const unsigned int flags = h->param.analyse.inter;
1540             int i_partition;
1541             int i_cost;
1542
1543             x264_mb_analyse_load_costs( h, &analysis );
1544
1545             /* select best inter mode */
1546             /* direct must be first */
1547             if( analysis.b_direct_available )
1548                 x264_mb_analyse_inter_direct( h, &analysis );
1549
1550             x264_mb_analyse_inter_b16x16( h, &analysis );
1551
1552             h->mb.i_type = B_L0_L0;
1553             i_partition = D_16x16;
1554             i_cost = analysis.l0.me16x16.cost;
1555             if( analysis.l1.me16x16.cost < i_cost )
1556             {
1557                 h->mb.i_type = B_L1_L1;
1558                 i_cost = analysis.l1.me16x16.cost;
1559             }
1560             if( analysis.i_cost16x16bi < i_cost )
1561             {
1562                 h->mb.i_type = B_BI_BI;
1563                 i_cost = analysis.i_cost16x16bi;
1564             }
1565             if( analysis.i_cost16x16direct < i_cost )
1566             {
1567                 h->mb.i_type = B_DIRECT;
1568                 i_cost = analysis.i_cost16x16direct;
1569             }
1570
1571             if( flags & X264_ANALYSE_BSUB16x16 )
1572             {
1573                 x264_mb_analyse_inter_b8x8( h, &analysis );
1574                 if( analysis.i_cost8x8bi < i_cost )
1575                 {
1576                     h->mb.i_type = B_8x8;
1577                     i_partition = D_8x8;
1578                     i_cost = analysis.i_cost8x8bi;
1579
1580                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
1581                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
1582                     {
1583                         x264_mb_analyse_inter_b16x8( h, &analysis );
1584                         if( analysis.i_cost16x8bi < i_cost )
1585                         {
1586                             i_partition = D_16x8;
1587                             i_cost = analysis.i_cost16x8bi;
1588                             h->mb.i_type = analysis.i_mb_type16x8;
1589                         }
1590                     }
1591                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
1592                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
1593                     {
1594                         x264_mb_analyse_inter_b8x16( h, &analysis );
1595                         if( analysis.i_cost8x16bi < i_cost )
1596                         {
1597                             i_partition = D_8x16;
1598                             i_cost = analysis.i_cost8x16bi;
1599                             h->mb.i_type = analysis.i_mb_type8x16;
1600                         }
1601                     }
1602                 }
1603             }
1604
1605             h->mb.i_partition = i_partition;
1606
1607             /* refine qpel */
1608             if( i_partition == D_16x16 )
1609             {
1610                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1611                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1612                 if( h->mb.i_type == B_L0_L0 )
1613                 {
1614                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1615                     i_cost = analysis.l0.me16x16.cost
1616                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1617                 }
1618                 else if( h->mb.i_type == B_L1_L1 )
1619                 {
1620                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1621                     i_cost = analysis.l1.me16x16.cost
1622                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1623                 }
1624                 else if( h->mb.i_type == B_BI_BI )
1625                 {
1626                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1627                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1628                 }
1629             }
1630             else if( i_partition == D_16x8 )
1631             {
1632                 for( i=0; i<2; i++ )
1633                 {
1634                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
1635                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
1636                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
1637                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
1638                 }
1639             }
1640             else if( i_partition == D_8x16 )
1641             {
1642                 for( i=0; i<2; i++ )
1643                 {
1644                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
1645                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
1646                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
1647                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
1648                 }
1649             }
1650             else if( i_partition == D_8x8 )
1651             {
1652                 for( i=0; i<4; i++ )
1653                 {
1654                     x264_me_t *m;
1655                     int i_part_cost_old;
1656                     int i_type_cost;
1657                     int i_part_type = h->mb.i_sub_partition[i];
1658                     int b_bidir = (i_part_type == D_BI_8x8);
1659
1660                     if( i_part_type == D_DIRECT_8x8 )
1661                         continue;
1662                     if( x264_mb_partition_listX_table[0][i_part_type] )
1663                     {
1664                         m = &analysis.l0.me8x8[i];
1665                         i_part_cost_old = m->cost;
1666                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1667                         m->cost -= i_type_cost;
1668                         x264_me_refine_qpel( h, m );
1669                         if( !b_bidir )
1670                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1671                     }
1672                     if( x264_mb_partition_listX_table[1][i_part_type] )
1673                     {
1674                         m = &analysis.l1.me8x8[i];
1675                         i_part_cost_old = m->cost;
1676                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1677                         m->cost -= i_type_cost;
1678                         x264_me_refine_qpel( h, m );
1679                         if( !b_bidir )
1680                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1681                     }
1682                     /* TODO: update mvp? */
1683                 }
1684             }
1685
1686             /* best intra mode */
1687             x264_mb_analyse_intra( h, &analysis, i_cost );
1688
1689             if( analysis.i_sad_i16x16 < i_cost )
1690             {
1691                 h->mb.i_type = I_16x16;
1692                 i_cost = analysis.i_sad_i16x16;
1693             }
1694             if( analysis.i_sad_i4x4 < i_cost )
1695             {
1696                 h->mb.i_type = I_4x4;
1697                 i_cost = analysis.i_sad_i4x4;
1698             }
1699         }
1700     }
1701
1702     /*-------------------- Update MB from the analysis ----------------------*/
1703     h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
1704     switch( h->mb.i_type )
1705     {
1706         case I_4x4:
1707             for( i = 0; i < 16; i++ )
1708             {
1709                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
1710                     analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
1711             }
1712
1713             x264_mb_analyse_intra_chroma( h, &analysis );
1714             h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
1715             break;
1716         case I_16x16:
1717             h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
1718
1719             x264_mb_analyse_intra_chroma( h, &analysis );
1720             h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
1721             break;
1722
1723         case P_L0:
1724             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1725             switch( h->mb.i_partition )
1726             {
1727                 case D_16x16:
1728                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1729                     break;
1730
1731                 case D_16x8:
1732                     x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
1733                     x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
1734                     break;
1735
1736                 case D_8x16:
1737                     x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
1738                     x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
1739                     break;
1740
1741                 default:
1742                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
1743                     break;
1744             }
1745             break;
1746
1747         case P_8x8:
1748             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1749             for( i = 0; i < 4; i++ )
1750             {
1751                 const int x = 2*(i%2);
1752                 const int y = 2*(i/2);
1753
1754                 switch( h->mb.i_sub_partition[i] )
1755                 {
1756                     case D_L0_8x8:
1757                         x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
1758                         break;
1759                     case D_L0_8x4:
1760                         x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
1761                         x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
1762                         break;
1763                     case D_L0_4x8:
1764                         x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
1765                         x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
1766                         break;
1767                     case D_L0_4x4:
1768                         x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
1769                         x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
1770                         x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
1771                         x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
1772                         break;
1773                     default:
1774                         x264_log( h, X264_LOG_ERROR, "internal error\n" );
1775                         break;
1776                 }
1777             }
1778             break;
1779
1780         case P_SKIP:
1781         {
1782             int mvp[2];
1783             x264_mb_predict_mv_pskip( h, mvp );
1784             /* */
1785             h->mb.i_partition = D_16x16;
1786             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
1787             x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
1788             break;
1789         }
1790
1791         case B_SKIP:
1792             /* nothing has changed since x264_macroblock_probe_bskip */
1793             break;
1794         case B_DIRECT:
1795             x264_mb_load_mv_direct8x8( h, 0 );
1796             x264_mb_load_mv_direct8x8( h, 1 );
1797             x264_mb_load_mv_direct8x8( h, 2 );
1798             x264_mb_load_mv_direct8x8( h, 3 );
1799             break;
1800
1801         case B_8x8:
1802             /* optimize: cache might not need to be rewritten */
1803             for( i = 0; i < 4; i++ )
1804                 x264_mb_cache_mv_b8x8( h, &analysis, i, 1 );
1805             break;
1806
1807         default: /* the rest of the B types */
1808             switch( h->mb.i_partition )
1809             {
1810             case D_16x16:
1811                 switch( h->mb.i_type )
1812                 {
1813                 case B_L0_L0:
1814                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1815                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1816
1817                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
1818                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
1819                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
1820                     break;
1821                 case B_L1_L1:
1822                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
1823                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
1824                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
1825
1826                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1827                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
1828                     break;
1829                 case B_BI_BI:
1830                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1831                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1832
1833                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1834                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
1835                     break;
1836                 }
1837                 break;
1838             case D_16x8:
1839                 x264_mb_cache_mv_b16x8( h, &analysis, 0, 1 );
1840                 x264_mb_cache_mv_b16x8( h, &analysis, 1, 1 );
1841                 break;
1842             case D_8x16:
1843                 x264_mb_cache_mv_b8x16( h, &analysis, 0, 1 );
1844                 x264_mb_cache_mv_b8x16( h, &analysis, 1, 1 );
1845                 break;
1846             default:
1847                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
1848                 break;
1849             }
1850     }
1851 }
1852
1853 #include "slicetype_decision.c"
1854