git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27 #include <math.h>
  28 #include <limits.h>
  29
  30 #include "common/common.h"
  31 #include "common/macroblock.h"
  32 #include "macroblock.h"
  33 #include "me.h"
  34 #include "ratecontrol.h"
  35
  36 typedef struct
  37 {
  38     /* 16x16 */
  39     int i_ref;
  40     x264_me_t me16x16;
  41
  42     /* 8x8 */
  43     int       i_cost8x8;
  44     x264_me_t me8x8[4];
  45
  46     /* Sub 4x4 */
  47     int       i_cost4x4[4]; /* cost per 8x8 partition */
  48     x264_me_t me4x4[4][4];
  49
  50     /* Sub 8x4 */
  51     int       i_cost8x4[4]; /* cost per 8x8 partition */
  52     x264_me_t me8x4[4][2];
  53
  54     /* Sub 4x8 */
  55     int       i_cost4x8[4]; /* cost per 8x8 partition */
  56     x264_me_t me4x8[4][4];
  57
  58     /* 16x8 */
  59     int       i_cost16x8;
  60     x264_me_t me16x8[2];
  61
  62     /* 8x16 */
  63     int       i_cost8x16;
  64     x264_me_t me8x16[2];
  65
  66 } x264_mb_analysis_list_t;
  67
  68 typedef struct
  69 {
  70     /* conduct the analysis using this lamda and QP */
  71     int i_lambda;
  72     int i_qp;
  73     int16_t *p_cost_mv;
  74
  75
  76     /* I: Intra part */
  77     /* Take some shortcuts in intra search if intra is deemed unlikely */
  78     int b_fast_intra;
  79
  80     /* Luma part 16x16 and 4x4 modes stats */
  81     int i_sad_i16x16;
  82     int i_predict16x16;
  83
  84     int i_sad_i4x4;
  85     int i_predict4x4[4][4];
  86
  87     /* Chroma part */
  88     int i_sad_i8x8;
  89     int i_predict8x8;
  90
  91     /* II: Inter part P/B frame */
  92     x264_mb_analysis_list_t l0;
  93     x264_mb_analysis_list_t l1;
  94
  95     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
  96     int i_cost16x16direct;
  97     int i_cost8x8bi;
  98     int i_cost8x8direct[4];
  99     int i_cost16x8bi;
 100     int i_cost8x16bi;
 101
 102     int i_mb_partition16x8[2]; /* mb_partition_e */
 103     int i_mb_partition8x16[2];
 104     int i_mb_type16x8; /* mb_class_e */
 105     int i_mb_type8x16;
 106
 107     int b_direct_available;
 108
 109 } x264_mb_analysis_t;
 110
 111 static const int i_qp0_cost_table[52] = {
 112    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 113    1, 1, 1, 1,              /*  8-11 */
 114    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 115    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 116    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 117   16,18,20,23,25,29,32,36,  /* 36-43 */
 118   40,45,51,57,64,72,81,91   /* 44-51 */
 119 };
 120
 121 static const uint8_t block_idx_x[16] = {
 122     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
 123 };
 124 static const uint8_t block_idx_y[16] = {
 125     0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
 126 };
 127
 128 /* TODO: calculate CABAC costs */
 129 static const int i_mb_b_cost_table[18] = {
 130     9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 131 };
 132 static const int i_mb_b16x8_cost_table[16] = {
 133     0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 134 };
 135 static const int i_sub_mb_b_cost_table[13] = {
 136     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 137 };
 138 static const int i_sub_mb_p_cost_table[4] = {
 139     5, 3, 3, 1
 140 };
 141
 142 /* initialize an array of lambda*nbits for all possible mvs */
 143 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 144 {
 145     static int16_t *p_cost_mv[52];
 146
 147     if( !p_cost_mv[a->i_qp] )
 148     {
 149         /* could be faster, but isn't called many times */
 150         /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 151         int i;
 152         p_cost_mv[a->i_qp] = x264_malloc( (4*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) );
 153         p_cost_mv[a->i_qp] += 2*4*h->param.analyse.i_mv_range;
 154         for( i = 0; i <= 2*4*h->param.analyse.i_mv_range; i++ )
 155         {
 156             p_cost_mv[a->i_qp][-i] =
 157             p_cost_mv[a->i_qp][i]  = a->i_lambda * bs_size_se( i );
 158         }
 159     }
 160
 161     a->p_cost_mv = p_cost_mv[a->i_qp];
 162 }
 163
 164 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 165 {
 166     memset( a, 0, sizeof( x264_mb_analysis_t ) );
 167
 168     /* conduct the analysis using this lamda and QP */
 169     a->i_qp = i_qp;
 170     a->i_lambda = i_qp0_cost_table[i_qp];
 171
 172     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 173     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 174                         && h->mb.i_subpel_refine >= 5;
 175     a->b_fast_intra = 0;
 176
 177     /* I: Intra part */
 178     a->i_sad_i16x16 =
 179     a->i_sad_i4x4   =
 180     a->i_sad_i8x8   = COST_MAX;
 181
 182     /* II: Inter part P/B frame */
 183     if( h->sh.i_type != SLICE_TYPE_I )
 184     {
 185         int i;
 186         int i_fmv_range = h->param.analyse.i_mv_range - 16;
 187
 188         /* Calculate max allowed MV range */
 189 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
 190         h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
 191         h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 );
 192         h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
 193         h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
 194         if( h->mb.i_mb_x == 0)
 195         {
 196             h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
 197             h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 );
 198             h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
 199             h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
 200         }
 201 #undef CLIP_FMV
 202
 203         a->l0.me16x16.cost =
 204         a->l0.i_cost8x8    = COST_MAX;
 205
 206         for( i = 0; i < 4; i++ )
 207         {
 208             a->l0.i_cost4x4[i] =
 209             a->l0.i_cost8x4[i] =
 210             a->l0.i_cost4x8[i] = COST_MAX;
 211         }
 212
 213         a->l0.i_cost16x8   =
 214         a->l0.i_cost8x16   = COST_MAX;
 215         if( h->sh.i_type == SLICE_TYPE_B )
 216         {
 217             a->l1.me16x16.cost =
 218             a->l1.i_cost8x8    = COST_MAX;
 219
 220             for( i = 0; i < 4; i++ )
 221             {
 222                 a->l1.i_cost4x4[i] =
 223                 a->l1.i_cost8x4[i] =
 224                 a->l1.i_cost4x8[i] =
 225                 a->i_cost8x8direct[i] = COST_MAX;
 226             }
 227
 228             a->l1.i_cost16x8   =
 229             a->l1.i_cost8x16   =
 230
 231             a->i_cost16x16bi   =
 232             a->i_cost16x16direct =
 233             a->i_cost8x8bi     =
 234             a->i_cost16x8bi    =
 235             a->i_cost8x16bi    = COST_MAX;
 236         }
 237
 238         /* Fast intra decision */
 239         if( h->mb.i_mb_xy > 4 )
 240         {
 241             const unsigned int i_neighbour = h->mb.i_neighbour;
 242             if(   ((i_neighbour&MB_LEFT) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - 1] ))
 243                || ((i_neighbour&MB_TOP) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] ))
 244                || (((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT)) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] ))
 245                || ((i_neighbour&MB_TOPRIGHT) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] ))
 246                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 247                || (h->mb.i_mb_xy < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_16x16])) )
 248             { /* intra is likely */ }
 249             else
 250             {
 251                 a->b_fast_intra = 1;
 252             }
 253         }
 254     }
 255 }
 256
 257
 258
 259 /*
 260  * Handle intra mb
 261  */
 262 /* Max = 4 */
 263 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 264 {
 265     if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
 266     {
 267         /* top and left avaible */
 268         *mode++ = I_PRED_16x16_V;
 269         *mode++ = I_PRED_16x16_H;
 270         *mode++ = I_PRED_16x16_DC;
 271         *mode++ = I_PRED_16x16_P;
 272         *pi_count = 4;
 273     }
 274     else if( ( i_neighbour & MB_LEFT ) )
 275     {
 276         /* left available*/
 277         *mode++ = I_PRED_16x16_DC_LEFT;
 278         *mode++ = I_PRED_16x16_H;
 279         *pi_count = 2;
 280     }
 281     else if( ( i_neighbour & MB_TOP ) )
 282     {
 283         /* top available*/
 284         *mode++ = I_PRED_16x16_DC_TOP;
 285         *mode++ = I_PRED_16x16_V;
 286         *pi_count = 2;
 287     }
 288     else
 289     {
 290         /* none avaible */
 291         *mode = I_PRED_16x16_DC_128;
 292         *pi_count = 1;
 293     }
 294 }
 295
 296 /* Max = 4 */
 297 static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 298 {
 299     if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
 300     {
 301         /* top and left avaible */
 302         *mode++ = I_PRED_CHROMA_V;
 303         *mode++ = I_PRED_CHROMA_H;
 304         *mode++ = I_PRED_CHROMA_DC;
 305         *mode++ = I_PRED_CHROMA_P;
 306         *pi_count = 4;
 307     }
 308     else if( ( i_neighbour & MB_LEFT ) )
 309     {
 310         /* left available*/
 311         *mode++ = I_PRED_CHROMA_DC_LEFT;
 312         *mode++ = I_PRED_CHROMA_H;
 313         *pi_count = 2;
 314     }
 315     else if( ( i_neighbour & MB_TOP ) )
 316     {
 317         /* top available*/
 318         *mode++ = I_PRED_CHROMA_DC_TOP;
 319         *mode++ = I_PRED_CHROMA_V;
 320         *pi_count = 2;
 321     }
 322     else
 323     {
 324         /* none avaible */
 325         *mode = I_PRED_CHROMA_DC_128;
 326         *pi_count = 1;
 327     }
 328 }
 329
 330 /* MAX = 8 */
 331 static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
 332 {
 333     int b_a, b_b, b_c;
 334     static const unsigned int needmb[16] =
 335     {
 336         MB_LEFT|MB_TOP, MB_TOP,
 337         MB_LEFT,        MB_PRIVATE,
 338         MB_TOP,         MB_TOP|MB_TOPRIGHT,
 339         0,              MB_PRIVATE,
 340         MB_LEFT,        0,
 341         MB_LEFT,        MB_PRIVATE,
 342         0,              MB_PRIVATE,
 343         0,              MB_PRIVATE
 344     };
 345
 346     /* FIXME even when b_c == 0 there is some case where missing pixels
 347      * are emulated and thus more mode are available TODO
 348      * analysis and encode should be fixed too */
 349     b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
 350     b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
 351     b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
 352
 353     if( b_a && b_b )
 354     {
 355         *mode++ = I_PRED_4x4_DC;
 356         *mode++ = I_PRED_4x4_H;
 357         *mode++ = I_PRED_4x4_V;
 358         *mode++ = I_PRED_4x4_DDR;
 359         *mode++ = I_PRED_4x4_VR;
 360         *mode++ = I_PRED_4x4_HD;
 361         *mode++ = I_PRED_4x4_HU;
 362
 363         *pi_count = 7;
 364
 365         if( b_c )
 366         {
 367             *mode++ = I_PRED_4x4_DDL;
 368             *mode++ = I_PRED_4x4_VL;
 369             (*pi_count) += 2;
 370         }
 371     }
 372     else if( b_a && !b_b )
 373     {
 374         *mode++ = I_PRED_4x4_DC_LEFT;
 375         *mode++ = I_PRED_4x4_H;
 376         *mode++ = I_PRED_4x4_HU;
 377         *pi_count = 3;
 378     }
 379     else if( !b_a && b_b )
 380     {
 381         *mode++ = I_PRED_4x4_DC_TOP;
 382         *mode++ = I_PRED_4x4_V;
 383         *pi_count = 2;
 384     }
 385     else
 386     {
 387         *mode++ = I_PRED_4x4_DC_128;
 388         *pi_count = 1;
 389     }
 390 }
 391
 392 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter )
 393 {
 394     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 395     const int i_stride = h->mb.pic.i_stride[0];
 396     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 397     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 398
 399     int i, idx;
 400
 401     int i_max;
 402     int predict_mode[9];
 403
 404     /*---------------- Try all mode and calculate their score ---------------*/
 405
 406     /* 16x16 prediction selection */
 407     predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 408     for( i = 0; i < i_max; i++ )
 409     {
 410         int i_sad;
 411         int i_mode;
 412
 413         i_mode = predict_mode[i];
 414
 415         /* we do the prediction */
 416         h->predict_16x16[i_mode]( p_dst, i_stride );
 417
 418         /* we calculate the diff and get the square sum of the diff */
 419         i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
 420                 res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 421         /* if i_score is lower it is better */
 422         if( res->i_sad_i16x16 > i_sad )
 423         {
 424             res->i_predict16x16 = i_mode;
 425             res->i_sad_i16x16     = i_sad;
 426         }
 427     }
 428     /* cavlc mb type prefix */
 429     if( h->sh.i_type == SLICE_TYPE_B )
 430         res->i_sad_i16x16 += res->i_lambda * i_mb_b_cost_table[I_16x16];
 431
 432     if( res->b_fast_intra )
 433     {
 434         if( res->i_sad_i16x16 > 2*i_cost_inter )
 435             return;
 436     }
 437
 438     /* 4x4 prediction selection */
 439     if( flags & X264_ANALYSE_I4x4 )
 440     {
 441         res->i_sad_i4x4 = 0;
 442         for( idx = 0; idx < 16; idx++ )
 443         {
 444             uint8_t *p_src_by;
 445             uint8_t *p_dst_by;
 446             int     i_best;
 447             int x, y;
 448             int i_pred_mode;
 449
 450             i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
 451             x = block_idx_x[idx];
 452             y = block_idx_y[idx];
 453
 454             p_src_by = p_src + 4 * x + 4 * y * i_stride;
 455             p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
 456
 457             i_best = COST_MAX;
 458             predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
 459             for( i = 0; i < i_max; i++ )
 460             {
 461                 int i_sad;
 462                 int i_mode;
 463
 464                 i_mode = predict_mode[i];
 465
 466                 /* we do the prediction */
 467                 h->predict_4x4[i_mode]( p_dst_by, i_stride );
 468
 469                 /* we calculate diff and get the square sum of the diff */
 470                 i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
 471                                                  p_src_by, i_stride );
 472
 473                 i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
 474
 475                 /* if i_score is lower it is better */
 476                 if( i_best > i_sad )
 477                 {
 478                     res->i_predict4x4[x][y] = i_mode;
 479                     i_best = i_sad;
 480                 }
 481             }
 482             res->i_sad_i4x4 += i_best;
 483
 484             /* we need to encode this mb now (for next ones) */
 485             h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
 486             x264_mb_encode_i4x4( h, idx, res->i_qp );
 487
 488             /* we need to store the 'fixed' version */
 489             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
 490                 x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
 491         }
 492         res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
 493         if( h->sh.i_type == SLICE_TYPE_B )
 494             res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
 495     }
 496 }
 497
 498 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
 499 {
 500     int i;
 501
 502     int i_max;
 503     int predict_mode[9];
 504
 505     uint8_t *p_dstc[2], *p_srcc[2];
 506     int      i_stride[2];
 507
 508     if( res->i_sad_i8x8 < COST_MAX )
 509         return;
 510
 511     /* 8x8 prediction selection for chroma */
 512     p_dstc[0] = h->mb.pic.p_fdec[1];
 513     p_dstc[1] = h->mb.pic.p_fdec[2];
 514     p_srcc[0] = h->mb.pic.p_fenc[1];
 515     p_srcc[1] = h->mb.pic.p_fenc[2];
 516
 517     i_stride[0] = h->mb.pic.i_stride[1];
 518     i_stride[1] = h->mb.pic.i_stride[2];
 519
 520     predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 521     res->i_sad_i8x8 = COST_MAX;
 522     for( i = 0; i < i_max; i++ )
 523     {
 524         int i_sad;
 525         int i_mode;
 526
 527         i_mode = predict_mode[i];
 528
 529         /* we do the prediction */
 530         h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
 531         h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
 532
 533         /* we calculate the cost */
 534         i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
 535                                          p_srcc[0], i_stride[0] ) +
 536                 h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
 537                                          p_srcc[1], i_stride[1] ) +
 538                 res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
 539
 540         /* if i_score is lower it is better */
 541         if( res->i_sad_i8x8 > i_sad )
 542         {
 543             res->i_predict8x8 = i_mode;
 544             res->i_sad_i8x8     = i_sad;
 545         }
 546     }
 547 }
 548
 549 #define LOAD_FENC( m, src, xoff, yoff) \
 550     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
 551     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
 552     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 553     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 554     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 555 #define LOAD_HPELS(m, src, xoff, yoff) \
 556     (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
 557     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
 558     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
 559     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
 560     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
 561     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 562
 563 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 564 {
 565     x264_me_t m;
 566     int i_ref;
 567     int mvc[4][2], i_mvc;
 568     int i_fullpel_thresh = INT_MAX;
 569     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 570
 571     /* 16x16 Search on all ref frame */
 572     m.i_pixel = PIXEL_16x16;
 573     m.p_cost_mv = a->p_cost_mv;
 574     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 575
 576     a->l0.me16x16.cost = INT_MAX;
 577     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 578     {
 579         const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
 580         i_fullpel_thresh -= i_ref_cost;
 581
 582         /* search with ref */
 583         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 584         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 585         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 586         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 587
 588         m.cost += i_ref_cost;
 589         i_fullpel_thresh += i_ref_cost;
 590
 591         if( m.cost < a->l0.me16x16.cost )
 592         {
 593             a->l0.i_ref = i_ref;
 594             a->l0.me16x16 = m;
 595         }
 596
 597         /* save mv for predicting neighbors */
 598         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 599         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 600     }
 601
 602     /* subtract ref cost, so we don't have to add it for the other P types */
 603     a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
 604
 605     /* Set global ref, needed for all others modes */
 606     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
 607 }
 608
 609 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 610 {
 611     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 612     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 613     int mvc[5][2], i_mvc;
 614     int i;
 615
 616     /* XXX Needed for x264_mb_predict_mv */
 617     h->mb.i_partition = D_8x8;
 618
 619     i_mvc = 1;
 620     mvc[0][0] = a->l0.me16x16.mv[0];
 621     mvc[0][1] = a->l0.me16x16.mv[1];
 622
 623     for( i = 0; i < 4; i++ )
 624     {
 625         x264_me_t *m = &a->l0.me8x8[i];
 626         const int x8 = i%2;
 627         const int y8 = i/2;
 628
 629         m->i_pixel = PIXEL_8x8;
 630         m->p_cost_mv = a->p_cost_mv;
 631
 632         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
 633         LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
 634
 635         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 636         x264_me_search( h, m, mvc, i_mvc );
 637
 638         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
 639
 640         mvc[i_mvc][0] = m->mv[0];
 641         mvc[i_mvc][1] = m->mv[1];
 642         i_mvc++;
 643
 644         /* mb type cost */
 645         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 646     }
 647
 648     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
 649                    a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
 650 }
 651
 652 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 653 {
 654     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 655     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 656     int mvc[2][2];
 657     int i;
 658
 659     /* XXX Needed for x264_mb_predict_mv */
 660     h->mb.i_partition = D_16x8;
 661
 662     for( i = 0; i < 2; i++ )
 663     {
 664         x264_me_t *m = &a->l0.me16x8[i];
 665
 666         m->i_pixel = PIXEL_16x8;
 667         m->p_cost_mv = a->p_cost_mv;
 668
 669         LOAD_FENC( m, p_fenc, 0, 8*i );
 670         LOAD_HPELS( m, p_fref, 0, 8*i );
 671
 672         mvc[0][0] = a->l0.me8x8[2*i].mv[0];
 673         mvc[0][1] = a->l0.me8x8[2*i].mv[1];
 674         mvc[1][0] = a->l0.me8x8[2*i+1].mv[0];
 675         mvc[1][1] = a->l0.me8x8[2*i+1].mv[1];
 676
 677         x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
 678         x264_me_search( h, m, mvc, 2 );
 679
 680         x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] );
 681     }
 682
 683     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
 684 }
 685
 686 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 687 {
 688     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 689     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 690     int mvc[2][2];
 691     int i;
 692
 693     /* XXX Needed for x264_mb_predict_mv */
 694     h->mb.i_partition = D_8x16;
 695
 696     for( i = 0; i < 2; i++ )
 697     {
 698         x264_me_t *m = &a->l0.me8x16[i];
 699
 700         m->i_pixel = PIXEL_8x16;
 701         m->p_cost_mv = a->p_cost_mv;
 702
 703         LOAD_FENC( m, p_fenc, 8*i, 0 );
 704         LOAD_HPELS( m, p_fref, 8*i, 0 );
 705
 706         mvc[0][0] = a->l0.me8x8[i].mv[0];
 707         mvc[0][1] = a->l0.me8x8[i].mv[1];
 708         mvc[1][0] = a->l0.me8x8[i+2].mv[0];
 709         mvc[1][1] = a->l0.me8x8[i+2].mv[1];
 710
 711         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
 712         x264_me_search( h, m, mvc, 2 );
 713
 714         x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] );
 715     }
 716
 717     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
 718 }
 719
 720 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
 721 {
 722     uint8_t pix1[8*8], pix2[8*8];
 723     const int i_stride = h->mb.pic.i_stride[1];
 724     const int off = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
 725
 726 #define CHROMA4x4MC( width, height, me, x, y ) \
 727     h->mc.mc_chroma( &p_fref[4][off+x+y*i_stride], i_stride, &pix1[x+y*8], 8, (me).mv[0], (me).mv[1], width, height ); \
 728     h->mc.mc_chroma( &p_fref[5][off+x+y*i_stride], i_stride, &pix2[x+y*8], 8, (me).mv[0], (me).mv[1], width, height );
 729
 730     if( pixel == PIXEL_4x4 )
 731     {
 732         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
 733         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 0,2 );
 734         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 2,0 );
 735         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
 736     }
 737     else if( pixel == PIXEL_8x4 )
 738     {
 739         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
 740         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
 741     }
 742     else
 743     {
 744         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
 745         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
 746     }
 747
 748     return h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
 749          + h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
 750 }
 751
 752 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 753 {
 754     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 755     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 756
 757     int i4x4;
 758
 759     /* XXX Needed for x264_mb_predict_mv */
 760     h->mb.i_partition = D_8x8;
 761
 762     for( i4x4 = 0; i4x4 < 4; i4x4++ )
 763     {
 764         const int idx = 4*i8x8 + i4x4;
 765         const int x4 = block_idx_x[idx];
 766         const int y4 = block_idx_y[idx];
 767         const int i_mvc = (i4x4 == 0);
 768
 769         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
 770
 771         m->i_pixel = PIXEL_4x4;
 772         m->p_cost_mv = a->p_cost_mv;
 773
 774         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 775         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 776
 777         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
 778         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
 779
 780         x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
 781     }
 782
 783     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
 784                          a->l0.me4x4[i8x8][1].cost +
 785                          a->l0.me4x4[i8x8][2].cost +
 786                          a->l0.me4x4[i8x8][3].cost +
 787                          a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
 788     if( h->mb.b_chroma_me )
 789         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
 790 }
 791
 792 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 793 {
 794     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 795     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 796
 797     int i8x4;
 798
 799     /* XXX Needed for x264_mb_predict_mv */
 800     h->mb.i_partition = D_8x8;
 801
 802     for( i8x4 = 0; i8x4 < 2; i8x4++ )
 803     {
 804         const int idx = 4*i8x8 + 2*i8x4;
 805         const int x4 = block_idx_x[idx];
 806         const int y4 = block_idx_y[idx];
 807         const int i_mvc = (i8x4 == 0);
 808
 809         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
 810
 811         m->i_pixel = PIXEL_8x4;
 812         m->p_cost_mv = a->p_cost_mv;
 813
 814         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 815         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 816
 817         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
 818         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 819
 820         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
 821     }
 822
 823     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
 824                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
 825     if( h->mb.b_chroma_me )
 826         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
 827 }
 828
 829 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 830 {
 831     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
 832     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 833
 834     int i4x8;
 835
 836     /* XXX Needed for x264_mb_predict_mv */
 837     h->mb.i_partition = D_8x8;
 838
 839     for( i4x8 = 0; i4x8 < 2; i4x8++ )
 840     {
 841         const int idx = 4*i8x8 + i4x8;
 842         const int x4 = block_idx_x[idx];
 843         const int y4 = block_idx_y[idx];
 844         const int i_mvc = (i4x8 == 0);
 845
 846         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
 847
 848         m->i_pixel = PIXEL_4x8;
 849         m->p_cost_mv = a->p_cost_mv;
 850
 851         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
 852         LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
 853
 854         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
 855         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 856
 857         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
 858     }
 859
 860     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
 861                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
 862     if( h->mb.b_chroma_me )
 863         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
 864 }
 865
 866 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 867 {
 868     /* Assumes that fdec still contains the results of
 869      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
 870
 871     uint8_t **p_fenc = h->mb.pic.p_fenc;
 872     uint8_t **p_fdec = h->mb.pic.p_fdec;
 873     int i_stride= h->mb.pic.i_stride[0];
 874     int i;
 875
 876     a->i_cost16x16direct = 0;
 877     for( i = 0; i < 4; i++ )
 878     {
 879         const int x8 = i%2;
 880         const int y8 = i/2;
 881         const int off = 8 * x8 + 8 * i_stride * y8;
 882         a->i_cost16x16direct +=
 883         a->i_cost8x8direct[i] =
 884             h->pixf.satd[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
 885
 886         /* mb type cost */
 887         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
 888     }
 889
 890     a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
 891 }
 892
 893 #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
 894     { \
 895         if( h->param.analyse.b_weighted_bipred ) \
 896             h->pixf.avg_weight[size]( pix1, stride1, src2, stride2, \
 897                     h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
 898         else \
 899             h->pixf.avg[size]( pix1, stride1, src2, stride2 ); \
 900     }
 901
 902 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 903 {
 904     uint8_t pix1[16*16], pix2[16*16];
 905     uint8_t *src2;
 906     int stride2 = 16;
 907     int src2_ref, pix1_ref;
 908
 909     x264_me_t m;
 910     int i_ref;
 911     int mvc[5][2], i_mvc;
 912     int i_fullpel_thresh = INT_MAX;
 913     int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
 914
 915     /* 16x16 Search on all ref frame */
 916     m.i_pixel = PIXEL_16x16;
 917     m.p_cost_mv = a->p_cost_mv;
 918     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 919
 920     /* ME for List 0 */
 921     a->l0.me16x16.cost = INT_MAX;
 922     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
 923     {
 924         /* search with ref */
 925         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
 926         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
 927         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
 928         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 929
 930         /* add ref cost */
 931         m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
 932
 933         if( m.cost < a->l0.me16x16.cost )
 934         {
 935             a->l0.i_ref = i_ref;
 936             a->l0.me16x16 = m;
 937         }
 938
 939         /* save mv for predicting neighbors */
 940         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 941         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 942     }
 943     /* subtract ref cost, so we don't have to add it for the other MB types */
 944     a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
 945
 946     /* ME for list 1 */
 947     i_fullpel_thresh = INT_MAX;
 948     p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL;
 949     a->l1.me16x16.cost = INT_MAX;
 950     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
 951     {
 952         /* search with ref */
 953         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
 954         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
 955         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
 956         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 957
 958         /* add ref cost */
 959         m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
 960
 961         if( m.cost < a->l1.me16x16.cost )
 962         {
 963             a->l1.i_ref = i_ref;
 964             a->l1.me16x16 = m;
 965         }
 966
 967         /* save mv for predicting neighbors */
 968         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
 969         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
 970     }
 971     /* subtract ref cost, so we don't have to add it for the other MB types */
 972     a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
 973
 974     /* Set global ref, needed for other modes? */
 975     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
 976     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
 977
 978     /* get cost of BI mode */
 979     if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
 980     {
 981         /* l0 reference is halfpel, so get_ref on it will make it faster */
 982         src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
 983                         pix2, &stride2,
 984                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
 985                         16, 16 );
 986         h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
 987                         pix1, 16,
 988                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
 989                         16, 16 );
 990         src2_ref = a->l0.i_ref;
 991         pix1_ref = a->l1.i_ref;
 992     }
 993     else
 994     {
 995         /* if l0 was qpel, we'll use get_ref on l1 instead */
 996         h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
 997                         pix1, 16,
 998                         a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
 999                         16, 16 );
1000         src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1001                         pix2, &stride2,
1002                         a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1003                         16, 16 );
1004         src2_ref = a->l1.i_ref;
1005         pix1_ref = a->l0.i_ref;
1006     }
1007
1008     if( h->param.analyse.b_weighted_bipred )
1009         h->pixf.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
1010                 h->mb.bipred_weight[pix1_ref][src2_ref] );
1011     else
1012         h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
1013
1014     a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
1015                      + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
1016                                      + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
1017                      + a->l0.me16x16.cost_mv
1018                      + a->l1.me16x16.cost_mv;
1019
1020     /* mb type cost */
1021     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1022     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1023     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1024 }
1025
/* Write the motion data of one B partition into the macroblock cache, for
 * both lists.
 *
 * For each list, x264_mb_partition_listX_table[list][part] decides whether the
 * partition uses that list:
 *   - used:   cache the list's best ref (a->lX.i_ref) and the MV from me0/me1
 *   - unused: cache ref -1 and a zero MV, plus a zero MVD when b_mvd is set
 *
 * x,y,dx,dy are forwarded unchanged to the x264_macroblock_cache_* helpers.
 * Expects `h`, `a` and `b_mvd` to be in scope at the expansion site.
 * Wrapped in do/while(0) so the two if/else statements expand as a single
 * statement; must be followed by a semicolon. */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    do { \
        if( x264_mb_partition_listX_table[0][part] ) \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, (me0).mv[0], (me0).mv[1] ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
        } \
        if( x264_mb_partition_listX_table[1][part] ) \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, (me1).mv[0], (me1).mv[1] ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
        } \
    } while( 0 )
1051
1052 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1053 {
1054     int x = (i%2)*2;
1055     int y = (i/2)*2;
1056     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1057     {
1058         x264_mb_load_mv_direct8x8( h, i );
1059         if( b_mvd )
1060         {
1061             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0, 0 );
1062             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0, 0 );
1063             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1064         }
1065     }
1066     else
1067     {
1068         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1069     }
1070 }
1071 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1072 {
1073     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1074 }
1075 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1076 {
1077     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1078 }
1079 #undef CACHE_MV_BI
1080
1081 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1082 {
1083     uint8_t **p_fref[2] =
1084         { h->mb.pic.p_fref[0][a->l0.i_ref],
1085           h->mb.pic.p_fref[1][a->l1.i_ref] };
1086     uint8_t pix[2][8*8];
1087     int i, l;
1088
1089     /* XXX Needed for x264_mb_predict_mv */
1090     h->mb.i_partition = D_8x8;
1091
1092     a->i_cost8x8bi = 0;
1093
1094     for( i = 0; i < 4; i++ )
1095     {
1096         const int x8 = i%2;
1097         const int y8 = i/2;
1098         int i_part_cost;
1099         int i_part_cost_bi = 0;
1100
1101         for( l = 0; l < 2; l++ )
1102         {
1103             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1104             x264_me_t *m = &lX->me8x8[i];
1105
1106             m->i_pixel = PIXEL_8x8;
1107             m->p_cost_mv = a->p_cost_mv;
1108
1109             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1110             LOAD_HPELS( m, p_fref[l], 8*x8, 8*y8 );
1111
1112             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1113             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1114
1115             x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
1116
1117             /* BI mode */
1118             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1119                             m->mv[0], m->mv[1], 8, 8 );
1120             i_part_cost_bi += m->cost_mv;
1121             /* FIXME: ref cost */
1122         }
1123
1124         WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
1125         i_part_cost_bi += h->pixf.satd[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
1126                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1127         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1128         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1129
1130         i_part_cost = a->l0.me8x8[i].cost;
1131         h->mb.i_sub_partition[i] = D_L0_8x8;
1132         if( a->l1.me8x8[i].cost < i_part_cost )
1133         {
1134             i_part_cost = a->l1.me8x8[i].cost;
1135             h->mb.i_sub_partition[i] = D_L1_8x8;
1136         }
1137         if( i_part_cost_bi < i_part_cost )
1138         {
1139             i_part_cost = i_part_cost_bi;
1140             h->mb.i_sub_partition[i] = D_BI_8x8;
1141         }
1142         if( a->i_cost8x8direct[i] < i_part_cost )
1143         {
1144             i_part_cost = a->i_cost8x8direct[i];
1145             h->mb.i_sub_partition[i] = D_DIRECT_8x8;
1146         }
1147         a->i_cost8x8bi += i_part_cost;
1148
1149         /* XXX Needed for x264_mb_predict_mv */
1150         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1151     }
1152
1153     /* mb type cost */
1154     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1155 }
1156
1157 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1158 {
1159     uint8_t **p_fref[2] =
1160         { h->mb.pic.p_fref[0][a->l0.i_ref],
1161           h->mb.pic.p_fref[1][a->l1.i_ref] };
1162     uint8_t pix[2][16*8];
1163     int mvc[2][2];
1164     int i, l;
1165
1166     h->mb.i_partition = D_16x8;
1167     a->i_cost16x8bi = 0;
1168
1169     for( i = 0; i < 2; i++ )
1170     {
1171         int i_part_cost;
1172         int i_part_cost_bi = 0;
1173
1174         /* TODO: check only the list(s) that were used in b8x8? */
1175         for( l = 0; l < 2; l++ )
1176         {
1177             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1178             x264_me_t *m = &lX->me16x8[i];
1179
1180             m->i_pixel = PIXEL_16x8;
1181             m->p_cost_mv = a->p_cost_mv;
1182
1183             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1184             LOAD_HPELS( m, p_fref[l], 0, 8*i );
1185
1186             mvc[0][0] = lX->me8x8[2*i].mv[0];
1187             mvc[0][1] = lX->me8x8[2*i].mv[1];
1188             mvc[1][0] = lX->me8x8[2*i+1].mv[0];
1189             mvc[1][1] = lX->me8x8[2*i+1].mv[1];
1190
1191             x264_mb_predict_mv( h, 0, 8*i, 2, m->mvp );
1192             x264_me_search( h, m, mvc, 2 );
1193
1194             /* BI mode */
1195             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
1196                             m->mv[0], m->mv[1], 16, 8 );
1197             /* FIXME: ref cost */
1198             i_part_cost_bi += m->cost_mv;
1199         }
1200
1201         WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
1202         i_part_cost_bi += h->pixf.satd[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
1203
1204         i_part_cost = a->l0.me16x8[i].cost;
1205         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1206         if( a->l1.me16x8[i].cost < i_part_cost )
1207         {
1208             i_part_cost = a->l1.me16x8[i].cost;
1209             a->i_mb_partition16x8[i] = D_L1_8x8;
1210         }
1211         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1212         {
1213             i_part_cost = i_part_cost_bi;
1214             a->i_mb_partition16x8[i] = D_BI_8x8;
1215         }
1216         a->i_cost16x8bi += i_part_cost;
1217
1218         if( i == 0 )
1219             x264_mb_cache_mv_b16x8( h, a, i, 0 );
1220     }
1221
1222     /* mb type cost */
1223     a->i_mb_type16x8 = B_L0_L0
1224         + (a->i_mb_partition16x8[0]>>2) * 3
1225         + (a->i_mb_partition16x8[1]>>2);
1226     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1227 }
1228 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1229 {
1230     uint8_t **p_fref[2] =
1231         { h->mb.pic.p_fref[0][a->l0.i_ref],
1232           h->mb.pic.p_fref[1][a->l1.i_ref] };
1233     uint8_t pix[2][8*16];
1234     int mvc[2][2];
1235     int i, l;
1236
1237     h->mb.i_partition = D_8x16;
1238     a->i_cost8x16bi = 0;
1239
1240     for( i = 0; i < 2; i++ )
1241     {
1242         int i_part_cost;
1243         int i_part_cost_bi = 0;
1244
1245         for( l = 0; l < 2; l++ )
1246         {
1247             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1248             x264_me_t *m = &lX->me8x16[i];
1249
1250             m->i_pixel = PIXEL_8x16;
1251             m->p_cost_mv = a->p_cost_mv;
1252
1253             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1254             LOAD_HPELS( m, p_fref[l], 8*i, 0 );
1255
1256             mvc[0][0] = lX->me8x8[i].mv[0];
1257             mvc[0][1] = lX->me8x8[i].mv[1];
1258             mvc[1][0] = lX->me8x8[i+2].mv[0];
1259             mvc[1][1] = lX->me8x8[i+2].mv[1];
1260
1261             x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1262             x264_me_search( h, m, mvc, 2 );
1263
1264             /* BI mode */
1265             h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1266                             m->mv[0], m->mv[1], 8, 16 );
1267             /* FIXME: ref cost */
1268             i_part_cost_bi += m->cost_mv;
1269         }
1270
1271         WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
1272         i_part_cost_bi += h->pixf.satd[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
1273
1274         i_part_cost = a->l0.me8x16[i].cost;
1275         a->i_mb_partition8x16[i] = D_L0_8x8;
1276         if( a->l1.me8x16[i].cost < i_part_cost )
1277         {
1278             i_part_cost = a->l1.me8x16[i].cost;
1279             a->i_mb_partition8x16[i] = D_L1_8x8;
1280         }
1281         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1282         {
1283             i_part_cost = i_part_cost_bi;
1284             a->i_mb_partition8x16[i] = D_BI_8x8;
1285         }
1286         a->i_cost8x16bi += i_part_cost;
1287
1288         if( i == 0 )
1289             x264_mb_cache_mv_b8x16( h, a, i, 0 );
1290     }
1291
1292     /* mb type cost */
1293     a->i_mb_type8x16 = B_L0_L0
1294         + (a->i_mb_partition8x16[0]>>2) * 3
1295         + (a->i_mb_partition8x16[1]>>2);
1296     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
1297 }
1298
1299 /*****************************************************************************
1300  * x264_macroblock_analyse:
1301  *****************************************************************************/
1302 void x264_macroblock_analyse( x264_t *h )
1303 {
1304     x264_mb_analysis_t analysis;
1305     int i;
1306
1307     h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp(h);
1308
1309     /* prevent QP from varying too fast. FIXME what's a sane limit? */
1310     h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->mb.qp[h->mb.i_mb_xy],
1311                                           h->mb.i_last_qp - 12, h->mb.i_last_qp + 12 );
1312
1313     /* init analysis */
1314     x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
1315
1316     /*--------------------------- Do the analysis ---------------------------*/
1317     if( h->sh.i_type == SLICE_TYPE_I )
1318     {
1319         x264_mb_analyse_intra( h, &analysis, COST_MAX );
1320
1321         if( analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
1322             h->mb.i_type = I_4x4;
1323         else
1324             h->mb.i_type = I_16x16;
1325     }
1326     else if( h->sh.i_type == SLICE_TYPE_P )
1327     {
1328         const unsigned int i_neighbour = h->mb.i_neighbour;
1329
1330         int b_skip = 0;
1331         int i_cost;
1332         int i_intra_cost, i_intra_type;
1333
1334         /* Fast P_SKIP detection */
1335         if( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
1336             ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
1337             ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
1338             ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) )
1339         {
1340             b_skip = x264_macroblock_probe_pskip( h );
1341         }
1342
1343         if( b_skip )
1344         {
1345             h->mb.i_type = P_SKIP;
1346             h->mb.i_partition = D_16x16;
1347         }
1348         else
1349         {
1350             const unsigned int flags = h->param.analyse.inter;
1351             int i_type;
1352             int i_partition;
1353
1354             x264_mb_analyse_load_costs( h, &analysis );
1355
1356             x264_mb_analyse_inter_p16x16( h, &analysis );
1357             if( flags & X264_ANALYSE_PSUB16x16 )
1358                 x264_mb_analyse_inter_p8x8( h, &analysis );
1359
1360             /* Select best inter mode */
1361             i_type = P_L0;
1362             i_partition = D_16x16;
1363             i_cost = analysis.l0.me16x16.cost;
1364
1365             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1366                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
1367             {
1368                 int i;
1369
1370                 i_type = P_8x8;
1371                 i_partition = D_8x8;
1372                 h->mb.i_sub_partition[0] = D_L0_8x8;
1373                 h->mb.i_sub_partition[1] = D_L0_8x8;
1374                 h->mb.i_sub_partition[2] = D_L0_8x8;
1375                 h->mb.i_sub_partition[3] = D_L0_8x8;
1376
1377                 i_cost = analysis.l0.i_cost8x8;
1378
1379                 /* Do sub 8x8 */
1380                 if( flags & X264_ANALYSE_PSUB8x8 )
1381                 {
1382                     for( i = 0; i < 4; i++ )
1383                     {
1384                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
1385                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
1386                         {
1387                             int i_cost8x8;
1388
1389                             h->mb.i_sub_partition[i] = D_L0_4x4;
1390                             i_cost8x8 = analysis.l0.i_cost4x4[i];
1391
1392                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
1393                             if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
1394                             {
1395                                 h->mb.i_sub_partition[i] = D_L0_8x4;
1396                                 i_cost8x8 = analysis.l0.i_cost8x4[i];
1397                             }
1398
1399                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
1400                             if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
1401                             {
1402                                 h->mb.i_sub_partition[i] = D_L0_4x8;
1403                                 i_cost8x8 = analysis.l0.i_cost4x8[i];
1404                             }
1405
1406                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
1407                         }
1408                     }
1409                 }
1410
1411                 /* Now do sub 16x8/8x16 */
1412                 x264_mb_analyse_inter_p16x8( h, &analysis );
1413                 if( analysis.l0.i_cost16x8 < i_cost )
1414                 {
1415                     i_type = P_L0;
1416                     i_partition = D_16x8;
1417                     i_cost = analysis.l0.i_cost16x8;
1418                 }
1419
1420                 x264_mb_analyse_inter_p8x16( h, &analysis );
1421                 if( analysis.l0.i_cost8x16 < i_cost )
1422                 {
1423                     i_type = P_L0;
1424                     i_partition = D_8x16;
1425                     i_cost = analysis.l0.i_cost8x16;
1426                 }
1427             }
1428
1429             h->mb.i_type = i_type;
1430             h->mb.i_partition = i_partition;
1431
1432             /* refine qpel */
1433             if( h->mb.i_partition == D_16x16 )
1434             {
1435                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1436                 i_cost = analysis.l0.me16x16.cost;
1437             }
1438             else if( h->mb.i_partition == D_16x8 )
1439             {
1440                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
1441                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
1442                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
1443             }
1444             else if( h->mb.i_partition == D_8x16 )
1445             {
1446                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
1447                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
1448                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
1449             }
1450             else if( h->mb.i_partition == D_8x8 )
1451             {
1452                 int i8x8;
1453                 i_cost = 0;
1454                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
1455                 {
1456                     switch( h->mb.i_sub_partition[i8x8] )
1457                     {
1458                         case D_L0_8x8:
1459                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
1460                             i_cost += analysis.l0.me8x8[i8x8].cost;
1461                             break;
1462                         case D_L0_8x4:
1463                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
1464                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
1465                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
1466                                       analysis.l0.me8x4[i8x8][1].cost;
1467                             break;
1468                         case D_L0_4x8:
1469                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
1470                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
1471                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
1472                                       analysis.l0.me4x8[i8x8][1].cost;
1473                             break;
1474
1475                         case D_L0_4x4:
1476                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
1477                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
1478                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
1479                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
1480                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
1481                                       analysis.l0.me4x4[i8x8][1].cost +
1482                                       analysis.l0.me4x4[i8x8][2].cost +
1483                                       analysis.l0.me4x4[i8x8][3].cost;
1484                             break;
1485                         default:
1486                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
1487                             break;
1488                     }
1489                 }
1490             }
1491
1492             x264_mb_analyse_intra( h, &analysis, i_cost );
1493             if( h->mb.b_chroma_me &&
1494                 ( analysis.i_sad_i16x16 < i_cost
1495              || ( analysis.i_sad_i4x4 < i_cost )))
1496             {
1497                 x264_mb_analyse_intra_chroma( h, &analysis );
1498                 analysis.i_sad_i16x16 += analysis.i_sad_i8x8;
1499                 analysis.i_sad_i4x4 += analysis.i_sad_i8x8;
1500             }
1501
1502             i_intra_type = I_16x16;
1503             i_intra_cost = analysis.i_sad_i16x16;
1504
1505             if( analysis.i_sad_i4x4 < i_intra_cost )
1506             {
1507                 i_intra_type = I_4x4;
1508                 i_intra_cost = analysis.i_sad_i4x4;
1509             }
1510
1511             if( i_intra_cost < i_cost )
1512             {
1513                 h->mb.i_type = i_intra_type;
1514                 i_cost = i_intra_cost;
1515             }
1516
1517             h->stat.frame.i_intra_cost += i_intra_cost;
1518             h->stat.frame.i_inter_cost += i_cost;
1519         }
1520     }
1521     else if( h->sh.i_type == SLICE_TYPE_B )
1522     {
1523         int b_skip = 0;
1524
1525         analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h );
1526         if( analysis.b_direct_available )
1527         {
1528             h->mb.i_type = B_SKIP;
1529             x264_mb_mc( h );
1530
1531             /* Conditioning the probe on neighboring block types
1532              * doesn't seem to help speed or quality. */
1533             b_skip = x264_macroblock_probe_bskip( h );
1534         }
1535
1536         if( !b_skip )
1537         {
1538             const unsigned int flags = h->param.analyse.inter;
1539             int i_partition;
1540             int i_cost;
1541
1542             x264_mb_analyse_load_costs( h, &analysis );
1543
1544             /* select best inter mode */
1545             /* direct must be first */
1546             if( analysis.b_direct_available )
1547                 x264_mb_analyse_inter_direct( h, &analysis );
1548
1549             x264_mb_analyse_inter_b16x16( h, &analysis );
1550
1551             h->mb.i_type = B_L0_L0;
1552             i_partition = D_16x16;
1553             i_cost = analysis.l0.me16x16.cost;
1554             if( analysis.l1.me16x16.cost < i_cost )
1555             {
1556                 h->mb.i_type = B_L1_L1;
1557                 i_cost = analysis.l1.me16x16.cost;
1558             }
1559             if( analysis.i_cost16x16bi < i_cost )
1560             {
1561                 h->mb.i_type = B_BI_BI;
1562                 i_cost = analysis.i_cost16x16bi;
1563             }
1564             if( analysis.i_cost16x16direct < i_cost )
1565             {
1566                 h->mb.i_type = B_DIRECT;
1567                 i_cost = analysis.i_cost16x16direct;
1568             }
1569
1570             if( flags & X264_ANALYSE_BSUB16x16 )
1571             {
1572                 x264_mb_analyse_inter_b8x8( h, &analysis );
1573                 if( analysis.i_cost8x8bi < i_cost )
1574                 {
1575                     h->mb.i_type = B_8x8;
1576                     i_partition = D_8x8;
1577                     i_cost = analysis.i_cost8x8bi;
1578
1579                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
1580                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
1581                     {
1582                         x264_mb_analyse_inter_b16x8( h, &analysis );
1583                         if( analysis.i_cost16x8bi < i_cost )
1584                         {
1585                             i_partition = D_16x8;
1586                             i_cost = analysis.i_cost16x8bi;
1587                             h->mb.i_type = analysis.i_mb_type16x8;
1588                         }
1589                     }
1590                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
1591                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
1592                     {
1593                         x264_mb_analyse_inter_b8x16( h, &analysis );
1594                         if( analysis.i_cost8x16bi < i_cost )
1595                         {
1596                             i_partition = D_8x16;
1597                             i_cost = analysis.i_cost8x16bi;
1598                             h->mb.i_type = analysis.i_mb_type8x16;
1599                         }
1600                     }
1601                 }
1602             }
1603
1604             h->mb.i_partition = i_partition;
1605
1606             /* refine qpel */
1607             if( i_partition == D_16x16 )
1608             {
1609                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1610                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1611                 if( h->mb.i_type == B_L0_L0 )
1612                 {
1613                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1614                     i_cost = analysis.l0.me16x16.cost
1615                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1616                 }
1617                 else if( h->mb.i_type == B_L1_L1 )
1618                 {
1619                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1620                     i_cost = analysis.l1.me16x16.cost
1621                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1622                 }
1623                 else if( h->mb.i_type == B_BI_BI )
1624                 {
1625                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1626                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1627                 }
1628             }
1629             else if( i_partition == D_16x8 )
1630             {
1631                 for( i=0; i<2; i++ )
1632                 {
1633                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
1634                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
1635                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
1636                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
1637                 }
1638             }
1639             else if( i_partition == D_8x16 )
1640             {
1641                 for( i=0; i<2; i++ )
1642                 {
1643                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
1644                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
1645                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
1646                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
1647                 }
1648             }
1649             else if( i_partition == D_8x8 )
1650             {
1651                 for( i=0; i<4; i++ )
1652                 {
1653                     x264_me_t *m;
1654                     int i_part_cost_old;
1655                     int i_type_cost;
1656                     int i_part_type = h->mb.i_sub_partition[i];
1657                     int b_bidir = (i_part_type == D_BI_8x8);
1658
1659                     if( i_part_type == D_DIRECT_8x8 )
1660                         continue;
1661                     if( x264_mb_partition_listX_table[0][i_part_type] )
1662                     {
1663                         m = &analysis.l0.me8x8[i];
1664                         i_part_cost_old = m->cost;
1665                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1666                         m->cost -= i_type_cost;
1667                         x264_me_refine_qpel( h, m );
1668                         if( !b_bidir )
1669                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1670                     }
1671                     if( x264_mb_partition_listX_table[1][i_part_type] )
1672                     {
1673                         m = &analysis.l1.me8x8[i];
1674                         i_part_cost_old = m->cost;
1675                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1676                         m->cost -= i_type_cost;
1677                         x264_me_refine_qpel( h, m );
1678                         if( !b_bidir )
1679                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1680                     }
1681                     /* TODO: update mvp? */
1682                 }
1683             }
1684
1685             /* best intra mode */
1686             x264_mb_analyse_intra( h, &analysis, i_cost );
1687
1688             if( analysis.i_sad_i16x16 < i_cost )
1689             {
1690                 h->mb.i_type = I_16x16;
1691                 i_cost = analysis.i_sad_i16x16;
1692             }
1693             if( analysis.i_sad_i4x4 < i_cost )
1694             {
1695                 h->mb.i_type = I_4x4;
1696                 i_cost = analysis.i_sad_i4x4;
1697             }
1698         }
1699     }
1700
1701     /*-------------------- Update MB from the analysis ----------------------*/
1702     h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
1703     switch( h->mb.i_type )
1704     {
1705         case I_4x4:
1706             for( i = 0; i < 16; i++ )
1707             {
1708                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
1709                     analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
1710             }
1711
1712             x264_mb_analyse_intra_chroma( h, &analysis );
1713             h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
1714             break;
1715         case I_16x16:
1716             h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
1717
1718             x264_mb_analyse_intra_chroma( h, &analysis );
1719             h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
1720             break;
1721
1722         case P_L0:
1723             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1724             switch( h->mb.i_partition )
1725             {
1726                 case D_16x16:
1727                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1728                     break;
1729
1730                 case D_16x8:
1731                     x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
1732                     x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
1733                     break;
1734
1735                 case D_8x16:
1736                     x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
1737                     x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
1738                     break;
1739
1740                 default:
1741                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
1742                     break;
1743             }
1744             break;
1745
1746         case P_8x8:
1747             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1748             for( i = 0; i < 4; i++ )
1749             {
1750                 const int x = 2*(i%2);
1751                 const int y = 2*(i/2);
1752
1753                 switch( h->mb.i_sub_partition[i] )
1754                 {
1755                     case D_L0_8x8:
1756                         x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
1757                         break;
1758                     case D_L0_8x4:
1759                         x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
1760                         x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
1761                         break;
1762                     case D_L0_4x8:
1763                         x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
1764                         x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
1765                         break;
1766                     case D_L0_4x4:
1767                         x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
1768                         x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
1769                         x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
1770                         x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
1771                         break;
1772                     default:
1773                         x264_log( h, X264_LOG_ERROR, "internal error\n" );
1774                         break;
1775                 }
1776             }
1777             break;
1778
1779         case P_SKIP:
1780         {
1781             int mvp[2];
1782             x264_mb_predict_mv_pskip( h, mvp );
1783             /* P_SKIP: treat the MB as one 16x16 partition, cache ref 0 and the predicted skip MV (mvp) */
1784             h->mb.i_partition = D_16x16;
1785             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
1786             x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
1787             break;
1788         }
1789
1790         case B_SKIP:
1791             /* nothing has changed since x264_macroblock_probe_bskip */
1792             break;
1793         case B_DIRECT:
1794             x264_mb_load_mv_direct8x8( h, 0 );
1795             x264_mb_load_mv_direct8x8( h, 1 );
1796             x264_mb_load_mv_direct8x8( h, 2 );
1797             x264_mb_load_mv_direct8x8( h, 3 );
1798             break;
1799
1800         case B_8x8:
1801             /* optimize: cache might not need to be rewritten */
1802             for( i = 0; i < 4; i++ )
1803                 x264_mb_cache_mv_b8x8( h, &analysis, i, 1 );
1804             break;
1805
1806         default: /* the rest of the B types */
1807             switch( h->mb.i_partition )
1808             {
1809             case D_16x16:
1810                 switch( h->mb.i_type )
1811                 {
1812                 case B_L0_L0:
1813                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1814                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1815
1816                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
1817                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
1818                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1,  0, 0 );
1819                     break;
1820                 case B_L1_L1:
1821                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
1822                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
1823                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
1824
1825                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1826                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
1827                     break;
1828                 case B_BI_BI:
1829                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1830                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1831
1832                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1833                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
1834                     break;
1835                 }
1836                 break;
1837             case D_16x8:
1838                 x264_mb_cache_mv_b16x8( h, &analysis, 0, 1 );
1839                 x264_mb_cache_mv_b16x8( h, &analysis, 1, 1 );
1840                 break;
1841             case D_8x16:
1842                 x264_mb_cache_mv_b8x16( h, &analysis, 0, 1 );
1843                 x264_mb_cache_mv_b8x16( h, &analysis, 1, 1 );
1844                 break;
1845             default:
1846                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
1847                 break;
1848             }
1849     }
1850 }
1851
1852 #include "slicetype_decision.c"
1853