git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
  37 typedef struct
  38 {
  39     /* 16x16 */
  40     int       i_rd16x16;
  41     x264_me_t me16x16;
  42     x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
  43
  44     /* 8x8 */
  45     int       i_cost8x8;
  46     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
  47     ALIGNED_4( int16_t mvc[32][5][2] );
  48     x264_me_t me8x8[4];
  49
  50     /* Sub 4x4 */
  51     int       i_cost4x4[4]; /* cost per 8x8 partition */
  52     x264_me_t me4x4[4][4];
  53
  54     /* Sub 8x4 */
  55     int       i_cost8x4[4]; /* cost per 8x8 partition */
  56     x264_me_t me8x4[4][2];
  57
  58     /* Sub 4x8 */
  59     int       i_cost4x8[4]; /* cost per 8x8 partition */
  60     x264_me_t me4x8[4][2];
  61
  62     /* 16x8 */
  63     int       i_cost16x8;
  64     x264_me_t me16x8[2];
  65
  66     /* 8x16 */
  67     int       i_cost8x16;
  68     x264_me_t me8x16[2];
  69
  70 } x264_mb_analysis_list_t;
  71
  72 typedef struct
  73 {
  74     /* conduct the analysis using this lamda and QP */
  75     int i_lambda;
  76     int i_lambda2;
  77     int i_qp;
  78     uint16_t *p_cost_mv;
  79     uint16_t *p_cost_ref[2];
  80     int i_mbrd;
  81
  82
  83     /* I: Intra part */
  84     /* Take some shortcuts in intra search if intra is deemed unlikely */
  85     int b_fast_intra;
  86     int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
  87     int b_try_pskip;
  88
  89     /* Luma part */
  90     int i_satd_i16x16;
  91     int i_satd_i16x16_dir[7];
  92     int i_predict16x16;
  93
  94     int i_satd_i8x8;
  95     int i_cbp_i8x8_luma;
  96     int i_satd_i8x8_dir[12][4];
  97     int i_predict8x8[4];
  98
  99     int i_satd_i4x4;
 100     int i_predict4x4[16];
 101
 102     int i_satd_pcm;
 103
 104     /* Chroma part */
 105     int i_satd_i8x8chroma;
 106     int i_satd_i8x8chroma_dir[7];
 107     int i_predict8x8chroma;
 108
 109     /* II: Inter part P/B frame */
 110     x264_mb_analysis_list_t l0;
 111     x264_mb_analysis_list_t l1;
 112
 113     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
 114     int i_cost16x16direct;
 115     int i_cost8x8bi;
 116     int i_cost8x8direct[4];
 117     int i_cost16x8bi;
 118     int i_cost8x16bi;
 119     int i_rd16x16bi;
 120     int i_rd16x16direct;
 121     int i_rd16x8bi;
 122     int i_rd8x16bi;
 123     int i_rd8x8bi;
 124
 125     int i_mb_partition16x8[2]; /* mb_partition_e */
 126     int i_mb_partition8x16[2];
 127     int i_mb_type16x8; /* mb_class_e */
 128     int i_mb_type8x16;
 129
 130     int b_direct_available;
 131
 132 } x264_mb_analysis_t;
 133
 134 /* lambda = pow(2,qp/6-2) */
 135 const uint8_t x264_lambda_tab[52] = {
 136    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 137    1, 1, 1, 1,              /*  8-11 */
 138    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 139    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 140    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 141   16,18,20,23,25,29,32,36,  /* 36-43 */
 142   40,45,51,57,64,72,81,91   /* 44-51 */
 143 };
 144
 145 /* lambda2 = pow(lambda,2) * .9 * 256 */
 146 const int x264_lambda2_tab[52] = {
 147     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
 148     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
 149    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
 150   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 151  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
 152 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
 153 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 154 };
 155
 156 const uint8_t x264_exp2_lut[64] = {
 157       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
 158      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
 159     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
 160     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 161 };
 162
 163 const float x264_log2_lut[128] = {
 164     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
 165     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
 166     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
 167     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
 168     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
 169     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
 170     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
 171     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
 172     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
 173     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
 174     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
 175     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
 176     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
 177     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
 178     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
 179     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 180 };
 181
 182 /* Avoid an int/float conversion. */
 183 const float x264_log2_lz_lut[32] = {
 184     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 185 };
 186
 187 // should the intra and inter lambdas be different?
 188 // I'm just matching the behaviour of deadzone quant.
 189 static const int x264_trellis_lambda2_tab[2][52] = {
 190     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 191     {    46,      58,      73,      92,     117,     147,
 192         185,     233,     294,     370,     466,     587,
 193         740,     932,    1174,    1480,    1864,    2349,
 194        2959,    3728,    4697,    5918,    7457,    9395,
 195       11837,   14914,   18790,   23674,   29828,   37581,
 196       47349,   59656,   75163,   94699,  119313,  150326,
 197      189399,  238627,  300652,  378798,  477255,  601304,
 198      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 199     3030384, 3818045, 4810435, 6060769 },
 200     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 201     {    27,      34,      43,      54,      68,      86,
 202         108,     136,     172,     216,     273,     343,
 203         433,     545,     687,     865,    1090,    1374,
 204        1731,    2180,    2747,    3461,    4361,    5494,
 205        6922,    8721,   10988,   13844,   17442,   21976,
 206       27688,   34885,   43953,   55377,   69771,   87906,
 207      110755,  139543,  175813,  221511,  279087,  351627,
 208      443023,  558174,  703255,  886046, 1116348, 1406511,
 209     1772093, 2232697, 2813022, 3544186 }
 210 };
 211
 212 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 213        16,    20,    25,    32,    40,    50,
 214        64,    80,   101,   128,   161,   203,
 215       256,   322,   406,   512,   645,   812,
 216      1024,  1290,  1625,  2048,  2580,  3250,
 217      4096,  5160,  6501,  8192, 10321, 13003,
 218     16384, 20642, 26007, 32768, 41285, 52015,
 219     65535
 220 };
 221
 222 /* TODO: calculate CABAC costs */
 223 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
 224     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 225 };
 226 static const uint8_t i_mb_b16x8_cost_table[17] = {
 227     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 228 };
 229 static const uint8_t i_sub_mb_b_cost_table[13] = {
 230     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 231 };
 232 static const uint8_t i_sub_mb_p_cost_table[4] = {
 233     5, 3, 3, 1
 234 };
 235
 236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 237
 238 static uint16_t x264_cost_ref[92][3][33];
 239 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 240
 241 int x264_analyse_init_costs( x264_t *h, int qp )
 242 {
 243     int i, j;
 244     int lambda = x264_lambda_tab[qp];
 245     if( h->cost_mv[lambda] )
 246         return 0;
 247     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 248     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 249     h->cost_mv[lambda] += 2*4*2048;
 250     for( i = 0; i <= 2*4*2048; i++ )
 251     {
 252         h->cost_mv[lambda][-i] =
 253         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 254     }
 255     x264_pthread_mutex_lock( &cost_ref_mutex );
 256     for( i = 0; i < 3; i++ )
 257         for( j = 0; j < 33; j++ )
 258             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 259     x264_pthread_mutex_unlock( &cost_ref_mutex );
 260     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 261     {
 262         for( j=0; j<4; j++ )
 263         {
 264             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 265             h->cost_mv_fpel[lambda][j] += 2*2048;
 266             for( i = -2*2048; i < 2*2048; i++ )
 267                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 268         }
 269     }
 270     return 0;
 271 fail:
 272     return -1;
 273 }
 274
 275 void x264_analyse_free_costs( x264_t *h )
 276 {
 277     int i, j;
 278     for( i = 0; i < 92; i++ )
 279     {
 280         if( h->cost_mv[i] )
 281             x264_free( h->cost_mv[i] - 2*4*2048 );
 282         if( h->cost_mv_fpel[i][0] )
 283             for( j = 0; j < 4; j++ )
 284                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 285     }
 286 }
 287
 288 void x264_analyse_weight_frame( x264_t *h, int end )
 289 {
 290     int j;
 291     for( j=0; j<h->i_ref0; j++ )
 292     {
 293         if( h->sh.weight[j][0].weightfn )
 294         {
 295             x264_frame_t *frame = h->fref0[j];
 296             int width = frame->i_width[0] + 2*PADH;
 297             int i_padv = PADV << h->param.b_interlaced;
 298             int offset, height;
 299             uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 300             int k;
 301             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 302             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 303             h->fenc->i_lines_weighted += height;
 304             if( height )
 305             {
 306                 for( k = j; k < h->i_ref0; k++ )
 307                     if( h->sh.weight[k][0].weightfn )
 308                     {
 309                         uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 310                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 311                                                  src + offset, frame->i_stride[0],
 312                                                  width, height, &h->sh.weight[k][0] );
 313                     }
 314             }
 315             break;
 316         }
 317     }
 318 }
 319
 320 /* initialize an array of lambda*nbits for all possible mvs */
 321 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 322 {
 323     a->p_cost_mv = h->cost_mv[a->i_lambda];
 324     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 325     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 326 }
 327
 328 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 329 {
 330     /* conduct the analysis using this lamda and QP */
 331     a->i_qp = h->mb.i_qp = i_qp;
 332     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 333
 334     a->i_lambda = x264_lambda_tab[i_qp];
 335     a->i_lambda2 = x264_lambda2_tab[i_qp];
 336
 337     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 338     if( h->param.analyse.i_trellis )
 339     {
 340         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 341         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 342         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 343         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 344     }
 345     h->mb.i_psy_rd_lambda = a->i_lambda;
 346     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 347     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 348
 349 }
 350
 351 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 352 {
 353     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
 354
 355     /* mbrd == 1 -> RD mode decision */
 356     /* mbrd == 2 -> RD refinement */
 357     /* mbrd == 3 -> QPRD */
 358     a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
 359
 360     x264_mb_analyse_init_qp( h, a, i_qp );
 361
 362     h->mb.i_me_method = h->param.analyse.i_me_method;
 363     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 364     if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
 365         h->mb.i_subpel_refine--;
 366     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 367                         && h->mb.i_subpel_refine >= 5;
 368     h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
 369                           (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
 370
 371     h->mb.b_transform_8x8 = 0;
 372     h->mb.b_noise_reduction = 0;
 373
 374     /* I: Intra part */
 375     a->i_satd_i16x16 =
 376     a->i_satd_i8x8   =
 377     a->i_satd_i4x4   =
 378     a->i_satd_i8x8chroma = COST_MAX;
 379
 380     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
 381     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
 382
 383     a->b_fast_intra = 0;
 384     h->mb.i_skip_intra =
 385         h->mb.b_lossless ? 0 :
 386         a->i_mbrd ? 2 :
 387         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
 388
 389     /* II: Inter part P/B frame */
 390     if( h->sh.i_type != SLICE_TYPE_I )
 391     {
 392         int i, j;
 393         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
 394         // limit motion search to a slightly smaller range than the theoretical limit,
 395         // since the search may go a few iterations past its given range
 396         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 397
 398         /* Calculate max allowed MV range */
 399 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
 400         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
 401         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
 402         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
 403         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
 404         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
 405         {
 406             int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
 407             int max_mv = max_x - 4*16*h->mb.i_mb_x;
 408             /* If we're left of the refresh bar, don't reference right of it. */
 409             if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
 410                 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
 411         }
 412         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
 413         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
 414         if( h->mb.i_mb_x == 0 )
 415         {
 416             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
 417             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
 418             int thread_mvy_range = i_fmv_range;
 419
 420             if( h->i_thread_frames > 1 )
 421             {
 422                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
 423                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
 424                 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
 425                 {
 426                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
 427                     int i_ref = i ? h->i_ref1 : h->i_ref0;
 428                     for( j=0; j<i_ref; j++ )
 429                     {
 430                         x264_frame_cond_wait( fref[j]->orig, thresh );
 431                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
 432                     }
 433                 }
 434
 435                 if( h->param.b_deterministic )
 436                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
 437                 if( h->mb.b_interlaced )
 438                     thread_mvy_range >>= 1;
 439
 440                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
 441             }
 442
 443             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
 444             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
 445             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
 446             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
 447             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
 448             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
 449             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
 450         }
 451 #undef CLIP_FMV
 452
 453         a->l0.me16x16.cost =
 454         a->l0.i_rd16x16    =
 455         a->l0.i_cost8x8    = COST_MAX;
 456
 457         for( i = 0; i < 4; i++ )
 458         {
 459             a->l0.i_cost4x4[i] =
 460             a->l0.i_cost8x4[i] =
 461             a->l0.i_cost4x8[i] = COST_MAX;
 462         }
 463
 464         a->l0.i_cost16x8   =
 465         a->l0.i_cost8x16   = COST_MAX;
 466         if( h->sh.i_type == SLICE_TYPE_B )
 467         {
 468             a->l1.me16x16.cost =
 469             a->l1.i_rd16x16    =
 470             a->l1.i_cost8x8    = COST_MAX;
 471
 472             for( i = 0; i < 4; i++ )
 473             {
 474                 a->l1.i_cost4x4[i] =
 475                 a->l1.i_cost8x4[i] =
 476                 a->l1.i_cost4x8[i] =
 477                 a->i_cost8x8direct[i] = COST_MAX;
 478             }
 479
 480             a->l1.i_cost16x8   =
 481             a->l1.i_cost8x16   =
 482             a->i_rd16x16bi     =
 483             a->i_rd16x16direct =
 484             a->i_rd8x8bi       =
 485             a->i_rd16x8bi      =
 486             a->i_rd8x16bi      =
 487             a->i_cost16x16bi   =
 488             a->i_cost16x16direct =
 489             a->i_cost8x8bi     =
 490             a->i_cost16x8bi    =
 491             a->i_cost8x16bi    = COST_MAX;
 492         }
 493
 494         /* Fast intra decision */
 495         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 496         {
 497             if(   IS_INTRA( h->mb.i_mb_type_left )
 498                || IS_INTRA( h->mb.i_mb_type_top )
 499                || IS_INTRA( h->mb.i_mb_type_topleft )
 500                || IS_INTRA( h->mb.i_mb_type_topright )
 501                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 502                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
 503             { /* intra is likely */ }
 504             else
 505             {
 506                 a->b_fast_intra = 1;
 507             }
 508         }
 509         h->mb.b_skip_mc = 0;
 510         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
 511             h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
 512         {
 513             a->b_force_intra = 1;
 514             a->b_fast_intra = 0;
 515         }
 516         else
 517             a->b_force_intra = 0;
 518     }
 519 }
 520
 521 /* Prediction modes allowed for various combinations of neighbors. */
 522 /* Terminated by a -1. */
 523 /* In order, no neighbors, left, top, top/left, top/left/topleft */
 524 static const int8_t i16x16_mode_available[5][5] =
 525 {
 526     {I_PRED_16x16_DC_128, -1, -1, -1, -1},
 527     {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
 528     {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
 529     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
 530     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
 531 };
 532
 533 static const int8_t i8x8chroma_mode_available[5][5] =
 534 {
 535     {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
 536     {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
 537     {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
 538     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
 539     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
 540 };
 541
 542 static const int8_t i4x4_mode_available[5][10] =
 543 {
 544     {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
 545     {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
 546     {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
 547     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
 548     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
 549 };
 550
 551 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
 552 {
 553     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 554     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 555 }
 556
 557 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 558 {
 559     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 560     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 561 }
 562
 563 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
 564 {
 565     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 566     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 567 }
 568
 569 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 570 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 571 {
 572     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 573
 574     if( do_both_dct || h->mb.b_transform_8x8 )
 575         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 576     if( do_both_dct || !h->mb.b_transform_8x8 )
 577         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 578 }
 579
 580 /* Reset fenc satd scores cache for psy RD */
 581 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 582 {
 583     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 584         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 585     if( !h->mb.i_psy_rd )
 586         return;
 587     /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
 588     h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
 589     if( b_satd )
 590         h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 591 }
 592
 593 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 594 {
 595     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 596
 597     if( a->i_satd_i8x8chroma < COST_MAX )
 598         return;
 599
 600     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 601
 602     /* 8x8 prediction selection for chroma */
 603     if( predict_mode[3] >= 0 && b_merged_satd )
 604     {
 605         int satdu[4], satdv[4];
 606         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 607         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 608         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 609         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 610         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 611         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 612
 613         for( ; *predict_mode >= 0; predict_mode++ )
 614         {
 615             int i_mode = *predict_mode;
 616             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 617
 618             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 619             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 620         }
 621     }
 622     else
 623     {
 624         for( ; *predict_mode >= 0; predict_mode++ )
 625         {
 626             int i_satd;
 627             int i_mode = *predict_mode;
 628
 629             /* we do the prediction */
 630             if( h->mb.b_lossless )
 631                 x264_predict_lossless_8x8_chroma( h, i_mode );
 632             else
 633             {
 634                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 635                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 636             }
 637
 638             /* we calculate the cost */
 639             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 640                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 641                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 642
 643             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 644             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 645         }
 646     }
 647
 648     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 649 }
 650
 651 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 652 {
 653     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 654     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 655     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 656
 657     int i, idx;
 658     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 659
 660     /*---------------- Try all mode and calculate their score ---------------*/
 661
 662     /* 16x16 prediction selection */
 663     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 664
 665     if( b_merged_satd && predict_mode[3] >= 0 )
 666     {
 667         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 668         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 669         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 670             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 671         for( i=0; i<4; i++ )
 672         {
 673             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 674             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 675         }
 676     }
 677     else
 678     {
 679         for( ; *predict_mode >= 0; predict_mode++ )
 680         {
 681             int i_satd;
 682             int i_mode = *predict_mode;
 683
 684             if( h->mb.b_lossless )
 685                 x264_predict_lossless_16x16( h, i_mode );
 686             else
 687                 h->predict_16x16[i_mode]( p_dst );
 688
 689             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 690                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 691             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 692             a->i_satd_i16x16_dir[i_mode] = i_satd;
 693         }
 694     }
 695
 696     if( h->sh.i_type == SLICE_TYPE_B )
 697         /* cavlc mb type prefix */
 698         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 699     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 700         return;
 701
 702     /* 8x8 prediction selection */
 703     if( flags & X264_ANALYSE_I8x8 )
 704     {
 705         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 706         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 707         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 708         int i_cost = 0;
 709         h->mb.i_cbp_luma = 0;
 710         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 711
 712         // FIXME some bias like in i4x4?
 713         if( h->sh.i_type == SLICE_TYPE_B )
 714             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 715
 716         for( idx = 0;; idx++ )
 717         {
 718             int x = idx&1;
 719             int y = idx>>1;
 720             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 721             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 722             int i_best = COST_MAX;
 723             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 724
 725             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 726             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 727
 728             if( b_merged_satd && predict_mode[8] >= 0 )
 729             {
 730                 int satd[9];
 731                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 732                 satd[i_pred_mode] -= 3 * a->i_lambda;
 733                 for( i=2; i>=0; i-- )
 734                 {
 735                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 736                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 737                 }
 738                 predict_mode += 3;
 739             }
 740
 741             for( ; *predict_mode >= 0; predict_mode++ )
 742             {
 743                 int i_satd;
 744                 int i_mode = *predict_mode;
 745
 746                 if( h->mb.b_lossless )
 747                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 748                 else
 749                     h->predict_8x8[i_mode]( p_dst_by, edge );
 750
 751                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
 752                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 753                     i_satd -= a->i_lambda * 3;
 754
 755                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 756                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 757             }
 758             i_cost += i_best;
 759
 760             if( idx == 3 || i_cost > i_satd_thresh )
 761                 break;
 762
 763             /* we need to encode this block now (for next ones) */
 764             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 765             x264_mb_encode_i8x8( h, idx, a->i_qp );
 766
 767             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 768         }
 769
 770         if( idx == 3 )
 771         {
 772             a->i_satd_i8x8 = i_cost;
 773             if( h->mb.i_skip_intra )
 774             {
 775                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 776                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 777                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 778                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 779                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 780                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 781                 if( h->mb.i_skip_intra == 2 )
 782                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 783             }
 784         }
 785         else
 786         {
 787             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 788             a->i_satd_i8x8 = COST_MAX;
 789             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 790         }
 791         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 792             return;
 793     }
 794
 795     /* 4x4 prediction selection */
 796     if( flags & X264_ANALYSE_I4x4 )
 797     {
 798         int i_cost;
 799         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 800         h->mb.i_cbp_luma = 0;
 801         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 802         if( a->i_mbrd )
 803             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 804
 805         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 806         if( h->sh.i_type == SLICE_TYPE_B )
 807             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 808
 809         for( idx = 0;; idx++ )
 810         {
 811             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 812             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 813             int i_best = COST_MAX;
 814             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 815
 816             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 817
 818             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 819                 /* emulate missing topright samples */
 820                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 821
 822             if( b_merged_satd && predict_mode[5] >= 0 )
 823             {
 824                 int satd[9];
 825                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 826                 satd[i_pred_mode] -= 3 * a->i_lambda;
 827                 for( i=2; i>=0; i-- )
 828                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 829                 predict_mode += 3;
 830             }
 831
 832             for( ; *predict_mode >= 0; predict_mode++ )
 833             {
 834                 int i_satd;
 835                 int i_mode = *predict_mode;
 836
 837                 if( h->mb.b_lossless )
 838                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 839                 else
 840                     h->predict_4x4[i_mode]( p_dst_by );
 841
 842                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 843                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 844                     i_satd -= a->i_lambda * 3;
 845
 846                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 847             }
 848             i_cost += i_best + 4 * a->i_lambda;
 849
 850             if( i_cost > i_satd_thresh || idx == 15 )
 851                 break;
 852
 853             /* we need to encode this block now (for next ones) */
 854             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 855             x264_mb_encode_i4x4( h, idx, a->i_qp );
 856
 857             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 858         }
 859         if( idx == 15 )
 860         {
 861             a->i_satd_i4x4 = i_cost;
 862             if( h->mb.i_skip_intra )
 863             {
 864                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 865                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 866                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 867                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 868                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 869                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 870                 if( h->mb.i_skip_intra == 2 )
 871                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 872             }
 873         }
 874         else
 875             a->i_satd_i4x4 = COST_MAX;
 876     }
 877 }
 878
     /* Re-score the intra macroblock types with x264_rd_cost_mb().
      *
      * I_16x16, I_4x4 and I_8x8 are handled in that order.  For each type whose
      * cost currently stored in `a` is <= i_satd_thresh (and, for 4x4/8x8, is
      * below COST_MAX), h->mb.i_type is set, x264_analyse_update_cache() is
      * called, and the stored cost is overwritten with the value returned by
      * x264_rd_cost_mb( h, a->i_lambda2 ).  Any other type gets COST_MAX.
      *
      * On return the a->i_satd_i* fields therefore hold either an RD cost or
      * COST_MAX, and h->mb.i_type is whichever type was evaluated last. */
 879 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 880 {
 881     if( a->i_satd_i16x16 <= i_satd_thresh )
 882     {
 883         h->mb.i_type = I_16x16;
 884         x264_analyse_update_cache( h, a );
 885         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 886     }
 887     else
 888         a->i_satd_i16x16 = COST_MAX;
 889
 890     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 891     {
 892         h->mb.i_type = I_4x4;
 893         x264_analyse_update_cache( h, a );
 894         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 895     }
 896     else
 897         a->i_satd_i4x4 = COST_MAX;
 898
 899     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 900     {
 901         h->mb.i_type = I_8x8;
 902         x264_analyse_update_cache( h, a );
 903         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
     /* keep the luma cbp produced by the 8x8 evaluation; x264_intra_rd_refine()
      * reloads it into h->mb.i_cbp_luma before scoring each 8x8 mode */
 904         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 905     }
 906     else
 907         a->i_satd_i8x8 = COST_MAX;
 908 }
 909
     /* Refine the intra prediction modes of the current macroblock by RD cost.
      *
      * Clears h->mb.i_skip_intra, then:
      *  - I_16x16: tries every other available 16x16 mode whose stored
      *    a->i_satd_i16x16_dir[] does not exceed 9/8 of the current mode's,
      *    scoring each with x264_rd_cost_mb() and updating a->i_predict16x16.
      *  - chroma (runs for every macroblock type): scores the other chroma modes
      *    whose stored cost is below 5/4 of a->i_satd_i8x8chroma with
      *    x264_rd_cost_i8x8_chroma(), then commits the winner to
      *    h->mb.i_chroma_pred_mode and h->mb.i_cbp_chroma.
      *  - I_4x4: scores every available mode of each 4x4 block with
      *    x264_rd_cost_i4x4().
      *  - I_8x8: scores the modes of each 8x8 block whose stored
      *    a->i_satd_i8x8_dir[] does not exceed 11/8 of the current mode's,
      *    using x264_rd_cost_i8x8().
      *
      * Note that i_satd and i_best hold RD costs in this function despite
      * their names. */
 910 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 911 {
 912     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 913
 914     int i, idx, x, y;
 915     int i_mode, i_thresh;
 916     uint64_t i_satd, i_best;
 917     h->mb.i_skip_intra = 0;
 918
 919     if( h->mb.i_type == I_16x16 )
 920     {
 921         int old_pred_mode = a->i_predict16x16;
 922         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 923         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
     /* the current mode's cost is the one to beat */
 924         i_best = a->i_satd_i16x16;
     /* the mode list is terminated by a negative entry */
 925         for( ; *predict_mode >= 0; predict_mode++ )
 926         {
     /* NOTE: this declaration shadows the function-scope i_mode */
 927             int i_mode = *predict_mode;
 928             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 929                 continue;
 930             h->mb.i_intra16x16_pred_mode = i_mode;
 931             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 932             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 933         }
     /* NOTE(review): h->mb.i_intra16x16_pred_mode is left at the last mode
      * tried, not necessarily a->i_predict16x16 -- presumably the caller
      * re-syncs it; confirm. */
 934     }
 935
 936     /* RD selection for chroma prediction */
 937     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
     /* only worth doing when at least two chroma modes are available */
 938     if( predict_mode[1] >= 0 )
 939     {
     /* despite the name, this list is only filtered (kept in availability
      * order), not sorted */
 940         int8_t predict_mode_sorted[4];
 941         int i_max;
 942         i_thresh = a->i_satd_i8x8chroma * 5/4;
 943
     /* collect the candidates: every mode other than the current one whose
      * stored cost is below the threshold */
 944         for( i_max = 0; *predict_mode >= 0; predict_mode++ )
 945         {
 946             i_mode = *predict_mode;
 947             if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
 948                 predict_mode_sorted[i_max++] = i_mode;
 949         }
 950
 951         if( i_max > 0 )
 952         {
 953             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
 954             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
 955             /* the previous thing encoded was x264_intra_rd(), so the pixels and
 956              * coefs for the current chroma mode are still around, so we only
 957              * have to recount the bits. */
 958             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
 959             for( i = 0; i < i_max; i++ )
 960             {
 961                 i_mode = predict_mode_sorted[i];
     /* write the candidate prediction into both chroma planes of fdec */
 962                 if( h->mb.b_lossless )
 963                     x264_predict_lossless_8x8_chroma( h, i_mode );
 964                 else
 965                 {
 966                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 967                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 968                 }
 969                 /* if we've already found a mode that needs no residual, then
 970                  * probably any mode with a residual will be worse.
 971                  * so avoid dct on the remaining modes to improve speed. */
 972                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
     /* on improvement, remember both the mode and the chroma cbp it left */
 973                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
 974             }
     /* commit the winning mode and its cbp */
 975             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 976             h->mb.i_cbp_chroma = i_cbp_chroma_best;
 977         }
 978     }
 979
 980     if( h->mb.i_type == I_4x4 )
 981     {
 982         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
 983         int i_nnz = 0;
 984         for( idx = 0; idx < 16; idx++ )
 985         {
 986             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 987             i_best = COST_MAX64;
 988
 989             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 990
 991             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 992                 /* emulate missing topright samples */
 993                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 994
     /* no SATD pre-filter here: every available mode is scored */
 995             for( ; *predict_mode >= 0; predict_mode++ )
 996             {
 997                 i_mode = *predict_mode;
 998                 if( h->mb.b_lossless )
 999                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1000                 else
1001                     h->predict_4x4[i_mode]( p_dst_by );
1002                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1003
1004                 if( i_best > i_satd )
1005                 {
     /* new best: snapshot the block's four rows and its nnz entry as
      * they stand after evaluating this mode */
1006                     a->i_predict4x4[idx] = i_mode;
1007                     i_best = i_satd;
1008                     pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1009                     pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1010                     pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1011                     pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1012                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1013                 }
1014             }
1015
     /* put back the snapshot taken for the best mode; the last mode
      * evaluated is not necessarily the winner */
1016             M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1017             M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1018             M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1019             M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1020             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1021
1022             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1023         }
1024     }
1025     else if( h->mb.i_type == I_8x8 )
1026     {
1027         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1028         for( idx = 0; idx < 4; idx++ )
1029         {
     /* pels_h: bottom row of the block; pels_v: rightmost column, rows 0..6
      * (only used for the left-hand blocks, idx 0 and 2) */
1030             uint64_t pels_h = 0;
1031             uint8_t pels_v[7];
1032             uint16_t i_nnz[2] = {0}; //shut up gcc
1033             uint8_t *p_dst_by;
1034             int j;
1035             int cbp_luma_new = 0;
1036             i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1037
1038             i_best = COST_MAX64;
1039             x = idx&1;
1040             y = idx>>1;
1041
1042             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1043             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1044             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1045
1046             for( ; *predict_mode >= 0; predict_mode++ )
1047             {
1048                 i_mode = *predict_mode;
     /* skip modes whose stored cost is too far above the current mode's */
1049                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1050                     continue;
1051
1052                 if( h->mb.b_lossless )
1053                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1054                 else
1055                     h->predict_8x8[i_mode]( p_dst_by, edge );
     /* every candidate starts from the same luma cbp */
1056                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1057                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1058
1059                 if( i_best > i_satd )
1060                 {
1061                     a->i_predict8x8[idx] = i_mode;
1062                     cbp_luma_new = h->mb.i_cbp_luma;
1063                     i_best = i_satd;
1064
     /* only the bottom row and (for left-hand blocks) the right column
      * are snapshotted -- presumably the only pixels that later blocks
      * read as neighbours; confirm */
1065                     pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1066                     if( !(idx&1) )
1067                         for( j=0; j<7; j++ )
1068                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1069                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1070                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1071                 }
1072             }
     /* commit the best mode's cbp, edge pixels and nnz entries */
1073             a->i_cbp_i8x8_luma = cbp_luma_new;
1074             M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1075             if( !(idx&1) )
1076                 for( j=0; j<7; j++ )
1077                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1078             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1079             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1080
1081             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1082         }
1083     }
1084 }
1085
     /* Point the x264_me_t `m` at the source block whose top-left luma sample is
      * at (xoff,yoff): copies the mv cost table pointer and both strides, and
      * sets p_fenc[0] into plane 0 of `src` and p_fenc[1], p_fenc[2] into planes
      * 1 and 2 at (xoff>>1,yoff>>1).
      * Uses `a` and `h` from the expansion site.  Expands to several statements
      * without a wrapper, so it must not be the body of an unbraced if/loop. */
1086 #define LOAD_FENC( m, src, xoff, yoff) \
1087     (m)->p_cost_mv = a->p_cost_mv; \
1088     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1089     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1090     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1091     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1092     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1093
     /* Point `m` at reference `ref` of list `list` for the block at (xoff,yoff).
      * `src` is an array of planes: entries 0..3 are addressed with
      * (m)->i_stride[0], entries 4..5 with (m)->i_stride[1] at half the offset.
      * Also sets p_fref_w to p_fref[0], the integral pointer, weight to
      * weight_none and i_ref to `ref`.
      * Reads (m)->i_stride[], so the strides must already be set (LOAD_FENC does
      * that).  Uses `h` from the expansion site; multi-statement, unwrapped. */
1094 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1095     (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1096     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1097     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1098     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1099     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1100     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1101     (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1102     (m)->weight = weight_none; \
1103     (m)->i_ref = ref;
1104
1105 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1106     (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1107     (m)->weight = h->sh.weight[i_ref];
1108
     /* Cost of signalling reference `ref` in list `list`, looked up in the
      * a->p_cost_ref table.  Uses `a` from the expansion site. */
1109 #define REF_COST(list, ref) \
1110     (a->p_cost_ref[list][ref])
1111
     /* 16x16 inter analysis for a P macroblock: searches every list-0 reference
      * and keeps the cheapest result (motion cost plus reference cost) in
      * a->l0.me16x16.
      *
      * Side effects visible here:
      *  - a->l0.mvc[i_ref][0] and h->mb.mvr[0][i_ref][mb_xy] receive the mv found
      *    for each reference;
      *  - the ref cache is filled with the winning reference;
      *  - h->mb.i_type becomes P_L0, or P_SKIP via the early-out in the loop or
      *    the RD check at the end. */
1112 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1113 {
1114     x264_me_t m;
1115     int i_ref, i_mvc;
1116     ALIGNED_4( int16_t mvc[8][2] );
1117     int i_halfpel_thresh = INT_MAX;
     /* the threshold is only handed to the search when there is more than one
      * reference to compare */
1118     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1119
1120     /* 16x16 Search on all ref frame */
1121     m.i_pixel = PIXEL_16x16;
1122     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1123
1124     a->l0.me16x16.cost = INT_MAX;
1125     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1126     {
1127         m.i_ref_cost = REF_COST( 0, i_ref );
     /* the search works without the ref cost, so take it out of the
      * threshold for the duration of the search (added back below) */
1128         i_halfpel_thresh -= m.i_ref_cost;
1129
1130         /* search with ref */
1131         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1132         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1133
1134         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1135         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1136
1137         if( h->mb.ref_blind_dupe == i_ref )
1138         {
     /* blind duplicate reference: no full search, just refine starting
      * from the mv already found for ref 0 */
1139             CP32( m.mv, a->l0.mvc[0][0] );
1140             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1141         }
1142         else
1143             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1144
1145         /* early termination
1146          * SSD threshold would probably be better than SATD */
1147         if( i_ref == 0
1148             && a->b_try_pskip
1149             && m.cost-m.cost_mv < 300*a->i_lambda
1150             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1151               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1152             && x264_macroblock_probe_pskip( h ) )
1153         {
1154             h->mb.i_type = P_SKIP;
1155             x264_analyse_update_cache( h, a );
1156             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1157             return;
1158         }
1159
1160         m.cost += m.i_ref_cost;
1161         i_halfpel_thresh += m.i_ref_cost;
1162
1163         if( m.cost < a->l0.me16x16.cost )
1164             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1165
1166         /* save mv for predicting neighbors */
1167         CP32( a->l0.mvc[i_ref][0], m.mv );
1168         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1169     }
1170
1171     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1172     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1173
1174     h->mb.i_type = P_L0;
1175     if( a->i_mbrd )
1176     {
1177         x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
     /* if the winner is ref 0 with exactly the P_SKIP mv (and intra is not
      * being forced), get its RD cost now; a result with no coded luma or
      * chroma coefficients is turned into P_SKIP */
1178         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1179         {
1180             h->mb.i_partition = D_16x16;
1181             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1182             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1183             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1184                 h->mb.i_type = P_SKIP;
1185         }
1186     }
1187 }
1188
     /* 8x8 inter analysis in which each of the four 8x8 partitions may pick its
      * own list-0 reference.
      *
      * Fills a->l0.me8x8[0..3] with the cheapest result per partition, stores
      * the mv found for every (ref, partition) pair in a->l0.mvc[ref][1..4],
      * updates the mv/ref caches with the winners, sets a->l0.i_cost8x8 and
      * marks all sub-partitions D_L0_8x8. */
1189 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1190 {
1191     x264_me_t m;
1192     int i_ref, i;
1193     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1194     int i_maxref = h->mb.pic.i_fref[0]-1;
1195
1196     h->mb.i_partition = D_8x8;
1197
     /* raise i_maxref to the reference cached at offset i from
      * X264_SCAN8_0, unless that reference is the blind duplicate */
1198     #define CHECK_NEIGHBOUR(i)\
1199     {\
1200         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1201         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1202             i_maxref = ref;\
1203     }
1204
1205     /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1206      * than those used by the neighbors */
1207     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1208         h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1209     {
1210         i_maxref = 0;
1211         CHECK_NEIGHBOUR(  -8 - 1 );
1212         CHECK_NEIGHBOUR(  -8 + 0 );
1213         CHECK_NEIGHBOUR(  -8 + 2 );
1214         CHECK_NEIGHBOUR(  -8 + 4 );
1215         CHECK_NEIGHBOUR(   0 - 1 );
1216         CHECK_NEIGHBOUR( 2*8 - 1 );
1217     }
1218     #undef CHECK_NEIGHBOUR
1219
     /* seed candidate slot 0 of every reference that will be searched with the
      * mv stored for it in h->mb.mvr */
1220     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1221         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1222
1223     for( i = 0; i < 4; i++ )
1224     {
1225         x264_me_t *l0m = &a->l0.me8x8[i];
1226         const int x8 = i%2;
1227         const int y8 = i/2;
1228
1229         m.i_pixel = PIXEL_8x8;
1230
1231         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1232         l0m->cost = INT_MAX;
     /* visits refs 0..i_maxref and then, if it lies beyond i_maxref, the
      * blind duplicate; the loop variable is advanced at the bottom of
      * the body */
1233         for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1234         {
1235             m.i_ref_cost = REF_COST( 0, i_ref );
1236
1237             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1238             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1239
     /* the candidate ref goes into the cache before the mv is predicted */
1240             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1241             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1242             if( h->mb.ref_blind_dupe == i_ref )
1243             {
     /* start from the mv ref 0 found for this same partition */
1244                 CP32( m.mv, a->l0.mvc[0][i+1] );
1245                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1246             }
1247             else
     /* candidates: slot 0 plus the mvs of partitions 0..i-1 */
1248                 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1249
1250             m.cost += m.i_ref_cost;
1251
1252             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1253
1254             if( m.cost < l0m->cost )
1255                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1256             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1257                 i_ref = h->mb.ref_blind_dupe;
1258             else
1259                 i_ref++;
1260         }
     /* commit the winner so the following partitions predict from it */
1261         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1262         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1263
1264         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1265            are effectively zero. */
1266         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1267             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1268     }
1269
1270     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1271                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1272     /* P_8x8 ref0 has no ref cost */
1273     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1274                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1275         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1276     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1277     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1278 }
1279
     /* 8x8 inter analysis with a single reference for all four partitions: the
      * one chosen by the 16x16 search, with the blind duplicate mapped back to
      * ref 0.
      *
      * Fills a->l0.me8x8[0..3] in place, updates the mv cache after each
      * partition, sets a->l0.i_cost8x8 and marks all sub-partitions D_L0_8x8. */
1280 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1281 {
1282     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1283      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1284      * don't bother analysing the dupes. */
1285     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
     /* parses as (b_cabac || i_ref) ? REF_COST : 0, i.e. the ref cost is
      * zero only for ref 0 without CABAC */
1286     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1287     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1288     int i_mvc;
     /* mvc aliases a->l0.mvc[i_ref], so the writes below store the 16x16 mv in
      * slot 0 and the four 8x8 mvs in slots 1..4 of that table */
1289     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1290     int i;
1291
1292     /* XXX Needed for x264_mb_predict_mv */
1293     h->mb.i_partition = D_8x8;
1294
1295     i_mvc = 1;
1296     CP32( mvc[0], a->l0.me16x16.mv );
1297
1298     for( i = 0; i < 4; i++ )
1299     {
1300         x264_me_t *m = &a->l0.me8x8[i];
1301         const int x8 = i%2;
1302         const int y8 = i/2;
1303
1304         m->i_pixel = PIXEL_8x8;
1305         m->i_ref_cost = i_ref_cost;
1306
1307         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1308         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1309         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1310
1311         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1312         x264_me_search( h, m, mvc, i_mvc );
1313
1314         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1315
     /* this partition's mv becomes a search candidate for the next ones */
1316         CP32( mvc[i_mvc], m->mv );
1317         i_mvc++;
1318
1319         /* mb type cost */
1320         m->cost += i_ref_cost;
1321         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1322             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1323     }
1324
1325     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1326                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1327     /* theoretically this should include 4*ref_cost,
1328      * but 3 seems a better approximation of cabac. */
1329     if( h->param.b_cabac )
1330         a->l0.i_cost8x8 -= i_ref_cost;
1331     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1332     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1333 }
1334
     /* 16x8 inter analysis.  For each half (i=0 top, i=1 bottom) the candidate
      * references are the one or two distinct references chosen by the two 8x8
      * partitions that make up that half (2*i and 2*i+1).
      *
      * Fills a->l0.me16x8[0..1], updates the mv/ref caches with the winners and
      * sets a->l0.i_cost16x8 to the sum of both halves.  Relies on
      * a->l0.me8x8[] and a->l0.mvc[][] having been filled by the 16x16 and 8x8
      * analysis. */
1335 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1336 {
1337     x264_me_t m;
1338     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1339     ALIGNED_4( int16_t mvc[3][2] );
1340     int i, j;
1341
1342     /* XXX Needed for x264_mb_predict_mv */
1343     h->mb.i_partition = D_16x8;
1344
1345     for( i = 0; i < 2; i++ )
1346     {
1347         x264_me_t *l0m = &a->l0.me16x8[i];
1348         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1349         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
     /* lower ref first, so that a ref-0 search (if any) runs before the
      * blind-duplicate shortcut below */
1350         const int ref8[2] = { minref, maxref };
1351         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1352
1353         m.i_pixel = PIXEL_16x8;
1354
1355         LOAD_FENC( &m, p_fenc, 0, 8*i );
1356         l0m->cost = INT_MAX;
1357         for( j = 0; j < i_ref8s; j++ )
1358         {
1359             const int i_ref = ref8[j];
1360             m.i_ref_cost = REF_COST( 0, i_ref );
1361
1362             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
     /* candidates: the 16x16 mv and the mvs of the two 8x8 partitions of
      * this half, all for reference i_ref */
1363             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1364             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1365             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1366
1367             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1368             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1369
1370             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1371             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1372             /* We can only take this shortcut if the first search was performed on ref0. */
1373             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1374             {
1375                 /* We can just leave the MV from the previous ref search. */
1376                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1377             }
1378             else
1379                 x264_me_search( h, &m, mvc, 3 );
1380
1381             m.cost += m.i_ref_cost;
1382
1383             if( m.cost < l0m->cost )
1384                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1385         }
     /* commit the winner so the second half predicts from it */
1386         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1387         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1388     }
1389
1390     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1391 }
1392
     /* 8x16 inter analysis.  For each half (i=0 left, i=1 right) the candidate
      * references are the one or two distinct references chosen by the two 8x8
      * partitions that make up that half (i and i+2).
      *
      * Fills a->l0.me8x16[0..1], updates the mv/ref caches with the winners and
      * sets a->l0.i_cost8x16 to the sum of both halves.  Relies on
      * a->l0.me8x8[] and a->l0.mvc[][] having been filled by the 16x16 and 8x8
      * analysis. */
1393 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1394 {
1395     x264_me_t m;
1396     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1397     ALIGNED_4( int16_t mvc[3][2] );
1398     int i, j;
1399
1400     /* XXX Needed for x264_mb_predict_mv */
1401     h->mb.i_partition = D_8x16;
1402
1403     for( i = 0; i < 2; i++ )
1404     {
1405         x264_me_t *l0m = &a->l0.me8x16[i];
1406         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1407         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
     /* lower ref first, so that a ref-0 search (if any) runs before the
      * blind-duplicate shortcut below */
1408         const int ref8[2] = { minref, maxref };
1409         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1410
1411         m.i_pixel = PIXEL_8x16;
1412
1413         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1414         l0m->cost = INT_MAX;
1415         for( j = 0; j < i_ref8s; j++ )
1416         {
1417             const int i_ref = ref8[j];
1418             m.i_ref_cost = REF_COST( 0, i_ref );
1419
     /* candidates: the 16x16 mv and the mvs of the two 8x8 partitions of
      * this half, all for reference i_ref */
1420             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1421             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1422             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1423
1424             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1425             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1426
1427             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1428             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1429             /* We can only take this shortcut if the first search was performed on ref0. */
1430             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1431             {
1432                 /* We can just leave the MV from the previous ref search. */
1433                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1434             }
1435             else
1436                 x264_me_search( h, &m, mvc, 3 );
1437
1438             m.cost += m.i_ref_cost;
1439
1440             if( m.cost < l0m->cost )
1441                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1442         }
     /* commit the winner so the second half predicts from it */
1443         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1444         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1445     }
1446
1447     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1448 }
1449
1450 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1451 {
1452     ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1453     uint8_t *pix2 = pix1+8;
1454     const int i_stride = h->mb.pic.i_stride[1];
1455     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1456     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1457     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1458     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1459     x264_weight_t *weight = h->sh.weight[i_ref];
1460
1461 #define CHROMA4x4MC( width, height, me, x, y ) \
1462     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1463     if( weight[1].weightfn ) \
1464         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1465     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1466     if( weight[2].weightfn ) \
1467         weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1468
1469
1470     if( pixel == PIXEL_4x4 )
1471     {
1472         x264_me_t *m = a->l0.me4x4[i8x8];
1473         CHROMA4x4MC( 2,2, m[0], 0,0 );
1474         CHROMA4x4MC( 2,2, m[1], 2,0 );
1475         CHROMA4x4MC( 2,2, m[2], 0,2 );
1476         CHROMA4x4MC( 2,2, m[3], 2,2 );
1477     }
1478     else if( pixel == PIXEL_8x4 )
1479     {
1480         x264_me_t *m = a->l0.me8x4[i8x8];
1481         CHROMA4x4MC( 4,2, m[0], 0,0 );
1482         CHROMA4x4MC( 4,2, m[1], 0,2 );
1483     }
1484     else
1485     {
1486         x264_me_t *m = a->l0.me4x8[i8x8];
1487         CHROMA4x4MC( 2,4, m[0], 0,0 );
1488         CHROMA4x4MC( 2,4, m[1], 2,0 );
1489     }
1490
1491     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1492          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1493 }
1494
1495 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1496 {
1497     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1498     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1499     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1500     int i4x4;
1501
1502     /* XXX Needed for x264_mb_predict_mv */
1503     h->mb.i_partition = D_8x8;
1504
1505     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1506     {
1507         const int idx = 4*i8x8 + i4x4;
1508         const int x4 = block_idx_x[idx];
1509         const int y4 = block_idx_y[idx];
1510         const int i_mvc = (i4x4 == 0);
1511
1512         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1513
1514         m->i_pixel = PIXEL_4x4;
1515
1516         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1517         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1518         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1519
1520         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1521         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1522
1523         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1524     }
1525     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1526                             a->l0.me4x4[i8x8][1].cost +
1527                             a->l0.me4x4[i8x8][2].cost +
1528                             a->l0.me4x4[i8x8][3].cost +
1529                             REF_COST( 0, i_ref ) +
1530                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1531     if( h->mb.b_chroma_me )
1532         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1533 }
1534
1535 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1536 {
1537     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1538     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1539     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1540     int i8x4;
1541
1542     /* XXX Needed for x264_mb_predict_mv */
1543     h->mb.i_partition = D_8x8;
1544
1545     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1546     {
1547         const int idx = 4*i8x8 + 2*i8x4;
1548         const int x4 = block_idx_x[idx];
1549         const int y4 = block_idx_y[idx];
1550         const int i_mvc = (i8x4 == 0);
1551
1552         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1553
1554         m->i_pixel = PIXEL_8x4;
1555
1556         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1557         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1558         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1559
1560         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1561         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1562
1563         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1564     }
1565     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1566                             REF_COST( 0, i_ref ) +
1567                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1568     if( h->mb.b_chroma_me )
1569         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1570 }
1571
1572 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1573 {
1574     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1575     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1576     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1577     int i4x8;
1578
1579     /* XXX Needed for x264_mb_predict_mv */
1580     h->mb.i_partition = D_8x8;
1581
1582     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1583     {
1584         const int idx = 4*i8x8 + i4x8;
1585         const int x4 = block_idx_x[idx];
1586         const int y4 = block_idx_y[idx];
1587         const int i_mvc = (i4x8 == 0);
1588
1589         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1590
1591         m->i_pixel = PIXEL_4x8;
1592
1593         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1594         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1595         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1596
1597         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1598         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1599
1600         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1601     }
1602     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1603                             REF_COST( 0, i_ref ) +
1604                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1605     if( h->mb.b_chroma_me )
1606         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1607 }
1608
1609 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1610 {
1611     /* Assumes that fdec still contains the results of
1612      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1613
1614     uint8_t **p_fenc = h->mb.pic.p_fenc;
1615     uint8_t **p_fdec = h->mb.pic.p_fdec;
1616     int i;
1617
1618     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1619     for( i = 0; i < 4; i++ )
1620     {
1621         const int x = (i&1)*8;
1622         const int y = (i>>1)*8;
1623         a->i_cost16x16direct +=
1624         a->i_cost8x8direct[i] =
1625             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1626
1627         /* mb type cost */
1628         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1629     }
1630 }
1631
1632 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1633 {
1634     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1635     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1636     uint8_t *src0, *src1;
1637     int stride0 = 16, stride1 = 16;
1638     int i_ref, i_mvc, l;
1639     ALIGNED_4( int16_t mvc[9][2] );
1640
1641     x264_me_t m;
1642     m.i_pixel = PIXEL_16x16;
1643
1644     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1645
1646     /* 16x16 Search on list 0 and list 1 */
1647     for( l = 0; l < 2; l++ )
1648     {
1649         int i_halfpel_thresh = INT_MAX;
1650         int *p_halfpel_thresh = h->mb.pic.i_fref[l]>1 ? &i_halfpel_thresh : NULL;
1651         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1652
1653         lX->me16x16.cost = INT_MAX;
1654         for( i_ref = 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1655         {
1656             m.i_ref_cost = REF_COST( l, i_ref );
1657
1658             /* search with ref */
1659             LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1660             x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1661             x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1662             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1663
1664             /* add ref cost */
1665             m.cost += m.i_ref_cost;
1666
1667             if( m.cost < lX->me16x16.cost )
1668                 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1669
1670             /* save mv for predicting neighbors */
1671             CP32( lX->mvc[i_ref][0], m.mv );
1672             CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1673         }
1674     }
1675
1676     /* get cost of BI mode */
1677     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1678     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1679     int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1680     src0 = h->mc.get_ref( pix0, &stride0,
1681                           h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1682                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1683     src1 = h->mc.get_ref( pix1, &stride1,
1684                           h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1685                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1686
1687     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1688
1689     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1690                      + ref_costs
1691                      + a->l0.bi16x16.cost_mv
1692                      + a->l1.bi16x16.cost_mv;
1693
1694
1695     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1696     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1697     {
1698         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1699                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1700         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1701                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1702         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1703                                 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1704                                 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1705         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1706                    + ref_costs + l0_mv_cost + l1_mv_cost;
1707         if( cost00 < a->i_cost16x16bi )
1708         {
1709             M32( a->l0.bi16x16.mv ) = 0;
1710             M32( a->l1.bi16x16.mv ) = 0;
1711             a->l0.bi16x16.cost_mv = l0_mv_cost;
1712             a->l1.bi16x16.cost_mv = l1_mv_cost;
1713             a->i_cost16x16bi = cost00;
1714         }
1715     }
1716
1717     /* mb type cost */
1718     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1719     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1720     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1721 }
1722
1723 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1724 {
1725     const int x = 2*(i%2);
1726     const int y = 2*(i/2);
1727
1728     switch( h->mb.i_sub_partition[i] )
1729     {
1730         case D_L0_8x8:
1731             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1732             break;
1733         case D_L0_8x4:
1734             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1735             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1736             break;
1737         case D_L0_4x8:
1738             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1739             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1740             break;
1741         case D_L0_4x4:
1742             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1743             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1744             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1745             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1746             break;
1747         default:
1748             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1749             break;
1750     }
1751 }
1752
1753 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1754 {
1755     const int x = 2*(idx&1);
1756     const int y = 2*(idx>>1);
1757     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1758     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1759     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1760     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1761 }
1762
/* Cache the reference index and motion vector of one B partition for both
 * lists.  x,y,dx,dy give the position and size of the partition as passed
 * to the x264_macroblock_cache_* helpers; me0/me1 are the list-0/list-1
 * search results; part indexes x264_mb_partition_listX_table to tell which
 * lists the partition actually uses.
 *
 * For a list that is not used, the reference is set to -1, the mv to 0 and,
 * when b_mvd is non-zero, the mvd to 0 as well.
 *
 * The macro relies on `h` and `b_mvd` being in scope where it is expanded.
 * It is wrapped in do { } while( 0 ) so that it behaves as a single
 * statement; invoke it with a trailing semicolon. */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    do \
    { \
        if( x264_mb_partition_listX_table[0][part] ) \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 0, (me0).i_ref ); \
            x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, (me0).mv ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
        } \
        if( x264_mb_partition_listX_table[1][part] ) \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 1, (me1).i_ref ); \
            x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, (me1).mv ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
        } \
    } while( 0 )
1788
1789 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1790 {
1791     int x = (i%2)*2;
1792     int y = (i/2)*2;
1793     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1794     {
1795         x264_mb_load_mv_direct8x8( h, i );
1796         if( b_mvd )
1797         {
1798             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1799             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1800             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1801         }
1802     }
1803     else
1804     {
1805         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1806     }
1807 }
1808 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1809 {
1810     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1811 }
1812 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1813 {
1814     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1815 }
1816 #undef CACHE_MV_BI
1817
1818 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1819 {
1820     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1821     int i_ref, i, l;
1822     int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1823
1824     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1825      * than those used by the neighbors */
1826     #define CHECK_NEIGHBOUR(i)\
1827     {\
1828         int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1829         if( ref > i_maxref[l] )\
1830             i_maxref[l] = ref;\
1831     }
1832
1833     for( l = 0; l < 2; l++ )
1834     {
1835         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1836         if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1837             h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1838         {
1839             i_maxref[l] = 0;
1840             CHECK_NEIGHBOUR(  -8 - 1 );
1841             CHECK_NEIGHBOUR(  -8 + 0 );
1842             CHECK_NEIGHBOUR(  -8 + 2 );
1843             CHECK_NEIGHBOUR(  -8 + 4 );
1844             CHECK_NEIGHBOUR(   0 - 1 );
1845             CHECK_NEIGHBOUR( 2*8 - 1 );
1846         }
1847     }
1848
1849     /* XXX Needed for x264_mb_predict_mv */
1850     h->mb.i_partition = D_8x8;
1851
1852     a->i_cost8x8bi = 0;
1853
1854     for( i = 0; i < 4; i++ )
1855     {
1856         int x8 = i%2;
1857         int y8 = i/2;
1858         int i_part_cost;
1859         int i_part_cost_bi;
1860         int stride[2] = {8,8};
1861         uint8_t *src[2];
1862         x264_me_t m;
1863         m.i_pixel = PIXEL_8x8;
1864         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1865
1866         for( l = 0; l < 2; l++ )
1867         {
1868             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1869
1870             lX->me8x8[i].cost = INT_MAX;
1871             for( i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
1872             {
1873                 m.i_ref_cost = REF_COST( l, i_ref );;
1874
1875                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1876
1877                 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1878                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1879                 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1880                 m.cost += m.i_ref_cost;
1881
1882                 if( m.cost < lX->me8x8[i].cost )
1883                     h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1884
1885                 /* save mv for predicting other partitions within this MB */
1886                 CP32( lX->mvc[i_ref][i+1], m.mv );
1887             }
1888         }
1889
1890         /* BI mode */
1891         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1892                                 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1893         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1894                                 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1895         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1896                                 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1897
1898         i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1899                         + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost
1900                         + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1901
1902         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1903         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1904
1905         i_part_cost = a->l0.me8x8[i].cost;
1906         h->mb.i_sub_partition[i] = D_L0_8x8;
1907         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1908         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1909         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1910         a->i_cost8x8bi += i_part_cost;
1911
1912         /* XXX Needed for x264_mb_predict_mv */
1913         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1914     }
1915
1916     /* mb type cost */
1917     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1918 }
1919
1920 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1921 {
1922     uint8_t **p_fref[2] =
1923         { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
1924           h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
1925     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1926     int i, l;
1927
1928     /* XXX Needed for x264_mb_predict_mv */
1929     h->mb.i_partition = D_8x8;
1930
1931     a->i_cost8x8bi = 0;
1932
1933     for( i = 0; i < 4; i++ )
1934     {
1935         const int x8 = i%2;
1936         const int y8 = i/2;
1937         int i_part_cost;
1938         int i_part_cost_bi = 0;
1939         int stride[2] = {8,8};
1940         uint8_t *src[2];
1941
1942         for( l = 0; l < 2; l++ )
1943         {
1944             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1945             x264_me_t *m = &lX->me8x8[i];
1946             m->i_pixel = PIXEL_8x8;
1947             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1948
1949             m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
1950             m->i_ref = lX->me16x16.i_ref;
1951
1952             LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
1953
1954             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
1955             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1956             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1957             m->cost += m->i_ref_cost;
1958
1959             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1960
1961             /* save mv for predicting other partitions within this MB */
1962             CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
1963
1964             /* BI mode */
1965             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1966                                     m->mv[0], m->mv[1], 8, 8, weight_none );
1967             i_part_cost_bi += m->cost_mv + m->i_ref_cost;
1968         }
1969         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
1970         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1971                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1972         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1973         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1974
1975         i_part_cost = a->l0.me8x8[i].cost;
1976         h->mb.i_sub_partition[i] = D_L0_8x8;
1977         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1978         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1979         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1980         a->i_cost8x8bi += i_part_cost;
1981
1982         /* XXX Needed for x264_mb_predict_mv */
1983         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1984     }
1985
1986     /* mb type cost */
1987     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1988 }
1989
1990 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1991 {
1992     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1993     ALIGNED_4( int16_t mvc[3][2] );
1994     int i, j, l, i_ref;
1995
1996     h->mb.i_partition = D_16x8;
1997     a->i_cost16x8bi = 0;
1998
1999     for( i = 0; i < 2; i++ )
2000     {
2001         int i_part_cost;
2002         int i_part_cost_bi = 0;
2003         int stride[2] = {16,16};
2004         uint8_t *src[2];
2005         x264_me_t m;
2006         m.i_pixel = PIXEL_16x8;
2007         LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2008
2009         for( l = 0; l < 2; l++ )
2010         {
2011             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2012             int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2013             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2014             lX->me16x8[i].cost = INT_MAX;
2015             for( j = 0; j < i_ref8s; j++ )
2016             {
2017                 i_ref = ref8[j];
2018                 m.i_ref_cost = REF_COST( l, i_ref );;
2019
2020                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2021
2022                 CP32( mvc[0], lX->mvc[i_ref][0] );
2023                 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2024                 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2025
2026                 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2027                 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2028                 x264_me_search( h, &m, mvc, 3 );
2029                 m.cost += m.i_ref_cost;
2030
2031                 if( m.cost < lX->me16x8[i].cost )
2032                     h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2033             }
2034         }
2035
2036         /* BI mode */
2037         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2038                                 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2039         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2040                                 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2041         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2042                                 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2043
2044         i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2045                         + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2046                         + a->l1.me16x8[i].i_ref_cost;
2047
2048         i_part_cost = a->l0.me16x8[i].cost;
2049         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2050
2051         if( a->l1.me16x8[i].cost < i_part_cost )
2052         {
2053             i_part_cost = a->l1.me16x8[i].cost;
2054             a->i_mb_partition16x8[i] = D_L1_8x8;
2055         }
2056         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2057         {
2058             i_part_cost = i_part_cost_bi;
2059             a->i_mb_partition16x8[i] = D_BI_8x8;
2060         }
2061         a->i_cost16x8bi += i_part_cost;
2062
2063         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2064     }
2065
2066     /* mb type cost */
2067     a->i_mb_type16x8 = B_L0_L0
2068         + (a->i_mb_partition16x8[0]>>2) * 3
2069         + (a->i_mb_partition16x8[1]>>2);
2070     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2071 }
2072
2073 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2074 {
2075     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2076     ALIGNED_4( int16_t mvc[2][2] );
2077     int i, j, l, i_ref;
2078
2079     h->mb.i_partition = D_8x16;
2080     a->i_cost8x16bi = 0;
2081
2082     for( i = 0; i < 2; i++ )
2083     {
2084         int i_part_cost;
2085         int i_part_cost_bi = 0;
2086         int stride[2] = {8,8};
2087         uint8_t *src[2];
2088         x264_me_t m;
2089         m.i_pixel = PIXEL_8x16;
2090         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2091
2092         for( l = 0; l < 2; l++ )
2093         {
2094             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2095             int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2096             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2097             lX->me8x16[i].cost = INT_MAX;
2098             for( j = 0; j < i_ref8s; j++ )
2099             {
2100                 i_ref = ref8[j];
2101                 m.i_ref_cost = REF_COST( l, i_ref );
2102
2103                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2104
2105                 CP32( mvc[0], lX->mvc[i_ref][0] );
2106                 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2107                 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2108
2109                 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2110                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2111                 x264_me_search( h, &m, mvc, 3 );
2112                 m.cost += m.i_ref_cost;
2113
2114                 if( m.cost < lX->me8x16[i].cost )
2115                     h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2116             }
2117         }
2118
2119         /* BI mode */
2120         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2121                                 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2122         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2123                                 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2124         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2125
2126         i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2127                         + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2128                         + a->l1.me8x16[i].i_ref_cost;
2129
2130         i_part_cost = a->l0.me8x16[i].cost;
2131         a->i_mb_partition8x16[i] = D_L0_8x8;
2132
2133         if( a->l1.me8x16[i].cost < i_part_cost )
2134         {
2135             i_part_cost = a->l1.me8x16[i].cost;
2136             a->i_mb_partition8x16[i] = D_L1_8x8;
2137         }
2138         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2139         {
2140             i_part_cost = i_part_cost_bi;
2141             a->i_mb_partition8x16[i] = D_BI_8x8;
2142         }
2143         a->i_cost8x16bi += i_part_cost;
2144
2145         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2146     }
2147
2148     /* mb type cost */
2149     a->i_mb_type8x16 = B_L0_L0
2150         + (a->i_mb_partition8x16[0]>>2) * 3
2151         + (a->i_mb_partition8x16[1]>>2);
2152     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2153 }
2154
2155 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2156 {
2157     int thresh = i_satd * 5/4;
2158
2159     h->mb.i_type = P_L0;
2160     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2161     {
2162         h->mb.i_partition = D_16x16;
2163         x264_analyse_update_cache( h, a );
2164         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2165     }
2166
2167     if( a->l0.i_cost16x8 <= thresh )
2168     {
2169         h->mb.i_partition = D_16x8;
2170         x264_analyse_update_cache( h, a );
2171         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2172     }
2173     else
2174         a->l0.i_cost16x8 = COST_MAX;
2175
2176     if( a->l0.i_cost8x16 <= thresh )
2177     {
2178         h->mb.i_partition = D_8x16;
2179         x264_analyse_update_cache( h, a );
2180         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2181     }
2182     else
2183         a->l0.i_cost8x16 = COST_MAX;
2184
2185     if( a->l0.i_cost8x8 <= thresh )
2186     {
2187         h->mb.i_type = P_8x8;
2188         h->mb.i_partition = D_8x8;
2189         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2190         {
2191             int i;
2192             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2193             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2194             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2195             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2196             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2197              * for future blocks are those left over from previous RDO calls. */
2198             for( i = 0; i < 4; i++ )
2199             {
2200                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2201                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2202                 int subtype, btype = D_L0_8x8;
2203                 uint64_t bcost = COST_MAX64;
2204                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2205                 {
2206                     uint64_t cost;
2207                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2208                         continue;
2209                     h->mb.i_sub_partition[i] = subtype;
2210                     x264_mb_cache_mv_p8x8( h, a, i );
2211                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2212                     COPY2_IF_LT( bcost, cost, btype, subtype );
2213                 }
2214                 if( h->mb.i_sub_partition[i] != btype )
2215                 {
2216                     h->mb.i_sub_partition[i] = btype;
2217                     x264_mb_cache_mv_p8x8( h, a, i );
2218                 }
2219             }
2220         }
2221         else
2222             x264_analyse_update_cache( h, a );
2223         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2224     }
2225     else
2226         a->l0.i_cost8x8 = COST_MAX;
2227 }
2228
2229 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2230 {
2231     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2232
2233     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2234     {
2235         h->mb.i_type = B_DIRECT;
2236         /* Assumes direct/skip MC is still in fdec */
2237         /* Requires b-rdo to be done before intra analysis */
2238         h->mb.b_skip_mc = 1;
2239         x264_analyse_update_cache( h, a );
2240         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2241         h->mb.b_skip_mc = 0;
2242     }
2243
2244     //FIXME not all the update_cache calls are needed
2245     h->mb.i_partition = D_16x16;
2246     /* L0 */
2247     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2248     {
2249         h->mb.i_type = B_L0_L0;
2250         x264_analyse_update_cache( h, a );
2251         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2252     }
2253
2254     /* L1 */
2255     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2256     {
2257         h->mb.i_type = B_L1_L1;
2258         x264_analyse_update_cache( h, a );
2259         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2260     }
2261
2262     /* BI */
2263     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2264     {
2265         h->mb.i_type = B_BI_BI;
2266         x264_analyse_update_cache( h, a );
2267         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2268     }
2269
2270     /* 8x8 */
2271     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2272     {
2273         h->mb.i_type = B_8x8;
2274         h->mb.i_partition = D_8x8;
2275         x264_analyse_update_cache( h, a );
2276         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2277         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2278     }
2279
2280     /* 16x8 */
2281     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2282     {
2283         h->mb.i_type = a->i_mb_type16x8;
2284         h->mb.i_partition = D_16x8;
2285         x264_analyse_update_cache( h, a );
2286         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2287     }
2288
2289     /* 8x16 */
2290     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2291     {
2292         h->mb.i_type = a->i_mb_type8x16;
2293         h->mb.i_partition = D_8x16;
2294         x264_analyse_update_cache( h, a );
2295         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2296     }
2297 }
2298
2299 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2300 {
2301     int i_biweight;
2302     int i;
2303
2304     if( IS_INTRA(h->mb.i_type) )
2305         return;
2306
2307     switch( h->mb.i_partition )
2308     {
2309         case D_16x16:
2310             if( h->mb.i_type == B_BI_BI )
2311             {
2312                 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2313                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2314             }
2315             break;
2316         case D_16x8:
2317             for( i=0; i<2; i++ )
2318                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2319                 {
2320                     i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2321                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2322                 }
2323             break;
2324         case D_8x16:
2325             for( i=0; i<2; i++ )
2326                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2327                 {
2328                     i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2329                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2330                 }
2331             break;
2332         case D_8x8:
2333             for( i=0; i<4; i++ )
2334                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2335                 {
2336                     i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2337                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2338                 }
2339             break;
2340     }
2341 }
2342
2343 static inline void x264_mb_analyse_transform( x264_t *h )
2344 {
2345     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2346     {
2347         int i_cost4, i_cost8;
2348         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2349         x264_mb_mc( h );
2350
2351         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2352                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2353         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2354                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2355
2356         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2357         h->mb.b_skip_mc = 1;
2358     }
2359 }
2360
2361 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2362 {
2363     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2364     {
2365         int i_rd8;
2366         x264_analyse_update_cache( h, a );
2367         h->mb.b_transform_8x8 ^= 1;
2368         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2369         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2370
2371         if( *i_rd >= i_rd8 )
2372         {
2373             if( *i_rd > 0 )
2374                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2375             *i_rd = i_rd8;
2376         }
2377         else
2378             h->mb.b_transform_8x8 ^= 1;
2379     }
2380 }
2381
2382 /* Rate-distortion optimal QP selection.
2383  * FIXME: More than half of the benefit of this function seems to be
2384  * in the way it improves the coding of chroma DC (by decimating or
2385  * finding a better way to code a single DC coefficient.)
2386  * There must be a more efficient way to get that portion of the benefit
2387  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2388  * trick. */
2389 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2390 {
2391     int bcost, cost, direction, failures, prevcost, origcost;
2392     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2393     int last_qp_tried = 0;
2394     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2395     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2396
2397     /* If CBP is already zero, don't raise the quantizer any higher. */
2398     for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2399     {
2400         /* Without psy-RD, require monotonicity when moving quant away from previous
2401          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2402          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2403          * allow 2 failures when moving quant towards previous quant.
2404          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2405         int threshold = (!!h->mb.i_psy_rd);
2406         /* Raise the threshold for failures if we're moving towards the last QP. */
2407         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2408             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2409             threshold++;
2410         h->mb.i_qp = orig_qp;
2411         failures = 0;
2412         prevcost = origcost;
2413
2414         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2415          * (up to a point) will too.  So, jump down to where the threshold will kick in
2416          * and check the QP there.  If the CBP is still empty, skip the main loop.
2417          * If it isn't empty, we would have ended up having to check this QP anyways,
2418          * so as long as we store it for later lookup, we lose nothing. */
2419         int already_checked_qp = -1;
2420         int already_checked_cost = COST_MAX;
2421         if( direction == -1 )
2422         {
2423             if( !origcbp )
2424             {
2425                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2426                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2427                 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2428                 if( !h->mb.cbp[h->mb.i_mb_xy] )
2429                 {
2430                     /* If our empty-CBP block is lower QP than the last QP,
2431                      * the last QP almost surely doesn't have a CBP either. */
2432                     if( h->mb.i_last_qp > h->mb.i_qp )
2433                         last_qp_tried = 1;
2434                     break;
2435                 }
2436                 already_checked_qp = h->mb.i_qp;
2437                 h->mb.i_qp = orig_qp;
2438             }
2439         }
2440
2441         h->mb.i_qp += direction;
2442         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2443         {
2444             if( h->mb.i_last_qp == h->mb.i_qp )
2445                 last_qp_tried = 1;
2446             if( h->mb.i_qp == already_checked_qp )
2447                 cost = already_checked_cost;
2448             else
2449             {
2450                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2451                 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2452                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2453             }
2454
2455             /* We can't assume that the costs are monotonic over QPs.
2456              * Tie case-as-failure seems to give better results. */
2457             if( cost < prevcost )
2458                 failures = 0;
2459             else
2460                 failures++;
2461             prevcost = cost;
2462
2463             if( failures > threshold )
2464                 break;
2465             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2466                 break;
2467             h->mb.i_qp += direction;
2468         }
2469     }
2470
2471     /* Always try the last block's QP. */
2472     if( !last_qp_tried )
2473     {
2474         h->mb.i_qp = h->mb.i_last_qp;
2475         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2476         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2477         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2478     }
2479
2480     h->mb.i_qp = bqp;
2481     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2482
2483     /* Check transform again; decision from before may no longer be optimal. */
2484     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2485         x264_mb_transform_8x8_allowed( h ) )
2486     {
2487         h->mb.b_transform_8x8 ^= 1;
2488         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2489         if( cost > bcost )
2490             h->mb.b_transform_8x8 ^= 1;
2491     }
2492 }
2493
2494 /*****************************************************************************
2495  * x264_macroblock_analyse:
2496  *****************************************************************************/
2497 void x264_macroblock_analyse( x264_t *h )
2498 {
2499     x264_mb_analysis_t analysis;
2500     int i_cost = COST_MAX;
2501     int i;
2502
2503     h->mb.i_qp = x264_ratecontrol_qp( h );
2504     if( h->param.rc.i_aq_mode )
2505     {
2506         x264_adaptive_quant( h );
2507         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2508          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2509         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2510             h->mb.i_qp = h->mb.i_last_qp;
2511     }
2512
2513     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2514
2515     /*--------------------------- Do the analysis ---------------------------*/
2516     if( h->sh.i_type == SLICE_TYPE_I )
2517     {
2518 intra_analysis:
2519         if( analysis.i_mbrd )
2520             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2521         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2522         if( analysis.i_mbrd )
2523             x264_intra_rd( h, &analysis, COST_MAX );
2524
2525         i_cost = analysis.i_satd_i16x16;
2526         h->mb.i_type = I_16x16;
2527         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2528         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2529         if( analysis.i_satd_pcm < i_cost )
2530             h->mb.i_type = I_PCM;
2531
2532         else if( analysis.i_mbrd >= 2 )
2533             x264_intra_rd_refine( h, &analysis );
2534     }
2535     else if( h->sh.i_type == SLICE_TYPE_P )
2536     {
2537         int b_skip = 0;
2538
2539         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2540
2541         analysis.b_try_pskip = 0;
2542         if( analysis.b_force_intra )
2543         {
2544             if( !h->param.analyse.b_psy )
2545             {
2546                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2547                 goto intra_analysis;
2548             }
2549         }
2550         else
2551         {
2552             /* Fast P_SKIP detection */
2553             if( h->param.analyse.b_fast_pskip )
2554             {
2555                 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2556                     // FIXME don't need to check this if the reference frame is done
2557                     {}
2558                 else if( h->param.analyse.i_subpel_refine >= 3 )
2559                     analysis.b_try_pskip = 1;
2560                 else if( h->mb.i_mb_type_left == P_SKIP ||
2561                          h->mb.i_mb_type_top == P_SKIP ||
2562                          h->mb.i_mb_type_topleft == P_SKIP ||
2563                          h->mb.i_mb_type_topright == P_SKIP )
2564                     b_skip = x264_macroblock_probe_pskip( h );
2565             }
2566         }
2567
2568         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2569
2570         if( b_skip )
2571         {
2572             h->mb.i_type = P_SKIP;
2573             h->mb.i_partition = D_16x16;
2574             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2575         }
2576         else
2577         {
2578             const unsigned int flags = h->param.analyse.inter;
2579             int i_type;
2580             int i_partition;
2581             int i_thresh16x8;
2582             int i_satd_inter, i_satd_intra;
2583
2584             x264_mb_analyse_load_costs( h, &analysis );
2585
2586             x264_mb_analyse_inter_p16x16( h, &analysis );
2587
2588             if( h->mb.i_type == P_SKIP )
2589                 return;
2590
2591             if( flags & X264_ANALYSE_PSUB16x16 )
2592             {
2593                 if( h->param.analyse.b_mixed_references )
2594                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2595                 else
2596                     x264_mb_analyse_inter_p8x8( h, &analysis );
2597             }
2598
2599             /* Select best inter mode */
2600             i_type = P_L0;
2601             i_partition = D_16x16;
2602             i_cost = analysis.l0.me16x16.cost;
2603
2604             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2605                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2606             {
2607                 i_type = P_8x8;
2608                 i_partition = D_8x8;
2609                 i_cost = analysis.l0.i_cost8x8;
2610
2611                 /* Do sub 8x8 */
2612                 if( flags & X264_ANALYSE_PSUB8x8 )
2613                 {
2614                     for( i = 0; i < 4; i++ )
2615                     {
2616                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2617                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2618                         {
2619                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2620                             h->mb.i_sub_partition[i] = D_L0_4x4;
2621
2622                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2623                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2624                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2625
2626                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2627                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2628                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2629
2630                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2631                         }
2632                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2633                     }
2634                     analysis.l0.i_cost8x8 = i_cost;
2635                 }
2636             }
2637
2638             /* Now do 16x8/8x16 */
2639             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2640             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2641                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2642             {
2643                 x264_mb_analyse_inter_p16x8( h, &analysis );
2644                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2645
2646                 x264_mb_analyse_inter_p8x16( h, &analysis );
2647                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2648             }
2649
2650             h->mb.i_partition = i_partition;
2651
2652             /* refine qpel */
2653             //FIXME mb_type costs?
2654             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2655             {
2656                 /* refine later */
2657             }
2658             else if( i_partition == D_16x16 )
2659             {
2660                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2661                 i_cost = analysis.l0.me16x16.cost;
2662             }
2663             else if( i_partition == D_16x8 )
2664             {
2665                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2666                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2667                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2668             }
2669             else if( i_partition == D_8x16 )
2670             {
2671                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2672                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2673                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2674             }
2675             else if( i_partition == D_8x8 )
2676             {
2677                 int i8x8;
2678                 i_cost = 0;
2679                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2680                 {
2681                     switch( h->mb.i_sub_partition[i8x8] )
2682                     {
2683                         case D_L0_8x8:
2684                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2685                             i_cost += analysis.l0.me8x8[i8x8].cost;
2686                             break;
2687                         case D_L0_8x4:
2688                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2689                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2690                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2691                                       analysis.l0.me8x4[i8x8][1].cost;
2692                             break;
2693                         case D_L0_4x8:
2694                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2695                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2696                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2697                                       analysis.l0.me4x8[i8x8][1].cost;
2698                             break;
2699
2700                         case D_L0_4x4:
2701                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2702                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2703                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2704                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2705                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2706                                       analysis.l0.me4x4[i8x8][1].cost +
2707                                       analysis.l0.me4x4[i8x8][2].cost +
2708                                       analysis.l0.me4x4[i8x8][3].cost;
2709                             break;
2710                         default:
2711                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2712                             break;
2713                     }
2714                 }
2715             }
2716
2717             if( h->mb.b_chroma_me )
2718             {
2719                 x264_mb_analyse_intra_chroma( h, &analysis );
2720                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2721                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2722                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2723                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2724             }
2725             else
2726                 x264_mb_analyse_intra( h, &analysis, i_cost );
2727
2728             i_satd_inter = i_cost;
2729             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2730                                       analysis.i_satd_i8x8,
2731                                       analysis.i_satd_i4x4 );
2732
2733             if( analysis.i_mbrd )
2734             {
2735                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2736                 i_type = P_L0;
2737                 i_partition = D_16x16;
2738                 i_cost = analysis.l0.i_rd16x16;
2739                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2740                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2741                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2742                 h->mb.i_type = i_type;
2743                 h->mb.i_partition = i_partition;
2744                 if( i_cost < COST_MAX )
2745                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2746                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2747             }
2748
2749             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2750             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2751             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2752             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2753
2754             h->mb.i_type = i_type;
2755
2756             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2757             {
2758                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2759                  * it was an inter block. */
2760                 x264_analyse_update_cache( h, &analysis );
2761                 x264_macroblock_encode( h );
2762                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2763                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2764                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2765                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2766                 goto intra_analysis;
2767             }
2768
2769             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2770             {
2771                 if( IS_INTRA( h->mb.i_type ) )
2772                 {
2773                     x264_intra_rd_refine( h, &analysis );
2774                 }
2775                 else if( i_partition == D_16x16 )
2776                 {
2777                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2778                     analysis.l0.me16x16.cost = i_cost;
2779                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2780                 }
2781                 else if( i_partition == D_16x8 )
2782                 {
2783                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2784                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2785                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2786                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2787                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2788                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2789                 }
2790                 else if( i_partition == D_8x16 )
2791                 {
2792                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2793                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2794                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2795                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2796                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2797                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2798                 }
2799                 else if( i_partition == D_8x8 )
2800                 {
2801                     int i8x8;
2802                     x264_analyse_update_cache( h, &analysis );
2803                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2804                     {
2805                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2806                         {
2807                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2808                         }
2809                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2810                         {
2811                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2812                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2813                         }
2814                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2815                         {
2816                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2817                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2818                         }
2819                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2820                         {
2821                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2822                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2823                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2824                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2825                         }
2826                     }
2827                 }
2828             }
2829         }
2830     }
2831     else if( h->sh.i_type == SLICE_TYPE_B )
2832     {
2833         int i_bskip_cost = COST_MAX;
2834         int b_skip = 0;
2835
2836         if( analysis.i_mbrd )
2837             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2838
2839         h->mb.i_type = B_SKIP;
2840         if( h->mb.b_direct_auto_write )
2841         {
2842             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2843             for( i = 0; i < 2; i++ )
2844             {
2845                 int b_changed = 1;
2846                 h->sh.b_direct_spatial_mv_pred ^= 1;
2847                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2848                 if( analysis.b_direct_available )
2849                 {
2850                     if( b_changed )
2851                     {
2852                         x264_mb_mc( h );
2853                         b_skip = x264_macroblock_probe_bskip( h );
2854                     }
2855                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2856                 }
2857                 else
2858                     b_skip = 0;
2859             }
2860         }
2861         else
2862             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2863
2864         if( analysis.b_direct_available )
2865         {
2866             if( !h->mb.b_direct_auto_write )
2867                 x264_mb_mc( h );
2868             if( analysis.i_mbrd )
2869             {
2870                 i_bskip_cost = ssd_mb( h );
2871                 /* 6 = minimum cavlc cost of a non-skipped MB */
2872                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2873             }
2874             else if( !h->mb.b_direct_auto_write )
2875             {
2876                 /* Conditioning the probe on neighboring block types
2877                  * doesn't seem to help speed or quality. */
2878                 b_skip = x264_macroblock_probe_bskip( h );
2879             }
2880         }
2881
2882         if( !b_skip )
2883         {
2884             const unsigned int flags = h->param.analyse.inter;
2885             int i_type;
2886             int i_partition;
2887             int i_satd_inter;
2888             h->mb.b_skip_mc = 0;
2889
2890             x264_mb_analyse_load_costs( h, &analysis );
2891
2892             /* select best inter mode */
2893             /* direct must be first */
2894             if( analysis.b_direct_available )
2895                 x264_mb_analyse_inter_direct( h, &analysis );
2896
2897             x264_mb_analyse_inter_b16x16( h, &analysis );
2898
2899             i_type = B_L0_L0;
2900             i_partition = D_16x16;
2901             i_cost = analysis.l0.me16x16.cost;
2902             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2903             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2904             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2905
2906             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2907             {
2908                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2909                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2910                     i_bskip_cost < analysis.i_rd16x16bi &&
2911                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2912                     i_bskip_cost < analysis.l1.i_rd16x16 )
2913                 {
2914                     h->mb.i_type = B_SKIP;
2915                     x264_analyse_update_cache( h, &analysis );
2916                     return;
2917                 }
2918             }
2919
2920             if( flags & X264_ANALYSE_BSUB16x16 )
2921             {
2922                 if( h->param.analyse.b_mixed_references )
2923                     x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
2924                 else
2925                     x264_mb_analyse_inter_b8x8( h, &analysis );
2926
2927                 if( analysis.i_cost8x8bi < i_cost )
2928                 {
2929                     i_type = B_8x8;
2930                     i_partition = D_8x8;
2931                     i_cost = analysis.i_cost8x8bi;
2932
2933                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2934                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2935                     {
2936                         x264_mb_analyse_inter_b16x8( h, &analysis );
2937                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2938                                      i_type, analysis.i_mb_type16x8,
2939                                      i_partition, D_16x8 );
2940                     }
2941                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2942                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2943                     {
2944                         x264_mb_analyse_inter_b8x16( h, &analysis );
2945                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2946                                      i_type, analysis.i_mb_type8x16,
2947                                      i_partition, D_8x16 );
2948                     }
2949                 }
2950             }
2951
2952             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2953             {
2954                 /* refine later */
2955             }
2956             /* refine qpel */
2957             else if( i_partition == D_16x16 )
2958             {
2959                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2960                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2961                 if( i_type == B_L0_L0 )
2962                 {
2963                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2964                     i_cost = analysis.l0.me16x16.cost
2965                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2966                 }
2967                 else if( i_type == B_L1_L1 )
2968                 {
2969                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2970                     i_cost = analysis.l1.me16x16.cost
2971                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2972                 }
2973                 else if( i_type == B_BI_BI )
2974                 {
2975                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2976                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2977                 }
2978             }
2979             else if( i_partition == D_16x8 )
2980             {
2981                 for( i=0; i<2; i++ )
2982                 {
2983                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2984                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2985                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2986                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2987                 }
2988             }
2989             else if( i_partition == D_8x16 )
2990             {
2991                 for( i=0; i<2; i++ )
2992                 {
2993                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2994                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2995                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2996                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2997                 }
2998             }
2999             else if( i_partition == D_8x8 )
3000             {
3001                 for( i=0; i<4; i++ )
3002                 {
3003                     x264_me_t *m;
3004                     int i_part_cost_old;
3005                     int i_type_cost;
3006                     int i_part_type = h->mb.i_sub_partition[i];
3007                     int b_bidir = (i_part_type == D_BI_8x8);
3008
3009                     if( i_part_type == D_DIRECT_8x8 )
3010                         continue;
3011                     if( x264_mb_partition_listX_table[0][i_part_type] )
3012                     {
3013                         m = &analysis.l0.me8x8[i];
3014                         i_part_cost_old = m->cost;
3015                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3016                         m->cost -= i_type_cost;
3017                         x264_me_refine_qpel( h, m );
3018                         if( !b_bidir )
3019                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3020                     }
3021                     if( x264_mb_partition_listX_table[1][i_part_type] )
3022                     {
3023                         m = &analysis.l1.me8x8[i];
3024                         i_part_cost_old = m->cost;
3025                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3026                         m->cost -= i_type_cost;
3027                         x264_me_refine_qpel( h, m );
3028                         if( !b_bidir )
3029                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3030                     }
3031                     /* TODO: update mvp? */
3032                 }
3033             }
3034
3035             i_satd_inter = i_cost;
3036
3037             if( analysis.i_mbrd )
3038             {
3039                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3040                 i_type = B_SKIP;
3041                 i_cost = i_bskip_cost;
3042                 i_partition = D_16x16;
3043                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3044                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3045                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3046                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3047                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3048                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3049                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3050
3051                 h->mb.i_type = i_type;
3052                 h->mb.i_partition = i_partition;
3053             }
3054
3055             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3056
3057             if( analysis.i_mbrd )
3058             {
3059                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3060                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3061             }
3062
3063             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3064             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3065             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3066             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3067
3068             h->mb.i_type = i_type;
3069             h->mb.i_partition = i_partition;
3070
3071             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3072                 x264_intra_rd_refine( h, &analysis );
3073             if( h->mb.i_subpel_refine >= 5 )
3074                 x264_refine_bidir( h, &analysis );
3075
3076             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3077             {
3078                 int i_biweight;
3079                 x264_analyse_update_cache( h, &analysis );
3080
3081                 if( i_partition == D_16x16 )
3082                 {
3083                     if( i_type == B_L0_L0 )
3084                     {
3085                         analysis.l0.me16x16.cost = i_cost;
3086                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3087                     }
3088                     else if( i_type == B_L1_L1 )
3089                     {
3090                         analysis.l1.me16x16.cost = i_cost;
3091                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3092                     }
3093                     else if( i_type == B_BI_BI )
3094                     {
3095                         i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3096                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3097                     }
3098                 }
3099                 else if( i_partition == D_16x8 )
3100                 {
3101                     for( i = 0; i < 2; i++ )
3102                     {
3103                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3104                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3105                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3106                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3107                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3108                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3109                         {
3110                             i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3111                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3112                         }
3113                     }
3114                 }
3115                 else if( i_partition == D_8x16 )
3116                 {
3117                     for( i = 0; i < 2; i++ )
3118                     {
3119                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3120                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3121                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3122                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3123                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3124                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3125                         {
3126                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3127                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3128                         }
3129                     }
3130                 }
3131                 else if( i_partition == D_8x8 )
3132                 {
3133                     for( i = 0; i < 4; i++ )
3134                     {
3135                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3136                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3137                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3138                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3139                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3140                         {
3141                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3142                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3143                         }
3144                     }
3145                 }
3146             }
3147         }
3148     }
3149
3150     x264_analyse_update_cache( h, &analysis );
3151
3152     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3153      * without realizing it.  Check for this and account for it if necessary. */
3154     if( analysis.i_mbrd >= 2 )
3155     {
3156         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3157         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3158         int list = check_mv_lists[h->mb.i_type] - 1;
3159         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3160             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3161             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3162                 h->mb.i_partition = D_16x16;
3163     }
3164
3165     if( !analysis.i_mbrd )
3166         x264_mb_analyse_transform( h );
3167
3168     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3169         x264_mb_analyse_qp_rd( h, &analysis );
3170
3171     h->mb.b_trellis = h->param.analyse.i_trellis;
3172     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3173     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3174         x264_psy_trellis_init( h, 0 );
3175     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3176         h->mb.i_skip_intra = 0;
3177 }
3178
3179 /*-------------------- Update MB from the analysis ----------------------*/
3180 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3181 {
3182     int i;
3183
3184     switch( h->mb.i_type )
3185     {
3186         case I_4x4:
3187             for( i = 0; i < 16; i++ )
3188                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3189
3190             x264_mb_analyse_intra_chroma( h, a );
3191             break;
3192         case I_8x8:
3193             for( i = 0; i < 4; i++ )
3194                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3195
3196             x264_mb_analyse_intra_chroma( h, a );
3197             break;
3198         case I_16x16:
3199             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3200             x264_mb_analyse_intra_chroma( h, a );
3201             break;
3202
3203         case I_PCM:
3204             break;
3205
3206         case P_L0:
3207             switch( h->mb.i_partition )
3208             {
3209                 case D_16x16:
3210                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3211                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3212                     break;
3213
3214                 case D_16x8:
3215                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3216                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3217                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3218                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3219                     break;
3220
3221                 case D_8x16:
3222                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3223                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3224                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3225                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3226                     break;
3227
3228                 default:
3229                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3230                     break;
3231             }
3232             break;
3233
3234         case P_8x8:
3235             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3236             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3237             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3238             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3239             for( i = 0; i < 4; i++ )
3240                 x264_mb_cache_mv_p8x8( h, a, i );
3241             break;
3242
3243         case P_SKIP:
3244         {
3245             h->mb.i_partition = D_16x16;
3246             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3247             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3248             break;
3249         }
3250
3251         case B_SKIP:
3252         case B_DIRECT:
3253             h->mb.i_partition = h->mb.cache.direct_partition;
3254             x264_mb_load_mv_direct8x8( h, 0 );
3255             x264_mb_load_mv_direct8x8( h, 1 );
3256             x264_mb_load_mv_direct8x8( h, 2 );
3257             x264_mb_load_mv_direct8x8( h, 3 );
3258             break;
3259
3260         case B_8x8:
3261             /* optimize: cache might not need to be rewritten */
3262             for( i = 0; i < 4; i++ )
3263                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3264             break;
3265
3266         default: /* the rest of the B types */
3267             switch( h->mb.i_partition )
3268             {
3269             case D_16x16:
3270                 switch( h->mb.i_type )
3271                 {
3272                 case B_L0_L0:
3273                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3274                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3275
3276                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3277                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3278                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3279                     break;
3280                 case B_L1_L1:
3281                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3282                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3283                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3284
3285                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3286                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3287                     break;
3288                 case B_BI_BI:
3289                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3290                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3291
3292                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3293                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3294                     break;
3295                 }
3296                 break;
3297             case D_16x8:
3298                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3299                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3300                 break;
3301             case D_8x16:
3302                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3303                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3304                 break;
3305             default:
3306                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3307                 break;
3308             }
3309     }
3310
3311 #ifndef NDEBUG
3312     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3313     {
3314         int l;
3315         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3316         {
3317             int completed;
3318             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3319             if( ref < 0 )
3320                 continue;
3321             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3322             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3323             {
3324                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3325                 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3326                 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3327                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3328                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3329                 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3330                 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3331                 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3332                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3333                 x264_mb_analyse_intra( h, a, COST_MAX );
3334                 h->mb.i_type = I_16x16;
3335                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3336                 x264_mb_analyse_intra_chroma( h, a );
3337             }
3338         }
3339     }
3340 #endif
3341 }
3342
3343 #include "slicetype.c"
3344