git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "macroblock.h"
  31 #include "me.h"
  32 #include "ratecontrol.h"
  33 #include "analyse.h"
  34 #include "rdo.c"
  35
     /* Inter analysis results for a single reference list.  x264_mb_analysis_t
      * embeds two of these, as l0 and l1.  Each partition shape has an integer
      * cost plus one x264_me_t per partition of that shape. */
  36 typedef struct
  37 {
  38     /* 16x16 */
  39     int       i_rd16x16;    /* set to COST_MAX in x264_mb_analyse_init() */
  40     x264_me_t me16x16;
  41     x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
  42
  43     /* 8x8 */
  44     int       i_cost8x8;
  45     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
     /* NOTE(review): the trailing [2] presumably holds the x/y components -- confirm. */
  46     ALIGNED_4( int16_t mvc[32][5][2] );
  47     x264_me_t me8x8[4];
  48
  49     /* Sub 4x4 */
  50     int       i_cost4x4[4]; /* cost per 8x8 partition */
  51     x264_me_t me4x4[4][4];
  52
  53     /* Sub 8x4 */
  54     int       i_cost8x4[4]; /* cost per 8x8 partition */
  55     x264_me_t me8x4[4][2];
  56
  57     /* Sub 4x8 */
  58     int       i_cost4x8[4]; /* cost per 8x8 partition */
  59     x264_me_t me4x8[4][2];
  60
  61     /* 16x8 */
  62     int       i_cost16x8;
  63     x264_me_t me16x8[2];
  64
  65     /* 8x16 */
  66     int       i_cost8x16;
  67     x264_me_t me8x16[2];
  68
  69 } x264_mb_analysis_list_t;
  70
     /* Working state for analysing one macroblock: the lambda/QP in use, cached
      * cost-table pointers, the intra candidates' costs and chosen modes, and the
      * inter results for both reference lists.  Filled in by
      * x264_mb_analyse_init() before the individual analysis passes run. */
  71 typedef struct
  72 {
  73     /* conduct the analysis using this lambda and QP */
  74     int i_lambda;            /* x264_lambda_tab[i_qp] */
  75     int i_lambda2;           /* x264_lambda2_tab[i_qp] */
  76     int i_qp;
  77     uint16_t *p_cost_mv;     /* h->cost_mv[i_lambda], see x264_mb_analyse_load_costs() */
  78     uint16_t *p_cost_ref[2]; /* rows of x264_cost_ref for list 0 / list 1 */
  79     int i_mbrd;              /* 0..3: 1 = RD mode decision, 2 = RD refinement, 3 = QPRD */
  80
  81
  82     /* I: Intra part */
  83     /* Take some shortcuts in intra search if intra is deemed unlikely */
  84     int b_fast_intra;
  85     int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
  86     int b_try_skip;
  87
  88     /* Luma part */
  89     int i_satd_i16x16;
  90     int i_satd_i16x16_dir[7];
  91     int i_predict16x16;
  92
  93     int i_satd_i8x8;
  94     int i_cbp_i8x8_luma;
  95     int i_satd_i8x8_dir[12][4]; /* [prediction mode][8x8 block index] */
  96     int i_predict8x8[4];        /* chosen mode per 8x8 block */
  97
  98     int i_satd_i4x4;
  99     int i_predict4x4[16];
 100
 101     int i_satd_pcm;
 102
 103     /* Chroma part */
 104     int i_satd_i8x8chroma;      /* COST_MAX until x264_mb_analyse_intra_chroma() has run */
 105     int i_satd_i8x8chroma_dir[7];
 106     int i_predict8x8chroma;
 107
 108     /* II: Inter part P/B frame */
 109     x264_mb_analysis_list_t l0;
 110     x264_mb_analysis_list_t l1;
 111
 112     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
 113     int i_cost16x16direct;
 114     int i_cost8x8bi;
 115     int i_cost8x8direct[4];
 116     int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
 117     int i_cost_est16x8[2]; /* Per-partition estimated cost */
 118     int i_cost_est8x16[2];
 119     int i_cost16x8bi;
 120     int i_cost8x16bi;
 121     int i_rd16x16bi;
 122     int i_rd16x16direct;
 123     int i_rd16x8bi;
 124     int i_rd8x16bi;
 125     int i_rd8x8bi;
 126
 127     int i_mb_partition16x8[2]; /* mb_partition_e */
 128     int i_mb_partition8x16[2];
 129     int i_mb_type16x8; /* mb_class_e */
 130     int i_mb_type8x16;
 131
 132     int b_direct_available;
 133
 134 } x264_mb_analysis_t;
 135
 136 /* lambda = pow(2,qp/6-2), rounded, with a minimum of 1 (every entry below
     * QP 12 is 1).  Indexed by QP 0..51; read by x264_analyse_init_costs() and
     * x264_mb_analyse_init_qp(). */
 137 const uint8_t x264_lambda_tab[52] = {
 138    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 139    1, 1, 1, 1,              /*  8-11 */
 140    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 141    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 142    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 143   16,18,20,23,25,29,32,36,  /* 36-43 */
 144   40,45,51,57,64,72,81,91   /* 44-51 */
 145 };
 146
 147 /* lambda2 = pow(lambda,2) * .9 * 256, indexed by QP 0..51.
     * The low entries are below 230 (the value for lambda == 1), so lambda here
     * is the unrounded pow(2,qp/6-2) rather than the clamped x264_lambda_tab
     * value.  Read into a->i_lambda2 by x264_mb_analyse_init_qp(). */
 148 const int x264_lambda2_tab[52] = {
 149     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
 150     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
 151    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
 152   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 153  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
 154 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
 155 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 156 };
 157
     /* 64-entry 8-bit lookup table.
      * NOTE(review): the values match round((pow(2,i/64.) - 1) * 256), i.e. the
      * fractional part of 2^x in 1/256 units -- confirm against the consumer,
      * which is not in this file section. */
 158 const uint8_t x264_exp2_lut[64] = {
 159       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
 160      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
 161     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
 162     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 163 };
 164
     /* 128-entry float lookup table, rising from 0 towards 1.
      * NOTE(review): the values match log2(1 + i/128.) to five decimals --
      * confirm against the consumer, which is not in this file section. */
 165 const float x264_log2_lut[128] = {
 166     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
 167     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
 168     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
 169     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
 170     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
 171     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
 172     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
 173     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
 174     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
 175     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
 176     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
 177     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
 178     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
 179     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
 180     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
 181     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 182 };
 183
 184 /* Avoid an int/float conversion: entry i holds 31-i, already stored as a float. */
     /* NOTE(review): the name suggests the index is a leading-zero count -- confirm at the call site. */
 185 const float x264_log2_lz_lut[32] = {
 186     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 187 };
 188
 189 // should the intra and inter lambdas be different?
 190 // I'm just matching the behaviour of deadzone quant.
     // First index: 0 = inter, 1 = intra.  Second index: QP 0..51.
     // x264_mb_analyse_init_qp() looks this up with both the luma QP and the chroma QP.
 191 static const int x264_trellis_lambda2_tab[2][52] = {
 192     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 193     {    46,      58,      73,      92,     117,     147,
 194         185,     233,     294,     370,     466,     587,
 195         740,     932,    1174,    1480,    1864,    2349,
 196        2959,    3728,    4697,    5918,    7457,    9395,
 197       11837,   14914,   18790,   23674,   29828,   37581,
 198       47349,   59656,   75163,   94699,  119313,  150326,
 199      189399,  238627,  300652,  378798,  477255,  601304,
 200      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 201     3030384, 3818045, 4810435, 6060769 },
 202     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 203     {    27,      34,      43,      54,      68,      86,
 204         108,     136,     172,     216,     273,     343,
 205         433,     545,     687,     865,    1090,    1374,
 206        1731,    2180,    2747,    3461,    4361,    5494,
 207        6922,    8721,   10988,   13844,   17442,   21976,
 208       27688,   34885,   43953,   55377,   69771,   87906,
 209      110755,  139543,  175813,  221511,  279087,  351627,
 210      443023,  558174,  703255,  886046, 1116348, 1406511,
 211     1772093, 2232697, 2813022, 3544186 }
 212 };
 213
     /* Chroma lambda2 scale, indexed by (luma QP - chroma QP + 12); see
      * x264_mb_analyse_init_qp().  Index 12 (equal QPs) is 256, the same value
      * that function uses when psy is disabled.  The entries follow
      * 256 * 2^((index-12)/3); the last one is 65535 because 65536 does not fit
      * in a uint16_t. */
 214 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 215        16,    20,    25,    32,    40,    50,
 216        64,    80,   101,   128,   161,   203,
 217       256,   322,   406,   512,   645,   812,
 218      1024,  1290,  1625,  2048,  2580,  3250,
 219      4096,  5160,  6501,  8192, 10321, 13003,
 220     16384, 20642, 26007, 32768, 41285, 52015,
 221     65535
 222 };
 223
 224 /* TODO: calculate CABAC costs */
     /* Indexed by macroblock type.  In B slices the intra analysis adds
      * lambda * i_mb_b_cost_table[type] to the I_16x16 and I_8x8 costs as the
      * mb type prefix cost. */
 225 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
 226     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 227 };
     /* NOTE(review): presumably the same kind of cost for B 16x8/8x16 types;
      * no use is visible in this section -- confirm the index convention. */
 228 static const uint8_t i_mb_b16x8_cost_table[17] = {
 229     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 230 };
     /* NOTE(review): presumably costs per B sub-macroblock type -- confirm at the call sites. */
 231 static const uint8_t i_sub_mb_b_cost_table[13] = {
 232     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 233 };
     /* NOTE(review): presumably costs per P sub-macroblock type -- confirm at the call sites. */
 234 static const uint8_t i_sub_mb_p_cost_table[4] = {
 235     5, 3, 3, 1
 236 };
 237
     /* Forward declaration; the definition is not in this part of the file. */
 238 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 239
     /* Reference-index costs, indexed [lambda][clip(active refs - 1, 0, 2)][ref].
      * The table is file-global; x264_analyse_init_costs() fills one lambda
      * slice at a time while holding cost_ref_mutex. */
 240 static uint16_t x264_cost_ref[92][3][33];
 241 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 242
 243 int x264_analyse_init_costs( x264_t *h, int qp )
 244 {
 245     int lambda = x264_lambda_tab[qp];
 246     if( h->cost_mv[lambda] )
 247         return 0;
 248     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 249     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 250     h->cost_mv[lambda] += 2*4*2048;
 251     for( int i = 0; i <= 2*4*2048; i++ )
 252     {
 253         h->cost_mv[lambda][-i] =
 254         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 255     }
 256     x264_pthread_mutex_lock( &cost_ref_mutex );
 257     for( int i = 0; i < 3; i++ )
 258         for( int j = 0; j < 33; j++ )
 259             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 260     x264_pthread_mutex_unlock( &cost_ref_mutex );
 261     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 262     {
 263         for( int j = 0; j < 4; j++ )
 264         {
 265             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 266             h->cost_mv_fpel[lambda][j] += 2*2048;
 267             for( int i = -2*2048; i < 2*2048; i++ )
 268                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 269         }
 270     }
 271     return 0;
 272 fail:
 273     return -1;
 274 }
 275
 276 void x264_analyse_free_costs( x264_t *h )
 277 {
 278     for( int i = 0; i < 92; i++ )
 279     {
 280         if( h->cost_mv[i] )
 281             x264_free( h->cost_mv[i] - 2*4*2048 );
 282         if( h->cost_mv_fpel[i][0] )
 283             for( int j = 0; j < 4; j++ )
 284                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 285     }
 286 }
 287
 288 void x264_analyse_weight_frame( x264_t *h, int end )
 289 {
 290     for( int j = 0; j < h->i_ref0; j++ )
 291     {
 292         if( h->sh.weight[j][0].weightfn )
 293         {
 294             x264_frame_t *frame = h->fref0[j];
 295             int width = frame->i_width[0] + 2*PADH;
 296             int i_padv = PADV << h->param.b_interlaced;
 297             int offset, height;
 298             pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 299             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 300             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 301             h->fenc->i_lines_weighted += height;
 302             if( height )
 303                 for( int k = j; k < h->i_ref0; k++ )
 304                     if( h->sh.weight[k][0].weightfn )
 305                     {
 306                         pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 307                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 308                                                  src + offset, frame->i_stride[0],
 309                                                  width, height, &h->sh.weight[k][0] );
 310                     }
 311             break;
 312         }
 313     }
 314 }
 315
 316 /* initialize an array of lambda*nbits for all possible mvs */
 317 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 318 {
 319     a->p_cost_mv = h->cost_mv[a->i_lambda];
 320     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 321     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 322 }
 323
 324 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 325 {
 326     /* conduct the analysis using this lamda and QP */
 327     a->i_qp = h->mb.i_qp = i_qp;
 328     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 329
 330     a->i_lambda = x264_lambda_tab[i_qp];
 331     a->i_lambda2 = x264_lambda2_tab[i_qp];
 332
 333     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 334     if( h->param.analyse.i_trellis )
 335     {
 336         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 337         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 338         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 339         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 340     }
 341     h->mb.i_psy_rd_lambda = a->i_lambda;
 342     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 343     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 344 }
 345
 346 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 347 {
 348     int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
 349
 350     /* mbrd == 1 -> RD mode decision */
 351     /* mbrd == 2 -> RD refinement */
 352     /* mbrd == 3 -> QPRD */
 353     a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
 354
 355     x264_mb_analyse_init_qp( h, a, i_qp );
 356
 357     h->mb.b_transform_8x8 = 0;
 358     h->mb.b_noise_reduction = 0;
 359
 360     /* I: Intra part */
 361     a->i_satd_i16x16 =
 362     a->i_satd_i8x8   =
 363     a->i_satd_i4x4   =
 364     a->i_satd_i8x8chroma = COST_MAX;
 365
 366     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
 367     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
 368
 369     a->b_fast_intra = 0;
 370     h->mb.i_skip_intra =
 371         h->mb.b_lossless ? 0 :
 372         a->i_mbrd ? 2 :
 373         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
 374
 375     /* II: Inter part P/B frame */
 376     if( h->sh.i_type != SLICE_TYPE_I )
 377     {
 378         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
 379         // limit motion search to a slightly smaller range than the theoretical limit,
 380         // since the search may go a few iterations past its given range
 381         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 382
 383         /* Calculate max allowed MV range */
 384 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
 385         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
 386         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
 387         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
 388         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
 389         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
 390         {
 391             int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
 392             int max_mv = max_x - 4*16*h->mb.i_mb_x;
 393             /* If we're left of the refresh bar, don't reference right of it. */
 394             if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
 395                 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
 396         }
 397         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
 398         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
 399         if( h->mb.i_mb_x == 0 )
 400         {
 401             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
 402             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
 403             int thread_mvy_range = i_fmv_range;
 404
 405             if( h->i_thread_frames > 1 )
 406             {
 407                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
 408                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
 409                 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
 410                 {
 411                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
 412                     int i_ref = i ? h->i_ref1 : h->i_ref0;
 413                     for( int j = 0; j < i_ref; j++ )
 414                     {
 415                         x264_frame_cond_wait( fref[j]->orig, thresh );
 416                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
 417                     }
 418                 }
 419
 420                 if( h->param.b_deterministic )
 421                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
 422                 if( h->mb.b_interlaced )
 423                     thread_mvy_range >>= 1;
 424
 425                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
 426             }
 427
 428             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
 429             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
 430             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
 431             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
 432             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
 433             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
 434             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
 435         }
 436 #undef CLIP_FMV
 437
 438         a->l0.me16x16.cost =
 439         a->l0.i_rd16x16    =
 440         a->l0.i_cost8x8    =
 441         a->l0.i_cost16x8   =
 442         a->l0.i_cost8x16   = COST_MAX;
 443         if( h->sh.i_type == SLICE_TYPE_B )
 444         {
 445             a->l1.me16x16.cost =
 446             a->l1.i_rd16x16    =
 447             a->l1.i_cost8x8    =
 448             a->i_cost8x8direct[0] =
 449             a->i_cost8x8direct[1] =
 450             a->i_cost8x8direct[2] =
 451             a->i_cost8x8direct[3] =
 452             a->l1.i_cost16x8   =
 453             a->l1.i_cost8x16   =
 454             a->i_rd16x16bi     =
 455             a->i_rd16x16direct =
 456             a->i_rd8x8bi       =
 457             a->i_rd16x8bi      =
 458             a->i_rd8x16bi      =
 459             a->i_cost16x16bi   =
 460             a->i_cost16x16direct =
 461             a->i_cost8x8bi     =
 462             a->i_cost16x8bi    =
 463             a->i_cost8x16bi    = COST_MAX;
 464         }
 465         else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
 466             for( int i = 0; i < 4; i++ )
 467             {
 468                 a->l0.i_cost4x4[i] =
 469                 a->l0.i_cost8x4[i] =
 470                 a->l0.i_cost4x8[i] = COST_MAX;
 471             }
 472
 473         /* Fast intra decision */
 474         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 475         {
 476             /* Always run in fast-intra mode for subme < 3 */
 477             if( h->mb.i_subpel_refine > 2 &&
 478               ( IS_INTRA( h->mb.i_mb_type_left ) ||
 479                 IS_INTRA( h->mb.i_mb_type_top ) ||
 480                 IS_INTRA( h->mb.i_mb_type_topleft ) ||
 481                 IS_INTRA( h->mb.i_mb_type_topright ) ||
 482                 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) ||
 483                 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
 484             { /* intra is likely */ }
 485             else
 486             {
 487                 a->b_fast_intra = 1;
 488             }
 489         }
 490         h->mb.b_skip_mc = 0;
 491         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
 492             h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
 493         {
 494             a->b_force_intra = 1;
 495             a->b_fast_intra = 0;
 496         }
 497         else
 498             a->b_force_intra = 0;
 499     }
 500 }
 501
 502 /* Prediction modes allowed for various combinations of neighbors. */
 503 /* Terminated by a -1. */
 504 /* In order, no neighbors, left, top, top/left, top/left/topleft */
     /* The row is picked by the predict_*_mode_available() helpers: row 4 when
      * MB_TOPLEFT is set, otherwise the neighbour mask restricted to
      * MB_TOP|MB_LEFT.  NOTE(review): that mapping relies on MB_LEFT == 1 and
      * MB_TOP == 2, which are defined outside this file section -- confirm. */
 505 static const int8_t i16x16_mode_available[5][5] =
 506 {
 507     {I_PRED_16x16_DC_128, -1, -1, -1, -1},
 508     {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
 509     {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
 510     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
 511     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
 512 };
 513
     /* Same layout as above, for 8x8 chroma prediction. */
 514 static const int8_t i8x8chroma_mode_available[5][5] =
 515 {
 516     {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
 517     {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
 518     {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
 519     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
 520     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
 521 };
 522
     /* Same layout, for 4x4 prediction; the intra analysis also uses this
      * table for 8x8 luma blocks. */
 523 static const int8_t i4x4_mode_available[5][10] =
 524 {
 525     {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
 526     {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
 527     {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
 528     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
 529     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
 530 };
 531
 532 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
 533 {
 534     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 535     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 536 }
 537
 538 static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 539 {
 540     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 541     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 542 }
 543
 544 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
 545 {
 546     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 547     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 548 }
 549
 550 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 551 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 552 {
 553     ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
 554
 555     if( do_both_dct || h->mb.b_transform_8x8 )
 556         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 557     if( do_both_dct || !h->mb.b_transform_8x8 )
 558         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 559 }
 560
 561 /* Reset fenc satd scores cache for psy RD */
 562 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 563 {
 564     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 565         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 566     if( !h->mb.i_psy_rd )
 567         return;
 568     /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
 569     h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
 570     if( b_satd )
 571         h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 572 }
 573
 574 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 575 {
 576     if( a->i_satd_i8x8chroma < COST_MAX )
 577         return;
 578
 579     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 580
 581     /* 8x8 prediction selection for chroma */
 582     if( predict_mode[3] >= 0 && !h->mb.b_lossless )
 583     {
 584         int satdu[4], satdv[4];
 585         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 586         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 587         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 588         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 589         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 590         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 591
 592         for( ; *predict_mode >= 0; predict_mode++ )
 593         {
 594             int i_mode = *predict_mode;
 595             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 596
 597             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 598             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 599         }
 600     }
 601     else
 602     {
 603         for( ; *predict_mode >= 0; predict_mode++ )
 604         {
 605             int i_satd;
 606             int i_mode = *predict_mode;
 607
 608             /* we do the prediction */
 609             if( h->mb.b_lossless )
 610                 x264_predict_lossless_8x8_chroma( h, i_mode );
 611             else
 612             {
 613                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 614                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 615             }
 616
 617             /* we calculate the cost */
 618             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 619                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 620                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 621
 622             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 623             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 624         }
 625     }
 626
 627     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 628 }
 629
 630 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 631 {
 632     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 633     pixel *p_src = h->mb.pic.p_fenc[0];
 634     pixel *p_dst = h->mb.pic.p_fdec[0];
 635     static const int8_t intra_analysis_shortcut[2][2][5] =
 636     {{{I_PRED_4x4_HU, -1},
 637       {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1}},
 638      {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1},
 639       {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}};
 640
 641     int idx;
 642     int lambda = a->i_lambda;
 643
 644     /*---------------- Try all mode and calculate their score ---------------*/
 645
 646     /* 16x16 prediction selection */
 647     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 648
 649     /* Not heavily tuned */
 650     static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
 651     int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
 652
 653     if( !h->mb.b_lossless && predict_mode[3] >= 0 )
 654     {
 655         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 656         a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
 657         a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
 658         a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
 659         COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
 660         COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
 661         COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
 662
 663         /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
 664         if( a->i_satd_i16x16 <= i16x16_thresh )
 665         {
 666             h->predict_16x16[I_PRED_16x16_P]( p_dst );
 667             a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 668             a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
 669             COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
 670         }
 671     }
 672     else
 673     {
 674         for( ; *predict_mode >= 0; predict_mode++ )
 675         {
 676             int i_satd;
 677             int i_mode = *predict_mode;
 678
 679             if( h->mb.b_lossless )
 680                 x264_predict_lossless_16x16( h, i_mode );
 681             else
 682                 h->predict_16x16[i_mode]( p_dst );
 683
 684             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 685                      lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 686             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 687             a->i_satd_i16x16_dir[i_mode] = i_satd;
 688         }
 689     }
 690
 691     if( h->sh.i_type == SLICE_TYPE_B )
 692         /* cavlc mb type prefix */
 693         a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
 694
 695     if( a->i_satd_i16x16 > i16x16_thresh )
 696         return;
 697
 698     /* 8x8 prediction selection */
 699     if( flags & X264_ANALYSE_I8x8 )
 700     {
 701         ALIGNED_ARRAY_16( pixel, edge,[33] );
 702         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 703         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 704
 705         // FIXME some bias like in i4x4?
 706         int i_cost = lambda * 4; /* base predmode costs */
 707         h->mb.i_cbp_luma = 0;
 708
 709         if( h->sh.i_type == SLICE_TYPE_B )
 710             i_cost += lambda * i_mb_b_cost_table[I_8x8];
 711
 712         for( idx = 0;; idx++ )
 713         {
 714             int x = idx&1;
 715             int y = idx>>1;
 716             pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 717             pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 718             int i_best = COST_MAX;
 719             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 720
 721             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 722             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 723
 724             if( !h->mb.b_lossless && predict_mode[5] >= 0 )
 725             {
 726                 int satd[9];
 727                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 728                 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
 729                 satd[i_pred_mode] -= 3 * lambda;
 730                 for( int i = 2; i >= 0; i-- )
 731                 {
 732                     int cost = satd[i];
 733                     a->i_satd_i8x8_dir[i][idx] = cost + 4 * lambda;
 734                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 735                 }
 736
 737                 /* Take analysis shortcuts: don't analyse modes that are too
 738                  * far away direction-wise from the favored mode. */
 739                 if( a->i_mbrd < 1 + a->b_fast_intra )
 740                     predict_mode = intra_analysis_shortcut[predict_mode[8] >= 0][favor_vertical];
 741                 else
 742                     predict_mode += 3;
 743             }
 744
 745             for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
 746             {
 747                 int i_satd;
 748                 int i_mode = *predict_mode;
 749
 750                 if( h->mb.b_lossless )
 751                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 752                 else
 753                     h->predict_8x8[i_mode]( p_dst_by, edge );
 754
 755                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 756                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 757                     i_satd -= 3 * lambda;
 758
 759                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 760                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * lambda;
 761             }
 762             i_cost += i_best + 3 * lambda;
 763
 764             if( idx == 3 || i_cost > i_satd_thresh )
 765                 break;
 766
 767             /* we need to encode this block now (for next ones) */
 768             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 769             x264_mb_encode_i8x8( h, idx, a->i_qp );
 770
 771             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 772         }
 773
 774         if( idx == 3 )
 775         {
 776             a->i_satd_i8x8 = i_cost;
 777             if( h->mb.i_skip_intra )
 778             {
 779                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 780                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 781                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 782                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 783                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 784                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 785                 if( h->mb.i_skip_intra == 2 )
 786                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 787             }
 788         }
 789         else
 790         {
 791             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 792             a->i_satd_i8x8 = COST_MAX;
 793             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 794         }
 795         /* Not heavily tuned */
 796         static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
 797         if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
 798             return;
 799     }
 800
 801     /* 4x4 prediction selection */
 802     if( flags & X264_ANALYSE_I4x4 )
 803     {
 804         int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
 805         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 806         h->mb.i_cbp_luma = 0;
 807
 808         if( a->i_mbrd )
 809             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 810
 811         if( h->sh.i_type == SLICE_TYPE_B )
 812             i_cost += lambda * i_mb_b_cost_table[I_4x4];
 813
 814         for( idx = 0;; idx++ )
 815         {
 816             pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
 817             pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 818             int i_best = COST_MAX;
 819             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 820
 821             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 822
 823             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 824                 /* emulate missing topright samples */
 825                 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
 826
 827             if( !h->mb.b_lossless && predict_mode[5] >= 0 )
 828             {
 829                 int satd[9];
 830                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 831                 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
 832                 satd[i_pred_mode] -= 3 * lambda;
 833                 for( int i = 2; i >= 0; i-- )
 834                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 835
 836                 /* Take analysis shortcuts: don't analyse modes that are too
 837                  * far away direction-wise from the favored mode. */
 838                 if( a->i_mbrd < 1 + a->b_fast_intra )
 839                     predict_mode = intra_analysis_shortcut[predict_mode[8] >= 0][favor_vertical];
 840                 else
 841                     predict_mode += 3;
 842             }
 843
 844             if( i_best > 0 )
 845             {
 846                 for( ; *predict_mode >= 0; predict_mode++ )
 847                 {
 848                     int i_satd;
 849                     int i_mode = *predict_mode;
 850
 851                     if( h->mb.b_lossless )
 852                         x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 853                     else
 854                         h->predict_4x4[i_mode]( p_dst_by );
 855
 856                     i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 857                     if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 858                     {
 859                         i_satd -= lambda * 3;
 860                         if( i_satd <= 0 )
 861                         {
 862                             i_best = i_satd;
 863                             a->i_predict4x4[idx] = i_mode;
 864                             break;
 865                         }
 866                     }
 867
 868                     COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 869                 }
 870             }
 871             i_cost += i_best + 3 * lambda;
 872
 873             if( i_cost > i_satd_thresh || idx == 15 )
 874                 break;
 875
 876             /* we need to encode this block now (for next ones) */
 877             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 878             x264_mb_encode_i4x4( h, idx, a->i_qp );
 879
 880             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 881         }
 882         if( idx == 15 )
 883         {
 884             a->i_satd_i4x4 = i_cost;
 885             if( h->mb.i_skip_intra )
 886             {
 887                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 888                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 889                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 890                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 891                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 892                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 893                 if( h->mb.i_skip_intra == 2 )
 894                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 895             }
 896         }
 897         else
 898             a->i_satd_i4x4 = COST_MAX;
 899     }
 900 }
 901
 /* x264_intra_rd: replace the intra costs stored in `a` with the values
  * returned by x264_rd_cost_mb().
  *
  *   h             - encoder context; h->mb.i_type is overwritten with each
  *                   intra type that gets evaluated.
  *   a             - analysis state; i_satd_i16x16, i_satd_i4x4 and i_satd_i8x8
  *                   are read as the current costs and then overwritten.
  *   i_satd_thresh - a candidate whose stored cost exceeds this value is not
  *                   evaluated; its cost is set to COST_MAX instead.
  *
  * The candidates are always processed in the order I_16x16, I_4x4, I_8x8. */
 902 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 903 {
         /* I_16x16: only the threshold is checked. */
 904     if( a->i_satd_i16x16 <= i_satd_thresh )
 905     {
 906         h->mb.i_type = I_16x16;
 907         x264_analyse_update_cache( h, a );
 908         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 909     }
 910     else
 911         a->i_satd_i16x16 = COST_MAX;
 912
         /* I_4x4: the extra `< COST_MAX` test keeps a candidate that is already
          * marked COST_MAX out of the RD pass even if i_satd_thresh is
          * COST_MAX or larger. */
 913     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 914     {
 915         h->mb.i_type = I_4x4;
 916         x264_analyse_update_cache( h, a );
 917         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 918     }
 919     else
 920         a->i_satd_i4x4 = COST_MAX;
 921
         /* I_8x8: same guard as I_4x4. */
 922     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 923     {
 924         h->mb.i_type = I_8x8;
 925         x264_analyse_update_cache( h, a );
 926         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
             /* Keep h->mb.i_cbp_luma as it stands after the I_8x8 evaluation;
              * x264_intra_rd_refine() reloads it before each 8x8 trial. */
 927         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 928     }
 929     else
 930         a->i_satd_i8x8 = COST_MAX;
 931 }
 932
 /* x264_intra_rd_refine: re-pick intra prediction modes for the already chosen
  * macroblock type (h->mb.i_type) by comparing RD costs instead of the costs
  * stored during analysis.
  *
  * Steps, in order:
  *   1. I_16x16 only: try the other available 16x16 modes whose stored cost is
  *      within 9/8 of the current mode's.
  *   2. Chroma (any type): try the other chroma modes whose stored cost is
  *      below 5/4 of a->i_satd_i8x8chroma.
  *   3. I_4x4: for each of the 16 blocks, try every available 4x4 mode.
  *      I_8x8: for each of the 4 blocks, try the modes whose stored cost is
  *      within 11/8 of the current mode's.
  *
  * Winners are written to a->i_predict16x16, a->i_predict8x8chroma,
  * a->i_predict4x4[] / a->i_predict8x8[] and to the matching prediction-mode
  * caches in h->mb.  h->mb.i_skip_intra is cleared on entry. */
 933 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 934 {
 935     pixel *p_dst = h->mb.pic.p_fdec[0];
 936     uint64_t i_satd, i_best;
 937     h->mb.i_skip_intra = 0;
 938
 939     if( h->mb.i_type == I_16x16 )
 940     {
 941         int old_pred_mode = a->i_predict16x16;
 942         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
             /* Only modes whose stored cost is at most 9/8 of the current mode's are retried. */
 943         int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
             /* The current mode's cost is the value to beat. */
 944         i_best = a->i_satd_i16x16;
             /* The mode list ends at the first negative entry. */
 945         for( ; *predict_mode >= 0; predict_mode++ )
 946         {
 947             int i_mode = *predict_mode;
 948             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 949                 continue;
                 /* h->mb.i_intra16x16_pred_mode is not restored in this function:
                  * it stays at the last mode tried, while a->i_predict16x16
                  * tracks the cheapest one. */
 950             h->mb.i_intra16x16_pred_mode = i_mode;
 951             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 952             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 953         }
 954     }
 955
 956     /* RD selection for chroma prediction */
 957     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
         /* Nothing to choose from unless at least two chroma modes are available. */
 958     if( predict_mode[1] >= 0 )
 959     {
 960         int8_t predict_mode_sorted[4];
 961         int i_max;
 962         int i_thresh = a->i_satd_i8x8chroma * 5/4;
 963
             /* Collect the candidates: every available mode other than the
              * current one whose stored cost is below the threshold.  Despite
              * the array's name, entries are kept in list order; nothing is
              * sorted here. */
 964         for( i_max = 0; *predict_mode >= 0; predict_mode++ )
 965         {
 966             int i_mode = *predict_mode;
 967             if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
 968                 predict_mode_sorted[i_max++] = i_mode;
 969         }
 970
 971         if( i_max > 0 )
 972         {
 973             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
 974             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
 975             /* the previous thing encoded was x264_intra_rd(), so the pixels and
 976              * coefs for the current chroma mode are still around, so we only
 977              * have to recount the bits. */
 978             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
 979             for( int i = 0; i < i_max; i++ )
 980             {
 981                 int i_mode = predict_mode_sorted[i];
 982                 if( h->mb.b_lossless )
 983                     x264_predict_lossless_8x8_chroma( h, i_mode );
 984                 else
 985                 {
                         /* Same mode for both chroma planes. */
 986                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 987                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 988                 }
 989                 /* if we've already found a mode that needs no residual, then
 990                  * probably any mode with a residual will be worse.
 991                  * so avoid dct on the remaining modes to improve speed. */
 992                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                     /* Track the cheapest cost together with its mode and the
                      * chroma CBP left in h->mb by that evaluation. */
 993                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
 994             }
                 /* Publish the winner; the trials above may have left
                  * h->mb.i_cbp_chroma at a losing mode's value. */
 995             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 996             h->mb.i_cbp_chroma = i_cbp_chroma_best;
 997         }
 998     }
 999
1000     if( h->mb.i_type == I_4x4 )
1001     {
1002         pixel4 pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1003         int i_nnz = 0;
1004         for( int idx = 0; idx < 16; idx++ )
1005         {
1006             pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1007             i_best = COST_MAX64;
1008
1009             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1010
1011             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1012                 /* emulate missing topright samples */
1013                 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1014
                 /* Every available mode is tried; there is no cost threshold
                  * in the 4x4 case. */
1015             for( ; *predict_mode >= 0; predict_mode++ )
1016             {
1017                 int i_mode = *predict_mode;
1018                 if( h->mb.b_lossless )
1019                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1020                 else
1021                     h->predict_4x4[i_mode]( p_dst_by );
1022                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1023
1024                 if( i_best > i_satd )
1025                 {
1026                     a->i_predict4x4[idx] = i_mode;
1027                     i_best = i_satd;
                         /* Snapshot the four rows of the block and its nnz
                          * entry as they stand after this trial, so they can be
                          * put back once all modes have been tried.
                          * NOTE(review): presumably x264_rd_cost_i4x4() leaves
                          * the reconstruction in p_dst_by - confirm. */
1028                     pels[0] = MPIXEL_X4( p_dst_by+0*FDEC_STRIDE );
1029                     pels[1] = MPIXEL_X4( p_dst_by+1*FDEC_STRIDE );
1030                     pels[2] = MPIXEL_X4( p_dst_by+2*FDEC_STRIDE );
1031                     pels[3] = MPIXEL_X4( p_dst_by+3*FDEC_STRIDE );
1032                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1033                 }
1034             }
1035
                 /* Restore the winning mode's pixels and nnz; later trials may
                  * have overwritten them. */
1036             MPIXEL_X4( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1037             MPIXEL_X4( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1038             MPIXEL_X4( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1039             MPIXEL_X4( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1040             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1041
                 /* Record the chosen mode in the prediction-mode cache. */
1042             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1043         }
1044     }
1045     else if( h->mb.i_type == I_8x8 )
1046     {
1047         ALIGNED_ARRAY_16( pixel, edge,[33] );
1048         for( int idx = 0; idx < 4; idx++ )
1049         {
1050             pixel4 pels_h[2] = {0};
1051             pixel pels_v[7] = {0};
1052             uint16_t i_nnz[2] = {0}; //shut up gcc
1053             pixel *p_dst_by;
1054             int cbp_luma_new = 0;
                 /* Only modes whose stored cost is at most 11/8 of the current
                  * mode's are retried. */
1055             int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1056
1057             i_best = COST_MAX64;
                 /* idx 0..3 -> (x,y) = (0,0), (1,0), (0,1), (1,1) */
1058             int x = idx&1;
1059             int y = idx>>1;
                 /* Index of this block's first entry in the non_zero_count cache. */
1060             int s8 = X264_SCAN8_0 + 2*x + 16*y;
1061
1062             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1063             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
                 /* Build the neighbour edge buffer once; every mode below
                  * predicts from the same `edge`. */
1064             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1065
1066             for( ; *predict_mode >= 0; predict_mode++ )
1067             {
1068                 int i_mode = *predict_mode;
1069                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1070                     continue;
1071
1072                 if( h->mb.b_lossless )
1073                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1074                 else
1075                     h->predict_8x8[i_mode]( p_dst_by, edge );
                     /* Every trial starts from the same luma CBP. */
1076                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1077                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1078
1079                 if( i_best > i_satd )
1080                 {
1081                     a->i_predict8x8[idx] = i_mode;
1082                     cbp_luma_new = h->mb.i_cbp_luma;
1083                     i_best = i_satd;
1084
                         /* Snapshot only the block's bottom row (row 7, as two
                          * 4-pixel groups) and, for the left-hand blocks (even
                          * idx), rows 0..6 of its right-most column.  The rest
                          * of the block is not saved or restored here.
                          * NOTE(review): presumably these are the only samples
                          * later blocks read as neighbours - confirm. */
1085                     pels_h[0] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 );
1086                     pels_h[1] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 );
1087                     if( !(idx&1) )
1088                         for( int j = 0; j < 7; j++ )
1089                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                         /* nnz entries of the block, read from two cache rows
                          * (s8 and s8+8). */
1090                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
1091                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
1092                 }
1093             }
                 /* Carry the winner's luma CBP forward to the next block, then
                  * restore its saved edge pixels and nnz entries. */
1094             a->i_cbp_i8x8_luma = cbp_luma_new;
1095             MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 ) = pels_h[0];
1096             MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 ) = pels_h[1];
1097             if( !(idx&1) )
1098                 for( int j = 0; j < 7; j++ )
1099                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1100             M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
1101             M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
1102
1103             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1104         }
1105     }
1106 }
1107
/* LOAD_FENC( m, src, xoff, yoff ): fill the source-side fields of the motion
 * search state `m` (callers pass an x264_me_t pointer).
 *   - p_cost_mv is copied from `a`, and i_stride[0..1] from h->mb.pic, so
 *     both `a` and `h` must be in scope where the macro is used.
 *   - p_fenc[0] points into src[0] at (xoff,yoff); p_fenc[1] and p_fenc[2]
 *     point into src[1] and src[2] at (xoff>>1,yoff>>1).  All three use
 *     FENC_STRIDE.
 * The expansion is several plain statements, not a single one, so it must
 * not be used as the unbraced body of an if/else/loop. */
1108 #define LOAD_FENC( m, src, xoff, yoff) \
1109     (m)->p_cost_mv = a->p_cost_mv; \
1110     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1111     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1112     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1113     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1114     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1115
/* LOAD_HPELS( m, src, list, ref, xoff, yoff ): fill the reference-side fields
 * of the motion search state `m` for reference `ref` of list `list`.
 *   - (m)->i_stride[] is read, so it must already be set (LOAD_FENC does that).
 *   - p_fref[0..3] point into src[0..3] at (xoff,yoff) using i_stride[0];
 *     p_fref[4..5] point into src[4..5] at (xoff>>1,yoff>>1) using i_stride[1].
 *   - p_fref_w is set to the same address as p_fref[0] and weight to
 *     weight_none; LOAD_WPELS may override both afterwards.
 *   - integral points into h->mb.pic.p_integral[list][ref]; `h` must be in
 *     scope.
 *   - i_ref records `ref`.
 * The expansion is several plain statements, not a single one, so it must
 * not be used as the unbraced body of an if/else/loop. */
1116 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1117     (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1118     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1119     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1120     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1121     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1122     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1123     (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1124     (m)->weight = weight_none; \
1125     (m)->i_ref = ref;
1126
/* LOAD_WPELS( m, src, list, ref, xoff, yoff ): switch the motion search state
 * `m` over to the weighted reference data for reference `ref`.
 *   m          - pointer to the search state; (m)->i_stride[0] must already be
 *                set because it is used to compute the plane offset.
 *   src        - plane to point p_fref_w into (callers pass
 *                h->mb.pic.p_fref_w[ref]).
 *   list       - unused; kept so the signature parallels LOAD_HPELS.
 *   ref        - reference index; selects the weight entry h->sh.weight[ref].
 *   xoff, yoff - offset of the partition inside the plane.
 * `h` must be in scope.  The weight is indexed by the macro's own `ref`
 * argument, so the macro does not depend on the caller having a variable
 * that happens to be called `i_ref`.
 * The expansion is two plain statements, not a single one, so it must not be
 * used as the unbraced body of an if/else/loop. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[ref];
1130
/* REF_COST( list, ref ): look up the cost charged for using reference `ref`
 * of list `list` in a->p_cost_ref; `a` must be in scope.
 * NOTE(review): presumably already scaled by lambda, since callers add it
 * straight onto SATD-based costs - confirm where p_cost_ref is filled. */
1131 #define REF_COST(list, ref) \
1132     (a->p_cost_ref[list][ref])
1133
/* x264_mb_analyse_inter_p16x16: motion search for the macroblock as a single
 * 16x16 list-0 partition, over all h->mb.pic.i_fref[0] list-0 references.
 *
 * Results:
 *   - a->l0.me16x16 receives the search state with the lowest total cost
 *     (search cost plus reference cost).
 *   - h->mb.mvr[0][i_ref][h->mb.i_mb_xy] and a->l0.mvc[i_ref][0] receive the
 *     MV found for each reference.
 *   - h->mb.i_type ends up as P_L0, or as P_SKIP when either the early-out
 *     in the loop or the RD check at the end fires.
 * If the early-out fires the function returns from inside the loop, before
 * a->l0.me16x16 has been filled in. */
1134 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1135 {
1136     x264_me_t m;
1137     int i_mvc;
1138     ALIGNED_4( int16_t mvc[8][2] );
1139     int i_halfpel_thresh = INT_MAX;
         /* A threshold is only handed to the search when more than one
          * reference competes; with a single reference NULL is passed. */
1140     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1141
1142     /* 16x16 Search on all ref frame */
1143     m.i_pixel = PIXEL_16x16;
1144     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1145
1146     a->l0.me16x16.cost = INT_MAX;
1147     for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1148     {
1149         m.i_ref_cost = REF_COST( 0, i_ref );
             /* The threshold is held net of this reference's cost while the
              * search runs, and the cost is added back after the search. */
1150         i_halfpel_thresh -= m.i_ref_cost;
1151
1152         /* search with ref */
1153         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1154         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1155
1156         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1157
1158         if( h->mb.ref_blind_dupe == i_ref )
1159         {
                 /* This reference is the blind duplicate: start from the MV
                  * stored for ref 0 and only refine it instead of running a
                  * full search. */
1160             CP32( m.mv, a->l0.mvc[0][0] );
1161             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1162         }
1163         else
1164         {
1165             x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1166             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1167         }
1168
1169         /* save mv for predicting neighbors */
1170         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1171         CP32( a->l0.mvc[i_ref][0], m.mv );
1172
1173         /* early termination
1174          * SSD threshold would probably be better than SATD */
1175         if( i_ref == 0
1176             && a->b_try_skip
1177             && m.cost-m.cost_mv < 300*a->i_lambda
1178             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1179               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1180             && x264_macroblock_probe_pskip( h ) )
1181         {
1182             h->mb.i_type = P_SKIP;
1183             x264_analyse_update_cache( h, a );
1184             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1185             return;
1186         }
1187
             /* From here on m.cost includes the reference cost, so references
              * are compared on their total cost. */
1188         m.cost += m.i_ref_cost;
1189         i_halfpel_thresh += m.i_ref_cost;
1190
1191         if( m.cost < a->l0.me16x16.cost )
1192             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1193     }
1194
         /* Publish the winning reference for the whole macroblock. */
1195     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1196     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1197
1198     h->mb.i_type = P_L0;
1199     if( a->i_mbrd )
1200     {
1201         x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
             /* If the winner is ref 0 with exactly the P_SKIP MV (and intra is
              * not forced), compute the 16x16 RD cost right away; when that
              * leaves no luma or chroma CBP bits set, the macroblock is
              * turned into P_SKIP. */
1202         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1203         {
1204             h->mb.i_partition = D_16x16;
1205             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1206             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1207             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1208                 h->mb.i_type = P_SKIP;
1209         }
1210     }
1211 }
1212
/* x264_mb_analyse_inter_p8x8_mixed_ref: analyse the macroblock as four 8x8
 * list-0 partitions, letting every 8x8 block choose its own reference.
 *
 * For each block the references 0..i_maxref, plus the blind-duplicate
 * reference if it lies beyond i_maxref, are searched and the cheapest search
 * state is kept in a->l0.me8x8[i].
 *
 * Outputs: a->l0.me8x8[], a->l0.mvc[ref][1..4], a->i_satd8x8[0][],
 * a->l0.i_cost8x8, the MV/ref caches in h->mb, h->mb.i_partition (D_8x8) and
 * h->mb.i_sub_partition[] (all D_L0_8x8).
 * Reads a->l0.me16x16.i_ref and h->mb.mvr[0][..], so the 16x16 analysis must
 * have run first. */
1213 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1214 {
1215     x264_me_t m;
1216     pixel **p_fenc = h->mb.pic.p_fenc;
1217     int i_maxref = h->mb.pic.i_fref[0]-1;
1218
1219     h->mb.i_partition = D_8x8;
1220
         /* Raise i_maxref to the reference cached at scan8 offset `i`, unless
          * that reference is the blind duplicate. */
1221     #define CHECK_NEIGHBOUR(i)\
1222     {\
1223         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1224         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1225             i_maxref = ref;\
1226     }
1227
1228     /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1229      * than those used by the neighbors */
1230     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1231         h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1232     {
1233         i_maxref = 0;
1234         CHECK_NEIGHBOUR(  -8 - 1 );
1235         CHECK_NEIGHBOUR(  -8 + 0 );
1236         CHECK_NEIGHBOUR(  -8 + 2 );
1237         CHECK_NEIGHBOUR(  -8 + 4 );
1238         CHECK_NEIGHBOUR(   0 - 1 );
1239         CHECK_NEIGHBOUR( 2*8 - 1 );
1240     }
1241     #undef CHECK_NEIGHBOUR
1242
         /* Seed entry 0 of each per-reference candidate list with the MV that
          * is stored for that reference in h->mb.mvr. */
1243     for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1244         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1245
1246     for( int i = 0; i < 4; i++ )
1247     {
1248         x264_me_t *l0m = &a->l0.me8x8[i];
             /* i 0..3 -> (x8,y8) = (0,0), (1,0), (0,1), (1,1) */
1249         int x8 = i&1;
1250         int y8 = i>>1;
1251
1252         m.i_pixel = PIXEL_8x8;
1253
1254         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1255         l0m->cost = INT_MAX;
             /* i_ref is advanced at the bottom of the loop body: it walks
              * 0..i_maxref and then jumps straight to the blind-duplicate
              * reference when that one lies beyond i_maxref. */
1256         for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1257         {
1258             m.i_ref_cost = REF_COST( 0, i_ref );
1259
1260             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1261             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1262
                 /* The ref cache is set to the reference under test before the
                  * MV predictor is computed. */
1263             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1264             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1265             if( h->mb.ref_blind_dupe == i_ref )
1266             {
                     /* Blind duplicate: reuse the MV found for this block on
                      * ref 0 and only refine it. */
1267                 CP32( m.mv, a->l0.mvc[0][i+1] );
1268                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1269             }
1270             else
                     /* Candidates: entries 0..i of this reference's list. */
1271                 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1272
1273             m.cost += m.i_ref_cost;
1274
                 /* Store this block's MV as candidate i+1 for this reference. */
1275             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1276
1277             if( m.cost < l0m->cost )
1278                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1279             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1280                 i_ref = h->mb.ref_blind_dupe;
1281             else
1282                 i_ref++;
1283         }
             /* Commit the winner's MV and reference to the caches. */
1284         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1285         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1286
             /* Cost of the block without its MV and reference costs. */
1287         a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1288
1289         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1290            are effectively zero. */
1291         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1292             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1293     }
1294
1295     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1296                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1297     /* P_8x8 ref0 has no ref cost */
1298     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1299                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1300         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1301     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1302     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1303 }
1304
/* x264_mb_analyse_inter_p8x8: analyse the macroblock as four 8x8 list-0
 * partitions that all use one reference: the one chosen by the 16x16 search,
 * or ref 0 when that choice was the blind-duplicate reference.
 *
 * Outputs: a->l0.me8x8[], a->l0.mvc[i_ref][0..4], a->i_satd8x8[0][],
 * a->l0.i_cost8x8, the MV cache in h->mb, h->mb.i_partition (D_8x8) and
 * h->mb.i_sub_partition[] (all D_L0_8x8).
 * Reads a->l0.me16x16, so the 16x16 analysis must have run first. */
1305 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1306 {
1307     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1308      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1309      * don't bother analysing the dupes. */
1310     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
         /* Without CABAC, ref 0 is charged no reference cost. */
1311     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1312     pixel **p_fenc = h->mb.pic.p_fenc;
1313     int i_mvc;
         /* mvc aliases this reference's candidate list inside `a`, so the
          * writes below update a->l0.mvc[i_ref] directly. */
1314     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1315
1316     /* XXX Needed for x264_mb_predict_mv */
1317     h->mb.i_partition = D_8x8;
1318
         /* The candidate list starts with just the 16x16 MV. */
1319     i_mvc = 1;
1320     CP32( mvc[0], a->l0.me16x16.mv );
1321
1322     for( int i = 0; i < 4; i++ )
1323     {
             /* The search state lives directly in `a`; no temporary copy. */
1324         x264_me_t *m = &a->l0.me8x8[i];
             /* i 0..3 -> (x8,y8) = (0,0), (1,0), (0,1), (1,1) */
1325         int x8 = i&1;
1326         int y8 = i>>1;
1327
1328         m->i_pixel = PIXEL_8x8;
1329         m->i_ref_cost = i_ref_cost;
1330
1331         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1332         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1333         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1334
1335         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1336         x264_me_search( h, m, mvc, i_mvc );
1337
1338         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1339
             /* Append this block's MV so the following blocks can use it as
              * a search candidate. */
1340         CP32( mvc[i_mvc], m->mv );
1341         i_mvc++;
1342
             /* Taken before the reference cost is added below, so only the
              * MV cost needs to be removed. */
1343         a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1344
1345         /* mb type cost */
1346         m->cost += i_ref_cost;
1347         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1348             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1349     }
1350
1351     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1352                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1353     /* theoretically this should include 4*ref_cost,
1354      * but 3 seems a better approximation of cabac. */
1355     if( h->param.b_cabac )
1356         a->l0.i_cost8x8 -= i_ref_cost;
1357     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1358     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1359 }
1360
/* x264_mb_analyse_inter_p16x8: analyse the macroblock as two 16x8 list-0
 * partitions (i = 0 at y offset 0, i = 1 at y offset 8).
 *
 *   h           - encoder context; MV/ref caches and h->mb.i_partition
 *                 (D_16x8) are updated.
 *   a           - analysis state; reads a->l0.me8x8[] and a->l0.mvc[][], so
 *                 one of the 8x8 analyses must have run first.  Writes
 *                 a->l0.me16x8[] and a->l0.i_cost16x8.
 *   i_best_satd - best cost found so far, used for the early-out below.
 *
 * Each partition only tries the references picked by the two 8x8 blocks it
 * covers (blocks 2*i and 2*i+1).  If the first partition already looks too
 * expensive, a->l0.i_cost16x8 is set to COST_MAX and the second partition is
 * not searched. */
1361 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1362 {
1363     x264_me_t m;
1364     pixel **p_fenc = h->mb.pic.p_fenc;
1365     ALIGNED_4( int16_t mvc[3][2] );
1366
1367     /* XXX Needed for x264_mb_predict_mv */
1368     h->mb.i_partition = D_16x8;
1369
1370     for( int i = 0; i < 2; i++ )
1371     {
1372         x264_me_t *l0m = &a->l0.me16x8[i];
             /* Candidate references, lowest index first; a single candidate
              * when both 8x8 blocks chose the same reference. */
1373         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1374         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1375         const int ref8[2] = { minref, maxref };
1376         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1377
1378         m.i_pixel = PIXEL_16x8;
1379
1380         LOAD_FENC( &m, p_fenc, 0, 8*i );
1381         l0m->cost = INT_MAX;
1382         for( int j = 0; j < i_ref8s; j++ )
1383         {
1384             const int i_ref = ref8[j];
1385             m.i_ref_cost = REF_COST( 0, i_ref );
1386
1387             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
                 /* Search candidates for this reference: entry 0 of its list
                  * plus the entries of the two 8x8 blocks in this partition. */
1388             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1389             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1390             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1391
1392             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1393             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1394
                 /* The ref cache is set to the reference under test before the
                  * MV predictor is computed. */
1395             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1396             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1397             /* We can only take this shortcut if the first search was performed on ref0. */
1398             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1399             {
1400                 /* We can just leave the MV from the previous ref search. */
1401                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1402             }
1403             else
1404                 x264_me_search( h, &m, mvc, 3 );
1405
1406             m.cost += m.i_ref_cost;
1407
1408             if( m.cost < l0m->cost )
1409                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1410         }
1411
1412         /* Early termination based on the current SATD score of partition[0]
1413            plus the estimated SATD score of partition[1] */
             /* The limit is i_best_satd, scaled by 5/4 when a->i_mbrd is
              * non-zero. */
1414         if( !i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1415         {
1416             a->l0.i_cost16x8 = COST_MAX;
1417             return;
1418         }
1419
             /* Commit the winner's MV and reference to the caches. */
1420         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1421         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1422     }
1423
1424     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1425 }
1426
1427 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1428 {
1429     x264_me_t m;
1430     pixel **p_fenc = h->mb.pic.p_fenc;
1431     ALIGNED_4( int16_t mvc[3][2] );
1432
1433     /* XXX Needed for x264_mb_predict_mv */
1434     h->mb.i_partition = D_8x16;
1435
1436     for( int i = 0; i < 2; i++ )
1437     {
1438         x264_me_t *l0m = &a->l0.me8x16[i];
1439         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1440         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1441         const int ref8[2] = { minref, maxref };
1442         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1443
1444         m.i_pixel = PIXEL_8x16;
1445
1446         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1447         l0m->cost = INT_MAX;
1448         for( int j = 0; j < i_ref8s; j++ )
1449         {
1450             const int i_ref = ref8[j];
1451             m.i_ref_cost = REF_COST( 0, i_ref );
1452
1453             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1454             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1455             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1456
1457             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1458             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1459
1460             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1461             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1462             /* We can only take this shortcut if the first search was performed on ref0. */
1463             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1464             {
1465                 /* We can just leave the MV from the previous ref search. */
1466                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1467             }
1468             else
1469                 x264_me_search( h, &m, mvc, 3 );
1470
1471             m.cost += m.i_ref_cost;
1472
1473             if( m.cost < l0m->cost )
1474                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1475         }
1476
1477         /* Early termination based on the current SATD score of partition[0]
1478            plus the estimated SATD score of partition[1] */
1479         if( !i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1480         {
1481             a->l0.i_cost8x16 = COST_MAX;
1482             return;
1483         }
1484
1485         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1486         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1487     }
1488
1489     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1490 }
1491
1492 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1493 {
1494     ALIGNED_ARRAY_8( pixel, pix1,[16*8] );
1495     pixel *pix2 = pix1+8;
1496     const int i_stride = h->mb.pic.i_stride[1];
1497     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1498     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1499     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1500     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1501     x264_weight_t *weight = h->sh.weight[i_ref];
1502
1503 #define CHROMA4x4MC( width, height, me, x, y ) \
1504     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1505     if( weight[1].weightfn ) \
1506         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1507     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1508     if( weight[2].weightfn ) \
1509         weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1510
1511
1512     if( size == PIXEL_4x4 )
1513     {
1514         x264_me_t *m = a->l0.me4x4[i8x8];
1515         CHROMA4x4MC( 2,2, m[0], 0,0 );
1516         CHROMA4x4MC( 2,2, m[1], 2,0 );
1517         CHROMA4x4MC( 2,2, m[2], 0,2 );
1518         CHROMA4x4MC( 2,2, m[3], 2,2 );
1519     }
1520     else if( size == PIXEL_8x4 )
1521     {
1522         x264_me_t *m = a->l0.me8x4[i8x8];
1523         CHROMA4x4MC( 4,2, m[0], 0,0 );
1524         CHROMA4x4MC( 4,2, m[1], 0,2 );
1525     }
1526     else
1527     {
1528         x264_me_t *m = a->l0.me4x8[i8x8];
1529         CHROMA4x4MC( 2,4, m[0], 0,0 );
1530         CHROMA4x4MC( 2,4, m[1], 2,0 );
1531     }
1532
1533     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1534          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1535 }
1536
1537 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1538 {
1539     pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1540     pixel **p_fenc = h->mb.pic.p_fenc;
1541     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1542
1543     /* XXX Needed for x264_mb_predict_mv */
1544     h->mb.i_partition = D_8x8;
1545
1546     for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1547     {
1548         const int idx = 4*i8x8 + i4x4;
1549         const int x4 = block_idx_x[idx];
1550         const int y4 = block_idx_y[idx];
1551         const int i_mvc = (i4x4 == 0);
1552
1553         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1554
1555         m->i_pixel = PIXEL_4x4;
1556
1557         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1558         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1559         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1560
1561         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1562         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1563
1564         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1565     }
1566     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1567                             a->l0.me4x4[i8x8][1].cost +
1568                             a->l0.me4x4[i8x8][2].cost +
1569                             a->l0.me4x4[i8x8][3].cost +
1570                             REF_COST( 0, i_ref ) +
1571                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1572     if( h->mb.b_chroma_me )
1573         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1574 }
1575
1576 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1577 {
1578     pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1579     pixel **p_fenc = h->mb.pic.p_fenc;
1580     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1581
1582     /* XXX Needed for x264_mb_predict_mv */
1583     h->mb.i_partition = D_8x8;
1584
1585     for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1586     {
1587         const int idx = 4*i8x8 + 2*i8x4;
1588         const int x4 = block_idx_x[idx];
1589         const int y4 = block_idx_y[idx];
1590         const int i_mvc = (i8x4 == 0);
1591
1592         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1593
1594         m->i_pixel = PIXEL_8x4;
1595
1596         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1597         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1598         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1599
1600         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1601         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1602
1603         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1604     }
1605     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1606                             REF_COST( 0, i_ref ) +
1607                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1608     if( h->mb.b_chroma_me )
1609         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1610 }
1611
1612 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1613 {
1614     pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1615     pixel **p_fenc = h->mb.pic.p_fenc;
1616     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1617
1618     /* XXX Needed for x264_mb_predict_mv */
1619     h->mb.i_partition = D_8x8;
1620
1621     for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1622     {
1623         const int idx = 4*i8x8 + i4x8;
1624         const int x4 = block_idx_x[idx];
1625         const int y4 = block_idx_y[idx];
1626         const int i_mvc = (i4x8 == 0);
1627
1628         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1629
1630         m->i_pixel = PIXEL_4x8;
1631
1632         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1633         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1634         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1635
1636         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1637         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1638
1639         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1640     }
1641     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1642                             REF_COST( 0, i_ref ) +
1643                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1644     if( h->mb.b_chroma_me )
1645         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1646 }
1647
1648 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1649 {
1650     /* Assumes that fdec still contains the results of
1651      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1652
1653     pixel *p_fenc = h->mb.pic.p_fenc[0];
1654     pixel *p_fdec = h->mb.pic.p_fdec[0];
1655
1656     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1657     if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1658         for( int i = 0; i < 4; i++ )
1659         {
1660             const int x = (i&1)*8;
1661             const int y = (i>>1)*8;
1662             a->i_cost16x16direct +=
1663             a->i_cost8x8direct[i] =
1664                 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1665
1666             /* mb type cost */
1667             a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1668         }
1669     else
1670         a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
1671 }
1672
1673 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1674 {
1675     ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
1676     ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
1677     pixel *src0, *src1;
1678     int stride0 = 16, stride1 = 16;
1679     int i_ref, i_mvc;
1680     ALIGNED_4( int16_t mvc[9][2] );
1681     int try_skip = a->b_try_skip;
1682     int list1_skipped = 0;
1683     int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1684     int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
1685                                 h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};
1686
1687     x264_me_t m;
1688     m.i_pixel = PIXEL_16x16;
1689
1690     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1691
1692     /* 16x16 Search on list 0 and list 1 */
1693     a->l0.me16x16.cost = INT_MAX;
1694     a->l1.me16x16.cost = INT_MAX;
1695     for( int l = 1; l >= 0; )
1696     {
1697         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1698
1699         /* This loop is extremely munged in order to facilitate the following order of operations,
1700          * necessary for an efficient fast skip.
1701          * 1.  Search list1 ref0.
1702          * 2.  Search list0 ref0.
1703          * 3.  Try skip.
1704          * 4.  Search the rest of list0.
1705          * 5.  Go back and finish list1.
1706          */
1707         for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1708         {
1709             if( try_skip && l == 1 && i_ref > 0 )
1710             {
1711                 list1_skipped = 1;
1712                 break;
1713             }
1714
1715             m.i_ref_cost = REF_COST( l, i_ref );
1716
1717             /* search with ref */
1718             LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1719             x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1720             x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1721             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
1722
1723             /* add ref cost */
1724             m.cost += m.i_ref_cost;
1725
1726             if( m.cost < lX->me16x16.cost )
1727                 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1728
1729             /* save mv for predicting neighbors */
1730             CP32( lX->mvc[i_ref][0], m.mv );
1731             CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1732
1733             /* Fast skip detection. */
1734             if( i_ref == 0 && try_skip )
1735             {
1736                 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
1737                     abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
1738                 {
1739                     try_skip = 0;
1740                 }
1741                 else if( !l )
1742                 {
1743                     /* We already tested skip */
1744                     h->mb.i_type = B_SKIP;
1745                     x264_analyse_update_cache( h, a );
1746                     return;
1747                 }
1748             }
1749         }
1750         if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
1751             break;
1752         if( list1_skipped && l == 0 )
1753             l = 1;
1754         else
1755             l--;
1756     }
1757
1758     /* get cost of BI mode */
1759     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1760     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1761     int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1762     src0 = h->mc.get_ref( pix0, &stride0,
1763                           h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1764                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1765     src1 = h->mc.get_ref( pix1, &stride1,
1766                           h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1767                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1768
1769     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1770
1771     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1772                      + ref_costs
1773                      + a->l0.bi16x16.cost_mv
1774                      + a->l1.bi16x16.cost_mv;
1775
1776     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1777     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1778     {
1779         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1780                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1781         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1782                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1783         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1784                                 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1785                                 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1786         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1787                    + ref_costs + l0_mv_cost + l1_mv_cost;
1788         if( cost00 < a->i_cost16x16bi )
1789         {
1790             M32( a->l0.bi16x16.mv ) = 0;
1791             M32( a->l1.bi16x16.mv ) = 0;
1792             a->l0.bi16x16.cost_mv = l0_mv_cost;
1793             a->l1.bi16x16.cost_mv = l1_mv_cost;
1794             a->i_cost16x16bi = cost00;
1795         }
1796     }
1797
1798     /* mb type cost */
1799     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1800     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1801     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1802 }
1803
1804 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1805 {
1806     int x = 2*(i&1);
1807     int y = i&2;
1808
1809     switch( h->mb.i_sub_partition[i] )
1810     {
1811         case D_L0_8x8:
1812             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1813             break;
1814         case D_L0_8x4:
1815             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1816             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1817             break;
1818         case D_L0_4x8:
1819             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1820             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1821             break;
1822         case D_L0_4x4:
1823             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1824             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1825             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1826             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1827             break;
1828         default:
1829             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1830             break;
1831     }
1832 }
1833
1834 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1835 {
1836     int x = 2*(idx&1);
1837     int y = idx&2;
1838     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1839     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1840     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1841     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1842 }
1843
1844 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1845     if( x264_mb_partition_listX_table[0][part] ) \
1846     { \
1847         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
1848         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1849     } \
1850     else \
1851     { \
1852         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1853         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
1854         if( b_mvd ) \
1855             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1856     } \
1857     if( x264_mb_partition_listX_table[1][part] ) \
1858     { \
1859         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
1860         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1861     } \
1862     else \
1863     { \
1864         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1865         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
1866         if( b_mvd ) \
1867             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1868     }
1869
1870 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1871 {
1872     int x = 2*(i&1);
1873     int y = i&2;
1874     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1875     {
1876         x264_mb_load_mv_direct8x8( h, i );
1877         if( b_mvd )
1878         {
1879             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1880             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1881             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1882         }
1883     }
1884     else
1885     {
1886         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1887     }
1888 }
1889 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1890 {
1891     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1892 }
1893 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1894 {
1895     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1896 }
1897 #undef CACHE_MV_BI
1898
1899 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1900 {
1901     ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
1902     int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1903
1904     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1905      * than those used by the neighbors */
1906     #define CHECK_NEIGHBOUR(i)\
1907     {\
1908         int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1909         if( ref > i_maxref[l] )\
1910             i_maxref[l] = ref;\
1911     }
1912
1913     for( int l = 0; l < 2; l++ )
1914     {
1915         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1916         if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1917             h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1918         {
1919             i_maxref[l] = 0;
1920             CHECK_NEIGHBOUR(  -8 - 1 );
1921             CHECK_NEIGHBOUR(  -8 + 0 );
1922             CHECK_NEIGHBOUR(  -8 + 2 );
1923             CHECK_NEIGHBOUR(  -8 + 4 );
1924             CHECK_NEIGHBOUR(   0 - 1 );
1925             CHECK_NEIGHBOUR( 2*8 - 1 );
1926         }
1927     }
1928
1929     /* XXX Needed for x264_mb_predict_mv */
1930     h->mb.i_partition = D_8x8;
1931
1932     a->i_cost8x8bi = 0;
1933
1934     for( int i = 0; i < 4; i++ )
1935     {
1936         int x8 = i&1;
1937         int y8 = i>>1;
1938         int i_part_cost;
1939         int i_part_cost_bi;
1940         int stride[2] = {8,8};
1941         pixel *src[2];
1942         x264_me_t m;
1943         m.i_pixel = PIXEL_8x8;
1944         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1945
1946         for( int l = 0; l < 2; l++ )
1947         {
1948             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1949
1950             lX->me8x8[i].cost = INT_MAX;
1951             for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
1952             {
1953                 m.i_ref_cost = REF_COST( l, i_ref );;
1954
1955                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1956
1957                 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1958                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1959                 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1960                 m.cost += m.i_ref_cost;
1961
1962                 if( m.cost < lX->me8x8[i].cost )
1963                 {
1964                     h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1965                     a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
1966                 }
1967
1968                 /* save mv for predicting other partitions within this MB */
1969                 CP32( lX->mvc[i_ref][i+1], m.mv );
1970             }
1971         }
1972
1973         /* BI mode */
1974         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1975                                 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1976         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1977                                 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1978         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1979                                 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1980
1981         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1982         i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
1983                          + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
1984                          + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1985
1986         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1987         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1988
1989         i_part_cost = a->l0.me8x8[i].cost;
1990         h->mb.i_sub_partition[i] = D_L0_8x8;
1991         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1992         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1993         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1994         a->i_cost8x8bi += i_part_cost;
1995
1996         /* XXX Needed for x264_mb_predict_mv */
1997         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1998     }
1999
2000     /* mb type cost */
2001     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2002 }
2003
2004 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2005 {
2006     pixel **p_fref[2] =
2007         { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2008           h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2009     ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
2010
2011     /* XXX Needed for x264_mb_predict_mv */
2012     h->mb.i_partition = D_8x8;
2013
2014     a->i_cost8x8bi = 0;
2015
2016     for( int i = 0; i < 4; i++ )
2017     {
2018         int x8 = i&1;
2019         int y8 = i>>1;
2020         int i_part_cost;
2021         int i_part_cost_bi = 0;
2022         int stride[2] = {8,8};
2023         pixel *src[2];
2024
2025         for( int l = 0; l < 2; l++ )
2026         {
2027             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2028             x264_me_t *m = &lX->me8x8[i];
2029             m->i_pixel = PIXEL_8x8;
2030             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2031
2032             m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2033             m->i_ref = lX->me16x16.i_ref;
2034
2035             LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2036
2037             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2038             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2039             x264_me_search( h, m, &lX->me16x16.mv, 1 );
2040             a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2041             m->cost += m->i_ref_cost;
2042
2043             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2044
2045             /* save mv for predicting other partitions within this MB */
2046             CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2047
2048             /* BI mode */
2049             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2050                                     m->mv[0], m->mv[1], 8, 8, weight_none );
2051             i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2052         }
2053         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2054         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2055         i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2056         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2057         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2058
2059         i_part_cost = a->l0.me8x8[i].cost;
2060         h->mb.i_sub_partition[i] = D_L0_8x8;
2061         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2062         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2063         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2064         a->i_cost8x8bi += i_part_cost;
2065
2066         /* XXX Needed for x264_mb_predict_mv */
2067         x264_mb_cache_mv_b8x8( h, a, i, 0 );
2068     }
2069
2070     /* mb type cost */
2071     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2072 }
2073
2074 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2075 {
2076     ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
2077     ALIGNED_4( int16_t mvc[3][2] );
2078
2079     h->mb.i_partition = D_16x8;
2080     a->i_cost16x8bi = 0;
2081
2082     for( int i = 0; i < 2; i++ )
2083     {
2084         int i_part_cost;
2085         int i_part_cost_bi = 0;
2086         int stride[2] = {16,16};
2087         pixel *src[2];
2088         x264_me_t m;
2089         m.i_pixel = PIXEL_16x8;
2090         LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2091
2092         for( int l = 0; l < 2; l++ )
2093         {
2094             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2095             int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2096             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2097             lX->me16x8[i].cost = INT_MAX;
2098             for( int j = 0; j < i_ref8s; j++ )
2099             {
2100                 int i_ref = ref8[j];
2101                 m.i_ref_cost = REF_COST( l, i_ref );;
2102
2103                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2104
2105                 CP32( mvc[0], lX->mvc[i_ref][0] );
2106                 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2107                 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2108
2109                 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2110                 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2111                 x264_me_search( h, &m, mvc, 3 );
2112                 m.cost += m.i_ref_cost;
2113
2114                 if( m.cost < lX->me16x8[i].cost )
2115                     h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2116             }
2117         }
2118
2119         /* BI mode */
2120         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2121                                 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2122         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2123                                 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2124         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2125                                 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2126
2127         i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2128                         + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2129                         + a->l1.me16x8[i].i_ref_cost;
2130
2131         i_part_cost = a->l0.me16x8[i].cost;
2132         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2133
2134         if( a->l1.me16x8[i].cost < i_part_cost )
2135         {
2136             i_part_cost = a->l1.me16x8[i].cost;
2137             a->i_mb_partition16x8[i] = D_L1_8x8;
2138         }
2139         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2140         {
2141             i_part_cost = i_part_cost_bi;
2142             a->i_mb_partition16x8[i] = D_BI_8x8;
2143         }
2144         a->i_cost16x8bi += i_part_cost;
2145
2146         /* Early termination based on the current SATD score of partition[0]
2147            plus the estimated SATD score of partition[1] */
2148         if( !i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2149             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2150         {
2151             a->i_cost16x8bi = COST_MAX;
2152             return;
2153         }
2154
2155         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2156     }
2157
2158     /* mb type cost */
2159     a->i_mb_type16x8 = B_L0_L0
2160         + (a->i_mb_partition16x8[0]>>2) * 3
2161         + (a->i_mb_partition16x8[1]>>2);
2162     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2163 }
2164
2165 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2166 {
2167     ALIGNED_ARRAY_8( pixel, pix,[2],[8*16] );
2168     ALIGNED_4( int16_t mvc[3][2] );
2169
2170     h->mb.i_partition = D_8x16;
2171     a->i_cost8x16bi = 0;
2172
2173     for( int i = 0; i < 2; i++ )
2174     {
2175         int i_part_cost;
2176         int i_part_cost_bi = 0;
2177         int stride[2] = {8,8};
2178         pixel *src[2];
2179         x264_me_t m;
2180         m.i_pixel = PIXEL_8x16;
2181         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2182
2183         for( int l = 0; l < 2; l++ )
2184         {
2185             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2186             int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2187             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2188             lX->me8x16[i].cost = INT_MAX;
2189             for( int j = 0; j < i_ref8s; j++ )
2190             {
2191                 int i_ref = ref8[j];
2192                 m.i_ref_cost = REF_COST( l, i_ref );
2193
2194                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2195
2196                 CP32( mvc[0], lX->mvc[i_ref][0] );
2197                 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2198                 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2199
2200                 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2201                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2202                 x264_me_search( h, &m, mvc, 3 );
2203                 m.cost += m.i_ref_cost;
2204
2205                 if( m.cost < lX->me8x16[i].cost )
2206                     h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2207             }
2208         }
2209
2210         /* BI mode */
2211         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2212                                 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2213         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2214                                 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2215         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2216
2217         i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2218                         + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2219                         + a->l1.me8x16[i].i_ref_cost;
2220
2221         i_part_cost = a->l0.me8x16[i].cost;
2222         a->i_mb_partition8x16[i] = D_L0_8x8;
2223
2224         if( a->l1.me8x16[i].cost < i_part_cost )
2225         {
2226             i_part_cost = a->l1.me8x16[i].cost;
2227             a->i_mb_partition8x16[i] = D_L1_8x8;
2228         }
2229         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2230         {
2231             i_part_cost = i_part_cost_bi;
2232             a->i_mb_partition8x16[i] = D_BI_8x8;
2233         }
2234         a->i_cost8x16bi += i_part_cost;
2235
2236         /* Early termination based on the current SATD score of partition[0]
2237            plus the estimated SATD score of partition[1] */
2238         if( !i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2239             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2240         {
2241             a->i_cost8x16bi = COST_MAX;
2242             return;
2243         }
2244
2245         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2246     }
2247
2248     /* mb type cost */
2249     a->i_mb_type8x16 = B_L0_L0
2250         + (a->i_mb_partition8x16[0]>>2) * 3
2251         + (a->i_mb_partition8x16[1]>>2);
2252     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2253 }
2254
2255 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2256 {
2257     int thresh = i_satd * 5/4;
2258
2259     h->mb.i_type = P_L0;
2260     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2261     {
2262         h->mb.i_partition = D_16x16;
2263         x264_analyse_update_cache( h, a );
2264         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2265     }
2266
2267     if( a->l0.i_cost16x8 <= thresh )
2268     {
2269         h->mb.i_partition = D_16x8;
2270         x264_analyse_update_cache( h, a );
2271         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2272     }
2273     else
2274         a->l0.i_cost16x8 = COST_MAX;
2275
2276     if( a->l0.i_cost8x16 <= thresh )
2277     {
2278         h->mb.i_partition = D_8x16;
2279         x264_analyse_update_cache( h, a );
2280         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2281     }
2282     else
2283         a->l0.i_cost8x16 = COST_MAX;
2284
2285     if( a->l0.i_cost8x8 <= thresh )
2286     {
2287         h->mb.i_type = P_8x8;
2288         h->mb.i_partition = D_8x8;
2289         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2290         {
2291             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2292             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2293             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2294             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2295             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2296              * for future blocks are those left over from previous RDO calls. */
2297             for( int i = 0; i < 4; i++ )
2298             {
2299                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2300                 int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2301                 int subtype, btype = D_L0_8x8;
2302                 uint64_t bcost = COST_MAX64;
2303                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2304                 {
2305                     uint64_t cost;
2306                     if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2307                         continue;
2308                     h->mb.i_sub_partition[i] = subtype;
2309                     x264_mb_cache_mv_p8x8( h, a, i );
2310                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2311                     COPY2_IF_LT( bcost, cost, btype, subtype );
2312                 }
2313                 if( h->mb.i_sub_partition[i] != btype )
2314                 {
2315                     h->mb.i_sub_partition[i] = btype;
2316                     x264_mb_cache_mv_p8x8( h, a, i );
2317                 }
2318             }
2319         }
2320         else
2321             x264_analyse_update_cache( h, a );
2322         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2323     }
2324     else
2325         a->l0.i_cost8x8 = COST_MAX;
2326 }
2327
2328 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2329 {
2330     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2331
2332     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2333     {
2334         h->mb.i_type = B_DIRECT;
2335         /* Assumes direct/skip MC is still in fdec */
2336         /* Requires b-rdo to be done before intra analysis */
2337         h->mb.b_skip_mc = 1;
2338         x264_analyse_update_cache( h, a );
2339         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2340         h->mb.b_skip_mc = 0;
2341     }
2342
2343     //FIXME not all the update_cache calls are needed
2344     h->mb.i_partition = D_16x16;
2345     /* L0 */
2346     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2347     {
2348         h->mb.i_type = B_L0_L0;
2349         x264_analyse_update_cache( h, a );
2350         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2351     }
2352
2353     /* L1 */
2354     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2355     {
2356         h->mb.i_type = B_L1_L1;
2357         x264_analyse_update_cache( h, a );
2358         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2359     }
2360
2361     /* BI */
2362     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2363     {
2364         h->mb.i_type = B_BI_BI;
2365         x264_analyse_update_cache( h, a );
2366         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2367     }
2368
2369     /* 8x8 */
2370     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2371     {
2372         h->mb.i_type = B_8x8;
2373         h->mb.i_partition = D_8x8;
2374         x264_analyse_update_cache( h, a );
2375         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2376         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2377     }
2378
2379     /* 16x8 */
2380     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2381     {
2382         h->mb.i_type = a->i_mb_type16x8;
2383         h->mb.i_partition = D_16x8;
2384         x264_analyse_update_cache( h, a );
2385         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2386     }
2387
2388     /* 8x16 */
2389     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2390     {
2391         h->mb.i_type = a->i_mb_type8x16;
2392         h->mb.i_partition = D_8x16;
2393         x264_analyse_update_cache( h, a );
2394         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2395     }
2396 }
2397
2398 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2399 {
2400     int i_biweight;
2401
2402     if( IS_INTRA(h->mb.i_type) )
2403         return;
2404
2405     switch( h->mb.i_partition )
2406     {
2407         case D_16x16:
2408             if( h->mb.i_type == B_BI_BI )
2409             {
2410                 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2411                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2412             }
2413             break;
2414         case D_16x8:
2415             for( int i = 0; i < 2; i++ )
2416                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2417                 {
2418                     i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2419                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2420                 }
2421             break;
2422         case D_8x16:
2423             for( int i = 0; i < 2; i++ )
2424                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2425                 {
2426                     i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2427                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2428                 }
2429             break;
2430         case D_8x8:
2431             for( int i = 0; i < 4; i++ )
2432                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2433                 {
2434                     i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2435                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2436                 }
2437             break;
2438     }
2439 }
2440
2441 static inline void x264_mb_analyse_transform( x264_t *h )
2442 {
2443     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2444     {
2445         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2446         x264_mb_mc( h );
2447
2448         int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2449                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2450         int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2451                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2452
2453         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2454         h->mb.b_skip_mc = 1;
2455     }
2456 }
2457
2458 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2459 {
2460     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2461     {
2462         x264_analyse_update_cache( h, a );
2463         h->mb.b_transform_8x8 ^= 1;
2464         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2465         int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2466
2467         if( *i_rd >= i_rd8 )
2468         {
2469             if( *i_rd > 0 )
2470                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2471             *i_rd = i_rd8;
2472         }
2473         else
2474             h->mb.b_transform_8x8 ^= 1;
2475     }
2476 }
2477
2478 /* Rate-distortion optimal QP selection.
2479  * FIXME: More than half of the benefit of this function seems to be
2480  * in the way it improves the coding of chroma DC (by decimating or
2481  * finding a better way to code a single DC coefficient.)
2482  * There must be a more efficient way to get that portion of the benefit
2483  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2484  * trick. */
2485 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2486 {
2487     int bcost, cost, failures, prevcost, origcost;
2488     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2489     int last_qp_tried = 0;
2490     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2491     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2492
2493     /* If CBP is already zero, don't raise the quantizer any higher. */
2494     for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2495     {
2496         /* Without psy-RD, require monotonicity when moving quant away from previous
2497          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2498          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2499          * allow 2 failures when moving quant towards previous quant.
2500          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2501         int threshold = (!!h->mb.i_psy_rd);
2502         /* Raise the threshold for failures if we're moving towards the last QP. */
2503         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2504             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2505             threshold++;
2506         h->mb.i_qp = orig_qp;
2507         failures = 0;
2508         prevcost = origcost;
2509
2510         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2511          * (up to a point) will too.  So, jump down to where the threshold will kick in
2512          * and check the QP there.  If the CBP is still empty, skip the main loop.
2513          * If it isn't empty, we would have ended up having to check this QP anyways,
2514          * so as long as we store it for later lookup, we lose nothing. */
2515         int already_checked_qp = -1;
2516         int already_checked_cost = COST_MAX;
2517         if( direction == -1 )
2518         {
2519             if( !origcbp )
2520             {
2521                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2522                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2523                 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2524                 if( !h->mb.cbp[h->mb.i_mb_xy] )
2525                 {
2526                     /* If our empty-CBP block is lower QP than the last QP,
2527                      * the last QP almost surely doesn't have a CBP either. */
2528                     if( h->mb.i_last_qp > h->mb.i_qp )
2529                         last_qp_tried = 1;
2530                     break;
2531                 }
2532                 already_checked_qp = h->mb.i_qp;
2533                 h->mb.i_qp = orig_qp;
2534             }
2535         }
2536
2537         h->mb.i_qp += direction;
2538         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2539         {
2540             if( h->mb.i_last_qp == h->mb.i_qp )
2541                 last_qp_tried = 1;
2542             if( h->mb.i_qp == already_checked_qp )
2543                 cost = already_checked_cost;
2544             else
2545             {
2546                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2547                 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2548                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2549             }
2550
2551             /* We can't assume that the costs are monotonic over QPs.
2552              * Tie case-as-failure seems to give better results. */
2553             if( cost < prevcost )
2554                 failures = 0;
2555             else
2556                 failures++;
2557             prevcost = cost;
2558
2559             if( failures > threshold )
2560                 break;
2561             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2562                 break;
2563             h->mb.i_qp += direction;
2564         }
2565     }
2566
2567     /* Always try the last block's QP. */
2568     if( !last_qp_tried )
2569     {
2570         h->mb.i_qp = h->mb.i_last_qp;
2571         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2572         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2573         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2574     }
2575
2576     h->mb.i_qp = bqp;
2577     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2578
2579     /* Check transform again; decision from before may no longer be optimal. */
2580     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2581         x264_mb_transform_8x8_allowed( h ) )
2582     {
2583         h->mb.b_transform_8x8 ^= 1;
2584         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2585         if( cost > bcost )
2586             h->mb.b_transform_8x8 ^= 1;
2587     }
2588 }
2589
2590 /*****************************************************************************
2591  * x264_macroblock_analyse:
2592  *****************************************************************************/
2593 void x264_macroblock_analyse( x264_t *h )
2594 {
2595     x264_mb_analysis_t analysis;
2596     int i_cost = COST_MAX;
2597
2598     h->mb.i_qp = x264_ratecontrol_mb_qp( h );
2599     /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2600      * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2601     if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2602         h->mb.i_qp = h->mb.i_last_qp;
2603
2604     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2605
2606     /*--------------------------- Do the analysis ---------------------------*/
2607     if( h->sh.i_type == SLICE_TYPE_I )
2608     {
2609 intra_analysis:
2610         if( analysis.i_mbrd )
2611             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2612         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2613         if( analysis.i_mbrd )
2614             x264_intra_rd( h, &analysis, COST_MAX );
2615
2616         i_cost = analysis.i_satd_i16x16;
2617         h->mb.i_type = I_16x16;
2618         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2619         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2620         if( analysis.i_satd_pcm < i_cost )
2621             h->mb.i_type = I_PCM;
2622
2623         else if( analysis.i_mbrd >= 2 )
2624             x264_intra_rd_refine( h, &analysis );
2625     }
2626     else if( h->sh.i_type == SLICE_TYPE_P )
2627     {
2628         int b_skip = 0;
2629
2630         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2631
2632         analysis.b_try_skip = 0;
2633         if( analysis.b_force_intra )
2634         {
2635             if( !h->param.analyse.b_psy )
2636             {
2637                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2638                 goto intra_analysis;
2639             }
2640         }
2641         else
2642         {
2643             /* Fast P_SKIP detection */
2644             if( h->param.analyse.b_fast_pskip )
2645             {
2646                 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2647                     // FIXME don't need to check this if the reference frame is done
2648                     {}
2649                 else if( h->param.analyse.i_subpel_refine >= 3 )
2650                     analysis.b_try_skip = 1;
2651                 else if( h->mb.i_mb_type_left == P_SKIP ||
2652                          h->mb.i_mb_type_top == P_SKIP ||
2653                          h->mb.i_mb_type_topleft == P_SKIP ||
2654                          h->mb.i_mb_type_topright == P_SKIP )
2655                     b_skip = x264_macroblock_probe_pskip( h );
2656             }
2657         }
2658
2659         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2660
2661         if( b_skip )
2662         {
2663             h->mb.i_type = P_SKIP;
2664             h->mb.i_partition = D_16x16;
2665             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2666             /* Set up MVs for future predictors */
2667             for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2668                 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2669         }
2670         else
2671         {
2672             const unsigned int flags = h->param.analyse.inter;
2673             int i_type;
2674             int i_partition;
2675             int i_thresh16x8;
2676             int i_satd_inter, i_satd_intra;
2677
2678             x264_mb_analyse_load_costs( h, &analysis );
2679
2680             x264_mb_analyse_inter_p16x16( h, &analysis );
2681
2682             if( h->mb.i_type == P_SKIP )
2683             {
2684                 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2685                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2686                 return;
2687             }
2688
2689             if( flags & X264_ANALYSE_PSUB16x16 )
2690             {
2691                 if( h->param.analyse.b_mixed_references )
2692                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2693                 else
2694                     x264_mb_analyse_inter_p8x8( h, &analysis );
2695             }
2696
2697             /* Select best inter mode */
2698             i_type = P_L0;
2699             i_partition = D_16x16;
2700             i_cost = analysis.l0.me16x16.cost;
2701
2702             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2703                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2704             {
2705                 i_type = P_8x8;
2706                 i_partition = D_8x8;
2707                 i_cost = analysis.l0.i_cost8x8;
2708
2709                 /* Do sub 8x8 */
2710                 if( flags & X264_ANALYSE_PSUB8x8 )
2711                 {
2712                     for( int i = 0; i < 4; i++ )
2713                     {
2714                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2715                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2716                         {
2717                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2718                             h->mb.i_sub_partition[i] = D_L0_4x4;
2719
2720                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2721                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2722                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2723
2724                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2725                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2726                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2727
2728                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2729                         }
2730                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2731                     }
2732                     analysis.l0.i_cost8x8 = i_cost;
2733                 }
2734             }
2735
2736             /* Now do 16x8/8x16 */
2737             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2738             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2739                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2740             {
2741                 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
2742                                       + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2743                 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2744
2745                 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
2746                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2747
2748                 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
2749                                   + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2750                 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2751
2752                 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
2753                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2754             }
2755
2756             h->mb.i_partition = i_partition;
2757
2758             /* refine qpel */
2759             //FIXME mb_type costs?
2760             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2761             {
2762                 /* refine later */
2763             }
2764             else if( i_partition == D_16x16 )
2765             {
2766                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2767                 i_cost = analysis.l0.me16x16.cost;
2768             }
2769             else if( i_partition == D_16x8 )
2770             {
2771                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2772                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2773                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2774             }
2775             else if( i_partition == D_8x16 )
2776             {
2777                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2778                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2779                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2780             }
2781             else if( i_partition == D_8x8 )
2782             {
2783                 i_cost = 0;
2784                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2785                 {
2786                     switch( h->mb.i_sub_partition[i8x8] )
2787                     {
2788                         case D_L0_8x8:
2789                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2790                             i_cost += analysis.l0.me8x8[i8x8].cost;
2791                             break;
2792                         case D_L0_8x4:
2793                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2794                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2795                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2796                                       analysis.l0.me8x4[i8x8][1].cost;
2797                             break;
2798                         case D_L0_4x8:
2799                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2800                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2801                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2802                                       analysis.l0.me4x8[i8x8][1].cost;
2803                             break;
2804
2805                         case D_L0_4x4:
2806                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2807                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2808                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2809                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2810                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2811                                       analysis.l0.me4x4[i8x8][1].cost +
2812                                       analysis.l0.me4x4[i8x8][2].cost +
2813                                       analysis.l0.me4x4[i8x8][3].cost;
2814                             break;
2815                         default:
2816                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2817                             break;
2818                     }
2819                 }
2820             }
2821
2822             if( h->mb.b_chroma_me )
2823             {
2824                 x264_mb_analyse_intra_chroma( h, &analysis );
2825                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2826                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2827                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2828                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2829             }
2830             else
2831                 x264_mb_analyse_intra( h, &analysis, i_cost );
2832
2833             i_satd_inter = i_cost;
2834             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2835                                       analysis.i_satd_i8x8,
2836                                       analysis.i_satd_i4x4 );
2837
2838             if( analysis.i_mbrd )
2839             {
2840                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2841                 i_type = P_L0;
2842                 i_partition = D_16x16;
2843                 i_cost = analysis.l0.i_rd16x16;
2844                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2845                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2846                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2847                 h->mb.i_type = i_type;
2848                 h->mb.i_partition = i_partition;
2849                 if( i_cost < COST_MAX )
2850                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2851                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2852             }
2853
2854             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2855             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2856             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2857             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2858
2859             h->mb.i_type = i_type;
2860
2861             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2862             {
2863                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2864                  * it was an inter block. */
2865                 x264_analyse_update_cache( h, &analysis );
2866                 x264_macroblock_encode( h );
2867                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2868                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2869                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2870                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2871                 goto intra_analysis;
2872             }
2873
2874             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2875             {
2876                 if( IS_INTRA( h->mb.i_type ) )
2877                 {
2878                     x264_intra_rd_refine( h, &analysis );
2879                 }
2880                 else if( i_partition == D_16x16 )
2881                 {
2882                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2883                     analysis.l0.me16x16.cost = i_cost;
2884                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2885                 }
2886                 else if( i_partition == D_16x8 )
2887                 {
2888                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2889                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2890                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2891                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2892                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2893                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2894                 }
2895                 else if( i_partition == D_8x16 )
2896                 {
2897                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2898                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2899                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2900                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2901                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2902                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2903                 }
2904                 else if( i_partition == D_8x8 )
2905                 {
2906                     x264_analyse_update_cache( h, &analysis );
2907                     for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2908                     {
2909                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2910                         {
2911                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2912                         }
2913                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2914                         {
2915                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2916                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2917                         }
2918                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2919                         {
2920                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2921                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2922                         }
2923                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2924                         {
2925                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2926                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2927                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2928                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2929                         }
2930                     }
2931                 }
2932             }
2933         }
2934     }
2935     else if( h->sh.i_type == SLICE_TYPE_B )
2936     {
2937         int i_bskip_cost = COST_MAX;
2938         int b_skip = 0;
2939
2940         if( analysis.i_mbrd )
2941             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2942
2943         h->mb.i_type = B_SKIP;
2944         if( h->mb.b_direct_auto_write )
2945         {
2946             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2947             for( int i = 0; i < 2; i++ )
2948             {
2949                 int b_changed = 1;
2950                 h->sh.b_direct_spatial_mv_pred ^= 1;
2951                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2952                 if( analysis.b_direct_available )
2953                 {
2954                     if( b_changed )
2955                     {
2956                         x264_mb_mc( h );
2957                         b_skip = x264_macroblock_probe_bskip( h );
2958                     }
2959                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2960                 }
2961                 else
2962                     b_skip = 0;
2963             }
2964         }
2965         else
2966             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2967
2968         analysis.b_try_skip = 0;
2969         if( analysis.b_direct_available )
2970         {
2971             if( !h->mb.b_direct_auto_write )
2972                 x264_mb_mc( h );
2973             if( analysis.i_mbrd )
2974             {
2975                 i_bskip_cost = ssd_mb( h );
2976                 /* 6 = minimum cavlc cost of a non-skipped MB */
2977                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2978             }
2979             else if( !h->mb.b_direct_auto_write )
2980             {
2981                 /* Conditioning the probe on neighboring block types
2982                  * doesn't seem to help speed or quality. */
2983                 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
2984                 if( h->param.analyse.i_subpel_refine < 3 )
2985                     b_skip = analysis.b_try_skip;
2986             }
2987             /* Set up MVs for future predictors */
2988             if( b_skip )
2989             {
2990                 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2991                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2992                 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
2993                     M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2994             }
2995         }
2996
2997         if( !b_skip )
2998         {
2999             const unsigned int flags = h->param.analyse.inter;
3000             int i_type;
3001             int i_partition;
3002             int i_satd_inter;
3003             h->mb.b_skip_mc = 0;
3004             h->mb.i_type = B_DIRECT;
3005
3006             x264_mb_analyse_load_costs( h, &analysis );
3007
3008             /* select best inter mode */
3009             /* direct must be first */
3010             if( analysis.b_direct_available )
3011                 x264_mb_analyse_inter_direct( h, &analysis );
3012
3013             x264_mb_analyse_inter_b16x16( h, &analysis );
3014
3015             if( h->mb.i_type == B_SKIP )
3016             {
3017                 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3018                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3019                 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
3020                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3021                 return;
3022             }
3023
3024             i_type = B_L0_L0;
3025             i_partition = D_16x16;
3026             i_cost = analysis.l0.me16x16.cost;
3027             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3028             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3029             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
3030
3031             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
3032             {
3033                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3034                 if( i_bskip_cost < analysis.i_rd16x16direct &&
3035                     i_bskip_cost < analysis.i_rd16x16bi &&
3036                     i_bskip_cost < analysis.l0.i_rd16x16 &&
3037                     i_bskip_cost < analysis.l1.i_rd16x16 )
3038                 {
3039                     h->mb.i_type = B_SKIP;
3040                     x264_analyse_update_cache( h, &analysis );
3041                     return;
3042                 }
3043             }
3044
3045             if( flags & X264_ANALYSE_BSUB16x16 )
3046             {
3047                 if( h->param.analyse.b_mixed_references )
3048                     x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3049                 else
3050                     x264_mb_analyse_inter_b8x8( h, &analysis );
3051
3052                 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3053
3054                 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
3055                 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3056                 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3057                 for( int i = 0; i < 2; i++ )
3058                 {
3059                     int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3060                     int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3061                     // 16x8
3062                     i_best_cost = COST_MAX;
3063                     i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3064                     i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3065                     i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3066                     avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3067                                          + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3068                     avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3069                                          + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3070                     COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3071                     COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3072                     COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3073                     analysis.i_cost_est16x8[i] = i_best_cost;
3074
3075                     // 8x16
3076                     i_best_cost = COST_MAX;
3077                     i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3078                     i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3079                     i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3080                     avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3081                                          + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3082                     avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3083                                          + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3084                     COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3085                     COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3086                     COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3087                     analysis.i_cost_est8x16[i] = i_best_cost;
3088                 }
3089                 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3090                 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3091                 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3092                 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3093                 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3094                 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
3095
3096                 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3097                 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3098                 if( try_16x8_first && i_cost_est16x8bi_total < i_cost )
3099                 {
3100                     x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3101                     COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3102                 }
3103                 if( i_cost_est8x16bi_total < i_cost )
3104                 {
3105                     x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3106                     COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3107                 }
3108                 if( !try_16x8_first && i_cost_est16x8bi_total < i_cost )
3109                 {
3110                     x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3111                     COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3112                 }
3113             }
3114
3115             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3116             {
3117                 /* refine later */
3118             }
3119             /* refine qpel */
3120             else if( i_partition == D_16x16 )
3121             {
3122                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3123                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3124                 if( i_type == B_L0_L0 )
3125                 {
3126                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3127                     i_cost = analysis.l0.me16x16.cost
3128                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3129                 }
3130                 else if( i_type == B_L1_L1 )
3131                 {
3132                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3133                     i_cost = analysis.l1.me16x16.cost
3134                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3135                 }
3136                 else if( i_type == B_BI_BI )
3137                 {
3138                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3139                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3140                 }
3141             }
3142             else if( i_partition == D_16x8 )
3143             {
3144                 for( int i = 0; i < 2; i++ )
3145                 {
3146                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3147                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3148                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3149                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3150                 }
3151             }
3152             else if( i_partition == D_8x16 )
3153             {
3154                 for( int i = 0; i < 2; i++ )
3155                 {
3156                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3157                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3158                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3159                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3160                 }
3161             }
3162             else if( i_partition == D_8x8 )
3163             {
3164                 for( int i = 0; i < 4; i++ )
3165                 {
3166                     x264_me_t *m;
3167                     int i_part_cost_old;
3168                     int i_type_cost;
3169                     int i_part_type = h->mb.i_sub_partition[i];
3170                     int b_bidir = (i_part_type == D_BI_8x8);
3171
3172                     if( i_part_type == D_DIRECT_8x8 )
3173                         continue;
3174                     if( x264_mb_partition_listX_table[0][i_part_type] )
3175                     {
3176                         m = &analysis.l0.me8x8[i];
3177                         i_part_cost_old = m->cost;
3178                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3179                         m->cost -= i_type_cost;
3180                         x264_me_refine_qpel( h, m );
3181                         if( !b_bidir )
3182                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3183                     }
3184                     if( x264_mb_partition_listX_table[1][i_part_type] )
3185                     {
3186                         m = &analysis.l1.me8x8[i];
3187                         i_part_cost_old = m->cost;
3188                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3189                         m->cost -= i_type_cost;
3190                         x264_me_refine_qpel( h, m );
3191                         if( !b_bidir )
3192                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3193                     }
3194                     /* TODO: update mvp? */
3195                 }
3196             }
3197
3198             i_satd_inter = i_cost;
3199
3200             if( analysis.i_mbrd )
3201             {
3202                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3203                 i_type = B_SKIP;
3204                 i_cost = i_bskip_cost;
3205                 i_partition = D_16x16;
3206                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3207                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3208                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3209                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3210                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3211                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3212                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3213
3214                 h->mb.i_type = i_type;
3215                 h->mb.i_partition = i_partition;
3216             }
3217
3218             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3219
3220             if( analysis.i_mbrd )
3221             {
3222                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3223                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3224             }
3225
3226             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3227             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3228             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3229             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3230
3231             h->mb.i_type = i_type;
3232             h->mb.i_partition = i_partition;
3233
3234             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3235                 x264_intra_rd_refine( h, &analysis );
3236             if( h->mb.i_subpel_refine >= 5 )
3237                 x264_refine_bidir( h, &analysis );
3238
3239             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3240             {
3241                 int i_biweight;
3242                 x264_analyse_update_cache( h, &analysis );
3243
3244                 if( i_partition == D_16x16 )
3245                 {
3246                     if( i_type == B_L0_L0 )
3247                     {
3248                         analysis.l0.me16x16.cost = i_cost;
3249                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3250                     }
3251                     else if( i_type == B_L1_L1 )
3252                     {
3253                         analysis.l1.me16x16.cost = i_cost;
3254                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3255                     }
3256                     else if( i_type == B_BI_BI )
3257                     {
3258                         i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3259                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3260                     }
3261                 }
3262                 else if( i_partition == D_16x8 )
3263                 {
3264                     for( int i = 0; i < 2; i++ )
3265                     {
3266                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3267                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3268                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3269                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3270                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3271                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3272                         {
3273                             i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3274                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3275                         }
3276                     }
3277                 }
3278                 else if( i_partition == D_8x16 )
3279                 {
3280                     for( int i = 0; i < 2; i++ )
3281                     {
3282                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3283                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3284                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3285                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3286                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3287                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3288                         {
3289                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3290                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3291                         }
3292                     }
3293                 }
3294                 else if( i_partition == D_8x8 )
3295                 {
3296                     for( int i = 0; i < 4; i++ )
3297                     {
3298                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3299                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3300                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3301                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3302                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3303                         {
3304                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3305                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3306                         }
3307                     }
3308                 }
3309             }
3310         }
3311     }
3312
3313     x264_analyse_update_cache( h, &analysis );
3314
3315     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3316      * without realizing it.  Check for this and account for it if necessary. */
3317     if( analysis.i_mbrd >= 2 )
3318     {
3319         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3320         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3321         int list = check_mv_lists[h->mb.i_type] - 1;
3322         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3323             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3324             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3325                 h->mb.i_partition = D_16x16;
3326     }
3327
3328     if( !analysis.i_mbrd )
3329         x264_mb_analyse_transform( h );
3330
3331     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3332         x264_mb_analyse_qp_rd( h, &analysis );
3333
3334     h->mb.b_trellis = h->param.analyse.i_trellis;
3335     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3336     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3337         x264_psy_trellis_init( h, 0 );
3338     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3339         h->mb.i_skip_intra = 0;
3340 }
3341
3342 /*-------------------- Update MB from the analysis ----------------------*/
3343 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3344 {
3345     switch( h->mb.i_type )
3346     {
3347         case I_4x4:
3348             for( int i = 0; i < 16; i++ )
3349                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3350
3351             x264_mb_analyse_intra_chroma( h, a );
3352             break;
3353         case I_8x8:
3354             for( int i = 0; i < 4; i++ )
3355                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3356
3357             x264_mb_analyse_intra_chroma( h, a );
3358             break;
3359         case I_16x16:
3360             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3361             x264_mb_analyse_intra_chroma( h, a );
3362             break;
3363
3364         case I_PCM:
3365             break;
3366
3367         case P_L0:
3368             switch( h->mb.i_partition )
3369             {
3370                 case D_16x16:
3371                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3372                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3373                     break;
3374
3375                 case D_16x8:
3376                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3377                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3378                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3379                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3380                     break;
3381
3382                 case D_8x16:
3383                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3384                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3385                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3386                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3387                     break;
3388
3389                 default:
3390                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3391                     break;
3392             }
3393             break;
3394
3395         case P_8x8:
3396             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3397             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3398             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3399             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3400             for( int i = 0; i < 4; i++ )
3401                 x264_mb_cache_mv_p8x8( h, a, i );
3402             break;
3403
3404         case P_SKIP:
3405         {
3406             h->mb.i_partition = D_16x16;
3407             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3408             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3409             break;
3410         }
3411
3412         case B_SKIP:
3413         case B_DIRECT:
3414             h->mb.i_partition = h->mb.cache.direct_partition;
3415             x264_mb_load_mv_direct8x8( h, 0 );
3416             x264_mb_load_mv_direct8x8( h, 1 );
3417             x264_mb_load_mv_direct8x8( h, 2 );
3418             x264_mb_load_mv_direct8x8( h, 3 );
3419             break;
3420
3421         case B_8x8:
3422             /* optimize: cache might not need to be rewritten */
3423             for( int i = 0; i < 4; i++ )
3424                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3425             break;
3426
3427         default: /* the rest of the B types */
3428             switch( h->mb.i_partition )
3429             {
3430             case D_16x16:
3431                 switch( h->mb.i_type )
3432                 {
3433                 case B_L0_L0:
3434                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3435                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3436
3437                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3438                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3439                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3440                     break;
3441                 case B_L1_L1:
3442                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3443                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3444                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3445
3446                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3447                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3448                     break;
3449                 case B_BI_BI:
3450                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3451                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3452
3453                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3454                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3455                     break;
3456                 }
3457                 break;
3458             case D_16x8:
3459                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3460                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3461                 break;
3462             case D_8x16:
3463                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3464                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3465                 break;
3466             default:
3467                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3468                 break;
3469             }
3470     }
3471
3472 #ifndef NDEBUG
3473     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3474     {
3475         for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3476         {
3477             int completed;
3478             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3479             if( ref < 0 )
3480                 continue;
3481             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3482             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3483             {
3484                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3485                 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3486                 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3487                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3488                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3489                 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3490                 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3491                 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3492                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3493                 x264_mb_analyse_intra( h, a, COST_MAX );
3494                 h->mb.i_type = I_16x16;
3495                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3496                 x264_mb_analyse_intra_chroma( h, a );
3497             }
3498         }
3499     }
3500 #endif
3501 }
3502
3503 #include "slicetype.c"
3504