git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
/* Inter analysis results for a single reference list.
 * x264_mb_analysis_t embeds two of these, as its l0 and l1 members.
 * Each partition shape has an x264_me_t per partition plus a cost field. */
typedef struct
{
    /* 16x16 */
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;
  71
/* Per-macroblock analysis context.
 * x264_mb_analyse_init() fills in the QP/lambda fields and resets the cost
 * fields to COST_MAX; the analysis functions then record the best cost and
 * prediction mode found for each candidate macroblock type. */
typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;       /* set by x264_mb_analyse_load_costs() from h->cost_mv[i_lambda] */
    uint16_t *p_cost_ref[2];   /* [0] for list 0, [1] for list 1; rows of x264_cost_ref */
    int i_mbrd;                /* 0..3, see x264_mb_analyse_init() */


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    int i_satd_i8x8_dir[12][4]; /* [prediction mode][8x8 block index] */
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_i8x8chroma;
    int i_satd_i8x8chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;

} x264_mb_analysis_t;
 136
/* lambda = pow(2,qp/6-2) */
/* Indexed by QP (0..51).  The entries for QP 0..11 are all 1 rather than the
 * sub-unity value the formula would give.  The largest entry is 91, which is
 * why the lambda-indexed cost tables in this file have 92 rows. */
const uint8_t x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
 147
/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Indexed by QP (0..51); x264_mb_analyse_init_qp() copies the entry for the
 * current QP into x264_mb_analysis_t::i_lambda2. */
const int x264_lambda2_tab[52] = {
    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
};
 158
/* 64-entry lookup table, not referenced in the visible part of this file.
 * NOTE(review): the values look like round((pow(2, i/64.) - 1) * 256), i.e. the
 * fractional part of 2^x in 8-bit fixed point -- confirm against the user. */
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
 165
/* 128-entry lookup table, not referenced in the visible part of this file.
 * NOTE(review): the values look like log2(1 + i/128.) rounded to 5 decimals
 * (first entry 0, last entry just below 1) -- confirm against the user. */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
 184
/* Avoid an int/float conversion. */
/* Entry i holds the value 31 - i, already stored as a float.
 * NOTE(review): the name suggests i is a leading-zero count of a 32-bit
 * value -- confirm against the user of this table. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
 189
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
// First index: 0 = inter, 1 = intra.  Second index: QP (0..51).
// x264_mb_analyse_init_qp() looks this up with both the luma QP and the
// chroma QP when trellis is enabled.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {    46,      58,      73,      92,     117,     147,
        185,     233,     294,     370,     466,     587,
        740,     932,    1174,    1480,    1864,    2349,
       2959,    3728,    4697,    5918,    7457,    9395,
      11837,   14914,   18790,   23674,   29828,   37581,
      47349,   59656,   75163,   94699,  119313,  150326,
     189399,  238627,  300652,  378798,  477255,  601304,
     757596,  954511, 1202608, 1515192, 1909022, 2405217,
    3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {    27,      34,      43,      54,      68,      86,
        108,     136,     172,     216,     273,     343,
        433,     545,     687,     865,    1090,    1374,
       1731,    2180,    2747,    3461,    4361,    5494,
       6922,    8721,   10988,   13844,   17442,   21976,
      27688,   34885,   43953,   55377,   69771,   87906,
     110755,  139543,  175813,  221511,  279087,  351627,
     443023,  558174,  703255,  886046, 1116348, 1406511,
    1772093, 2232697, 2813022, 3544186 }
};
 214
/* Chroma lambda2 scale factor, indexed by (luma QP - chroma QP + 12); see
 * x264_mb_analyse_init_qp().  Entry 12 (equal QPs) is 256, the same value
 * that function uses when psy is disabled.  Values double every three
 * entries; the last one is 65535 because 65536 does not fit in uint16_t. */
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
 224
/* TODO: calculate CABAC costs */
/* Per-macroblock-type cost for B slices, indexed by macroblock type.  The
 * intra analysis multiplies the entry by lambda and adds it to the cost when
 * the slice type is B. */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
/* The three tables below are not referenced in the visible part of this file.
 * NOTE(review): by name they are costs for B 16x8/8x16 types and for B/P
 * sub-macroblock types -- confirm the index meaning against their users. */
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] = {
    5, 3, 3, 1
};
 238
/* Defined later in this file. */
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

/* Reference-index cost table shared by all encoder instances, indexed
 * [lambda][i][ref].  x264_analyse_init_costs() fills row i with
 * lambda * bs_size_te( i, ref ) for i > 0 and with zeros for i == 0;
 * x264_mb_analyse_load_costs() selects row clip3( num_ref_idx_active-1, 0, 2 ). */
static uint16_t x264_cost_ref[92][3][33];
/* Held by x264_analyse_init_costs() while it writes x264_cost_ref. */
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 243
 244 int x264_analyse_init_costs( x264_t *h, int qp )
 245 {
 246     int lambda = x264_lambda_tab[qp];
 247     if( h->cost_mv[lambda] )
 248         return 0;
 249     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 250     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 251     h->cost_mv[lambda] += 2*4*2048;
 252     for( int i = 0; i <= 2*4*2048; i++ )
 253     {
 254         h->cost_mv[lambda][-i] =
 255         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 256     }
 257     x264_pthread_mutex_lock( &cost_ref_mutex );
 258     for( int i = 0; i < 3; i++ )
 259         for( int j = 0; j < 33; j++ )
 260             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 261     x264_pthread_mutex_unlock( &cost_ref_mutex );
 262     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 263     {
 264         for( int j = 0; j < 4; j++ )
 265         {
 266             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 267             h->cost_mv_fpel[lambda][j] += 2*2048;
 268             for( int i = -2*2048; i < 2*2048; i++ )
 269                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 270         }
 271     }
 272     return 0;
 273 fail:
 274     return -1;
 275 }
 276
 277 void x264_analyse_free_costs( x264_t *h )
 278 {
 279     for( int i = 0; i < 92; i++ )
 280     {
 281         if( h->cost_mv[i] )
 282             x264_free( h->cost_mv[i] - 2*4*2048 );
 283         if( h->cost_mv_fpel[i][0] )
 284             for( int j = 0; j < 4; j++ )
 285                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 286     }
 287 }
 288
 289 void x264_analyse_weight_frame( x264_t *h, int end )
 290 {
 291     for( int j = 0; j < h->i_ref0; j++ )
 292     {
 293         if( h->sh.weight[j][0].weightfn )
 294         {
 295             x264_frame_t *frame = h->fref0[j];
 296             int width = frame->i_width[0] + 2*PADH;
 297             int i_padv = PADV << h->param.b_interlaced;
 298             int offset, height;
 299             uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 300             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 301             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 302             h->fenc->i_lines_weighted += height;
 303             if( height )
 304                 for( int k = j; k < h->i_ref0; k++ )
 305                     if( h->sh.weight[k][0].weightfn )
 306                     {
 307                         uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 308                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 309                                                  src + offset, frame->i_stride[0],
 310                                                  width, height, &h->sh.weight[k][0] );
 311                     }
 312             break;
 313         }
 314     }
 315 }
 316
 317 /* initialize an array of lambda*nbits for all possible mvs */
 318 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 319 {
 320     a->p_cost_mv = h->cost_mv[a->i_lambda];
 321     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 322     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 323 }
 324
 325 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 326 {
 327     /* conduct the analysis using this lamda and QP */
 328     a->i_qp = h->mb.i_qp = i_qp;
 329     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 330
 331     a->i_lambda = x264_lambda_tab[i_qp];
 332     a->i_lambda2 = x264_lambda2_tab[i_qp];
 333
 334     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 335     if( h->param.analyse.i_trellis )
 336     {
 337         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 338         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 339         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 340         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 341     }
 342     h->mb.i_psy_rd_lambda = a->i_lambda;
 343     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 344     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 345 }
 346
/* Prepare the analysis context `a` and the per-macroblock state in `h` for
 * analysing the current macroblock at QP i_qp.
 *
 * Picks the RD level (a->i_mbrd), loads QP-dependent values, and resets the
 * intra costs to COST_MAX.  For non-I slices it also computes the motion
 * vector limits, resets the inter costs, and decides whether the intra search
 * may take shortcuts (b_fast_intra) or is mandatory (b_force_intra).
 * Note that b_force_intra is only written on the non-I path. */
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    /* B slices are analysed one subpel-refine level lower. */
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    /* The last term tests the unadjusted setting, not subme. */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);

    x264_mb_analyse_init_qp( h, a, i_qp );

    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_i8x8chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

    a->b_fast_intra = 0;
    /* 0 when lossless, 2 when any RD level is active, otherwise 1 only if
     * both trellis and noise reduction are off. */
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        /* The *_spel limits are in units of 1/4 of the fpel limits (see the >>2 below). */
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        /* Horizontal limits depend on the macroblock column and are recomputed
         * for every macroblock. */
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        /* Vertical limits are only recomputed at the first macroblock of a row. */
        if( h->mb.i_mb_x == 0 )
        {
            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                /* Visit list 1 (B slices only) and then list 0. */
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                {
                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
                    int i_ref = i ? h->i_ref1 : h->i_ref0;
                    for( int j = 0; j < i_ref; j++ )
                    {
                        /* NOTE(review): presumably blocks until the reference has
                         * completed at least `thresh` lines -- confirm. */
                        x264_frame_cond_wait( fref[j]->orig, thresh );
                        /* Never reach below what the reference has completed. */
                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
                    }
                }

                /* In deterministic mode the measured progress is discarded in
                 * favour of the fixed configured range. */
                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( h->mb.b_interlaced )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
        }
#undef CLIP_FMV

        /* Mark every inter cost as "not evaluated yet". */
        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        /* Not attempted for the first five macroblocks of a slice. */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            if( h->mb.i_subpel_refine > 2 &&
              ( IS_INTRA( h->mb.i_mb_type_left ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        /* Macroblocks inside the refresh columns of a P slice must be intra,
         * so the fast-intra shortcut is switched off for them. */
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
        }
        else
            a->b_force_intra = 0;
    }
}
 502
/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
/* Rows are selected by predict_16x16_mode_available().  Only the last row
 * offers the plane mode (I_PRED_16x16_P). */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};
 514
/* Chroma 8x8 prediction modes per neighbor combination, each row terminated
 * by -1.  Row order follows i16x16_mode_available: no neighbors, left, top,
 * top/left, top/left/topleft.  Rows are selected by
 * predict_8x8chroma_mode_available(). */
static const int8_t i8x8chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};
 523
/* 4x4 prediction modes per neighbor combination, each row terminated by -1.
 * Row order follows i16x16_mode_available: no neighbors, left, top, top/left,
 * top/left/topleft.  Rows are selected by predict_4x4_mode_available(); the
 * 8x8 intra search in this file uses the same table. */
static const int8_t i4x4_mode_available[5][10] =
{
    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
};
 532
 533 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
 534 {
 535     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 536     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 537 }
 538
 539 static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 540 {
 541     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 542     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 543 }
 544
 545 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
 546 {
 547     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 548     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 549 }
 550
 551 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 552 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 553 {
 554     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 555
 556     if( do_both_dct || h->mb.b_transform_8x8 )
 557         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 558     if( do_both_dct || !h->mb.b_transform_8x8 )
 559         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 560 }
 561
 562 /* Reset fenc satd scores cache for psy RD */
 563 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 564 {
 565     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 566         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 567     if( !h->mb.i_psy_rd )
 568         return;
 569     /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
 570     h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
 571     if( b_satd )
 572         h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 573 }
 574
 575 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 576 {
 577     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 578
 579     if( a->i_satd_i8x8chroma < COST_MAX )
 580         return;
 581
 582     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 583
 584     /* 8x8 prediction selection for chroma */
 585     if( predict_mode[3] >= 0 && b_merged_satd )
 586     {
 587         int satdu[4], satdv[4];
 588         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 589         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 590         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 591         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 592         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 593         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 594
 595         for( ; *predict_mode >= 0; predict_mode++ )
 596         {
 597             int i_mode = *predict_mode;
 598             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 599
 600             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 601             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 602         }
 603     }
 604     else
 605     {
 606         for( ; *predict_mode >= 0; predict_mode++ )
 607         {
 608             int i_satd;
 609             int i_mode = *predict_mode;
 610
 611             /* we do the prediction */
 612             if( h->mb.b_lossless )
 613                 x264_predict_lossless_8x8_chroma( h, i_mode );
 614             else
 615             {
 616                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 617                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 618             }
 619
 620             /* we calculate the cost */
 621             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 622                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 623                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 624
 625             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 626             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 627         }
 628     }
 629
 630     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 631 }
 632
 633 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 634 {
 635     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 636     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 637     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 638
 639     int idx;
 640     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 641
 642     /*---------------- Try all mode and calculate their score ---------------*/
 643
 644     /* 16x16 prediction selection */
 645     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 646
 647     if( b_merged_satd && predict_mode[3] >= 0 )
 648     {
 649         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 650         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 651         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 652             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 653         for( int i = 0; i < 4; i++ )
 654         {
 655             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 656             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 657         }
 658     }
 659     else
 660     {
 661         for( ; *predict_mode >= 0; predict_mode++ )
 662         {
 663             int i_satd;
 664             int i_mode = *predict_mode;
 665
 666             if( h->mb.b_lossless )
 667                 x264_predict_lossless_16x16( h, i_mode );
 668             else
 669                 h->predict_16x16[i_mode]( p_dst );
 670
 671             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 672                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 673             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 674             a->i_satd_i16x16_dir[i_mode] = i_satd;
 675         }
 676     }
 677
 678     if( h->sh.i_type == SLICE_TYPE_B )
 679         /* cavlc mb type prefix */
 680         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 681
 682     /* Not heavily tuned */
 683     const uint8_t i16x16_thresh[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
 684     if( a->b_fast_intra && a->i_satd_i16x16 > (i16x16_thresh[h->mb.i_subpel_refine]*i_satd_inter)>>1 )
 685         return;
 686
 687     /* 8x8 prediction selection */
 688     if( flags & X264_ANALYSE_I8x8 )
 689     {
 690         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 691         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 692         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 693
 694         // FIXME some bias like in i4x4?
 695         int i_cost = a->i_lambda * 4; /* base predmode costs */
 696         h->mb.i_cbp_luma = 0;
 697         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 698
 699         if( h->sh.i_type == SLICE_TYPE_B )
 700             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 701
 702         for( idx = 0;; idx++ )
 703         {
 704             int x = idx&1;
 705             int y = idx>>1;
 706             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 707             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 708             int i_best = COST_MAX;
 709             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 710
 711             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 712             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 713
 714             if( b_merged_satd && predict_mode[8] >= 0 )
 715             {
 716                 int satd[9];
 717                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 718                 satd[i_pred_mode] -= 3 * a->i_lambda;
 719                 for( int i = 2; i >= 0; i-- )
 720                 {
 721                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i];
 722                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 723                 }
 724                 predict_mode += 3;
 725             }
 726
 727             for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
 728             {
 729                 int i_satd;
 730                 int i_mode = *predict_mode;
 731
 732                 if( h->mb.b_lossless )
 733                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 734                 else
 735                     h->predict_8x8[i_mode]( p_dst_by, edge );
 736
 737                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 738                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 739                     i_satd -= 3 * a->i_lambda;
 740
 741                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 742                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * a->i_lambda;
 743             }
 744             i_cost += i_best + 3 * a->i_lambda;
 745
 746             if( idx == 3 || i_cost > i_satd_thresh )
 747                 break;
 748
 749             /* we need to encode this block now (for next ones) */
 750             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 751             x264_mb_encode_i8x8( h, idx, a->i_qp );
 752
 753             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 754         }
 755
 756         if( idx == 3 )
 757         {
 758             a->i_satd_i8x8 = i_cost;
 759             if( h->mb.i_skip_intra )
 760             {
 761                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 762                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 763                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 764                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 765                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 766                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 767                 if( h->mb.i_skip_intra == 2 )
 768                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 769             }
 770         }
 771         else
 772         {
 773             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 774             a->i_satd_i8x8 = COST_MAX;
 775             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 776         }
 777         /* Not heavily tuned */
 778         const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
 779         if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
 780             return;
 781     }
 782
 783     /* 4x4 prediction selection */
 784     if( flags & X264_ANALYSE_I4x4 )
 785     {
 786         int i_cost = a->i_lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
 787         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 788         h->mb.i_cbp_luma = 0;
 789         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 790         if( a->i_mbrd )
 791             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 792
 793         if( h->sh.i_type == SLICE_TYPE_B )
 794             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 795
 796         for( idx = 0;; idx++ )
 797         {
 798             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 799             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 800             int i_best = COST_MAX;
 801             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 802
 803             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 804
 805             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 806                 /* emulate missing topright samples */
 807                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 808
 809             if( b_merged_satd && predict_mode[5] >= 0 )
 810             {
 811                 int satd[9];
 812                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 813                 satd[i_pred_mode] -= 3 * a->i_lambda;
 814                 for( int i = 2; i >= 0; i-- )
 815                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 816                 predict_mode += 3;
 817             }
 818
 819             if( i_best > 0 )
 820             {
 821                 for( ; *predict_mode >= 0; predict_mode++ )
 822                 {
 823                     int i_satd;
 824                     int i_mode = *predict_mode;
 825
 826                     if( h->mb.b_lossless )
 827                         x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 828                     else
 829                         h->predict_4x4[i_mode]( p_dst_by );
 830
 831                     i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 832                     if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 833                     {
 834                         i_satd -= a->i_lambda * 3;
 835                         if( i_satd <= 0 )
 836                         {
 837                             i_best = i_satd;
 838                             a->i_predict4x4[idx] = i_mode;
 839                             break;
 840                         }
 841                     }
 842
 843                     COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 844                 }
 845             }
 846             i_cost += i_best + 3 * a->i_lambda;
 847
 848             if( i_cost > i_satd_thresh || idx == 15 )
 849                 break;
 850
 851             /* we need to encode this block now (for next ones) */
 852             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 853             x264_mb_encode_i4x4( h, idx, a->i_qp );
 854
 855             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 856         }
 857         if( idx == 15 )
 858         {
 859             a->i_satd_i4x4 = i_cost;
 860             if( h->mb.i_skip_intra )
 861             {
 862                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 863                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 864                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 865                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 866                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 867                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 868                 if( h->mb.i_skip_intra == 2 )
 869                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 870             }
 871         }
 872         else
 873             a->i_satd_i4x4 = COST_MAX;
 874     }
 875 }
 876
 877 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 878 {
 879     if( a->i_satd_i16x16 <= i_satd_thresh )
 880     {
 881         h->mb.i_type = I_16x16;
 882         x264_analyse_update_cache( h, a );
 883         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 884     }
 885     else
 886         a->i_satd_i16x16 = COST_MAX;
 887
 888     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 889     {
 890         h->mb.i_type = I_4x4;
 891         x264_analyse_update_cache( h, a );
 892         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 893     }
 894     else
 895         a->i_satd_i4x4 = COST_MAX;
 896
 897     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 898     {
 899         h->mb.i_type = I_8x8;
 900         x264_analyse_update_cache( h, a );
 901         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 902         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 903     }
 904     else
 905         a->i_satd_i8x8 = COST_MAX;
 906 }
 907
 908 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 909 {
 910     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 911
 912     int x, y;
 913     uint64_t i_satd, i_best;
 914     h->mb.i_skip_intra = 0;
 915
 916     if( h->mb.i_type == I_16x16 )
 917     {
 918         int old_pred_mode = a->i_predict16x16;
 919         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 920         int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
 921         i_best = a->i_satd_i16x16;
 922         for( ; *predict_mode >= 0; predict_mode++ )
 923         {
 924             int i_mode = *predict_mode;
 925             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 926                 continue;
 927             h->mb.i_intra16x16_pred_mode = i_mode;
 928             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 929             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 930         }
 931     }
 932
 933     /* RD selection for chroma prediction */
 934     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 935     if( predict_mode[1] >= 0 )
 936     {
 937         int8_t predict_mode_sorted[4];
 938         int i_max;
 939         int i_thresh = a->i_satd_i8x8chroma * 5/4;
 940
 941         for( i_max = 0; *predict_mode >= 0; predict_mode++ )
 942         {
 943             int i_mode = *predict_mode;
 944             if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
 945                 predict_mode_sorted[i_max++] = i_mode;
 946         }
 947
 948         if( i_max > 0 )
 949         {
 950             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
 951             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
 952             /* the previous thing encoded was x264_intra_rd(), so the pixels and
 953              * coefs for the current chroma mode are still around, so we only
 954              * have to recount the bits. */
 955             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
 956             for( int i = 0; i < i_max; i++ )
 957             {
 958                 int i_mode = predict_mode_sorted[i];
 959                 if( h->mb.b_lossless )
 960                     x264_predict_lossless_8x8_chroma( h, i_mode );
 961                 else
 962                 {
 963                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 964                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 965                 }
 966                 /* if we've already found a mode that needs no residual, then
 967                  * probably any mode with a residual will be worse.
 968                  * so avoid dct on the remaining modes to improve speed. */
 969                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
 970                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
 971             }
 972             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 973             h->mb.i_cbp_chroma = i_cbp_chroma_best;
 974         }
 975     }
 976
 977     if( h->mb.i_type == I_4x4 )
 978     {
 979         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
 980         int i_nnz = 0;
 981         for( int idx = 0; idx < 16; idx++ )
 982         {
 983             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 984             i_best = COST_MAX64;
 985
 986             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 987
 988             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 989                 /* emulate missing topright samples */
 990                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 991
 992             for( ; *predict_mode >= 0; predict_mode++ )
 993             {
 994                 int i_mode = *predict_mode;
 995                 if( h->mb.b_lossless )
 996                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 997                 else
 998                     h->predict_4x4[i_mode]( p_dst_by );
 999                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1000
1001                 if( i_best > i_satd )
1002                 {
1003                     a->i_predict4x4[idx] = i_mode;
1004                     i_best = i_satd;
1005                     pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1006                     pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1007                     pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1008                     pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1009                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1010                 }
1011             }
1012
1013             M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1014             M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1015             M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1016             M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1017             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1018
1019             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1020         }
1021     }
1022     else if( h->mb.i_type == I_8x8 )
1023     {
1024         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1025         for( int idx = 0; idx < 4; idx++ )
1026         {
1027             uint64_t pels_h = 0;
1028             uint8_t pels_v[7];
1029             uint16_t i_nnz[2] = {0}; //shut up gcc
1030             uint8_t *p_dst_by;
1031             int cbp_luma_new = 0;
1032             int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1033
1034             i_best = COST_MAX64;
1035             x = idx&1;
1036             y = idx>>1;
1037
1038             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1039             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1040             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1041
1042             for( ; *predict_mode >= 0; predict_mode++ )
1043             {
1044                 int i_mode = *predict_mode;
1045                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1046                     continue;
1047
1048                 if( h->mb.b_lossless )
1049                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1050                 else
1051                     h->predict_8x8[i_mode]( p_dst_by, edge );
1052                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1053                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1054
1055                 if( i_best > i_satd )
1056                 {
1057                     a->i_predict8x8[idx] = i_mode;
1058                     cbp_luma_new = h->mb.i_cbp_luma;
1059                     i_best = i_satd;
1060
1061                     pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1062                     if( !(idx&1) )
1063                         for( int j = 0; j < 7; j++ )
1064                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1065                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1066                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1067                 }
1068             }
1069             a->i_cbp_i8x8_luma = cbp_luma_new;
1070             M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1071             if( !(idx&1) )
1072                 for( int j = 0; j < 7; j++ )
1073                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1074             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1075             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1076
1077             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1078         }
1079     }
1080 }
1081
/* Helper macros that fill in an x264_me_t before a motion search.
 *
 * They expand to several statements, so they must be used as a complete
 * statement inside a braced block.  LOAD_FENC and REF_COST rely on an
 * analysis context named `a` being in scope; LOAD_FENC, LOAD_HPELS and
 * LOAD_WPELS rely on the encoder handle being named `h`. */

/* Set the mv cost table, the strides and the three source plane pointers of
 * (m) for the block at luma offset (xoff,yoff) inside the planes in (src).
 * Planes 1 and 2 are addressed with halved offsets. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

/* Point the reference plane pointers of (m) at offset (xoff,yoff) of
 * reference (ref) in list (list).  Planes 0-3 of (src) use stride 0, planes
 * 4-5 use stride 1 with halved offsets.  p_fref_w initially aliases plane 0
 * and the weight is reset to weight_none; use LOAD_WPELS afterwards to
 * select a weighted plane.  Requires (m)->i_stride[] to be set already
 * (see LOAD_FENC). */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[(list)][(ref)][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \
    (m)->i_ref = (ref);

/* Override p_fref_w of (m) with the plane (src) at offset (xoff,yoff) and
 * select the weights of reference (ref) from the slice header.
 * The weights are looked up through the (ref) argument rather than through
 * whatever variable happens to be called i_ref at the call site.
 * (list) is accepted for symmetry with LOAD_HPELS but is not used. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[(ref)];

/* Cost of signalling reference (ref) of list (list), taken from the
 * analysis context's p_cost_ref table. */
#define REF_COST(list, ref) \
    (a->p_cost_ref[(list)][(ref)])
1107
1108 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1109 {
1110     x264_me_t m;
1111     int i_mvc;
1112     ALIGNED_4( int16_t mvc[8][2] );
1113     int i_halfpel_thresh = INT_MAX;
1114     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1115
1116     /* 16x16 Search on all ref frame */
1117     m.i_pixel = PIXEL_16x16;
1118     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1119
1120     a->l0.me16x16.cost = INT_MAX;
1121     for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1122     {
1123         m.i_ref_cost = REF_COST( 0, i_ref );
1124         i_halfpel_thresh -= m.i_ref_cost;
1125
1126         /* search with ref */
1127         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1128         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1129
1130         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1131
1132         if( h->mb.ref_blind_dupe == i_ref )
1133         {
1134             CP32( m.mv, a->l0.mvc[0][0] );
1135             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1136         }
1137         else
1138         {
1139             x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1140             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1141         }
1142
1143         /* save mv for predicting neighbors */
1144         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1145         CP32( a->l0.mvc[i_ref][0], m.mv );
1146
1147         /* early termination
1148          * SSD threshold would probably be better than SATD */
1149         if( i_ref == 0
1150             && a->b_try_skip
1151             && m.cost-m.cost_mv < 300*a->i_lambda
1152             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1153               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1154             && x264_macroblock_probe_pskip( h ) )
1155         {
1156             h->mb.i_type = P_SKIP;
1157             x264_analyse_update_cache( h, a );
1158             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1159             return;
1160         }
1161
1162         m.cost += m.i_ref_cost;
1163         i_halfpel_thresh += m.i_ref_cost;
1164
1165         if( m.cost < a->l0.me16x16.cost )
1166             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1167     }
1168
1169     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1170     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1171
1172     h->mb.i_type = P_L0;
1173     if( a->i_mbrd )
1174     {
1175         x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1176         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1177         {
1178             h->mb.i_partition = D_16x16;
1179             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1180             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1181             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1182                 h->mb.i_type = P_SKIP;
1183         }
1184     }
1185 }
1186
1187 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1188 {
1189     x264_me_t m;
1190     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1191     int i_maxref = h->mb.pic.i_fref[0]-1;
1192
1193     h->mb.i_partition = D_8x8;
1194
1195     #define CHECK_NEIGHBOUR(i)\
1196     {\
1197         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1198         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1199             i_maxref = ref;\
1200     }
1201
1202     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1203      * than those used by the neighbors */
1204     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1205         h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1206     {
1207         i_maxref = 0;
1208         CHECK_NEIGHBOUR(  -8 - 1 );
1209         CHECK_NEIGHBOUR(  -8 + 0 );
1210         CHECK_NEIGHBOUR(  -8 + 2 );
1211         CHECK_NEIGHBOUR(  -8 + 4 );
1212         CHECK_NEIGHBOUR(   0 - 1 );
1213         CHECK_NEIGHBOUR( 2*8 - 1 );
1214     }
1215     #undef CHECK_NEIGHBOUR
1216
1217     for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1218         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1219
1220     for( int i = 0; i < 4; i++ )
1221     {
1222         x264_me_t *l0m = &a->l0.me8x8[i];
1223         const int x8 = i%2;
1224         const int y8 = i/2;
1225
1226         m.i_pixel = PIXEL_8x8;
1227
1228         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1229         l0m->cost = INT_MAX;
1230         for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1231         {
1232             m.i_ref_cost = REF_COST( 0, i_ref );
1233
1234             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1235             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1236
1237             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1238             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1239             if( h->mb.ref_blind_dupe == i_ref )
1240             {
1241                 CP32( m.mv, a->l0.mvc[0][i+1] );
1242                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1243             }
1244             else
1245                 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1246
1247             m.cost += m.i_ref_cost;
1248
1249             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1250
1251             if( m.cost < l0m->cost )
1252                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1253             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1254                 i_ref = h->mb.ref_blind_dupe;
1255             else
1256                 i_ref++;
1257         }
1258         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1259         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1260
1261         a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1262
1263         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1264            are effectively zero. */
1265         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1266             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1267     }
1268
1269     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1270                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1271     /* P_8x8 ref0 has no ref cost */
1272     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1273                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1274         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1275     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1276     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1277 }
1278
1279 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1280 {
1281     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1282      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1283      * don't bother analysing the dupes. */
1284     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1285     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1286     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1287     int i_mvc;
1288     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1289
1290     /* XXX Needed for x264_mb_predict_mv */
1291     h->mb.i_partition = D_8x8;
1292
1293     i_mvc = 1;
1294     CP32( mvc[0], a->l0.me16x16.mv );
1295
1296     for( int i = 0; i < 4; i++ )
1297     {
1298         x264_me_t *m = &a->l0.me8x8[i];
1299         const int x8 = i%2;
1300         const int y8 = i/2;
1301
1302         m->i_pixel = PIXEL_8x8;
1303         m->i_ref_cost = i_ref_cost;
1304
1305         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1306         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1307         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1308
1309         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1310         x264_me_search( h, m, mvc, i_mvc );
1311
1312         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1313
1314         CP32( mvc[i_mvc], m->mv );
1315         i_mvc++;
1316
1317         a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1318
1319         /* mb type cost */
1320         m->cost += i_ref_cost;
1321         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1322             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1323     }
1324
1325     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1326                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1327     /* theoretically this should include 4*ref_cost,
1328      * but 3 seems a better approximation of cabac. */
1329     if( h->param.b_cabac )
1330         a->l0.i_cost8x8 -= i_ref_cost;
1331     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1332     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1333 }
1334
1335 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1336 {
1337     x264_me_t m;
1338     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1339     ALIGNED_4( int16_t mvc[3][2] );
1340
1341     /* XXX Needed for x264_mb_predict_mv */
1342     h->mb.i_partition = D_16x8;
1343
1344     for( int i = 0; i < 2; i++ )
1345     {
1346         x264_me_t *l0m = &a->l0.me16x8[i];
1347         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1348         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1349         const int ref8[2] = { minref, maxref };
1350         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1351
1352         m.i_pixel = PIXEL_16x8;
1353
1354         LOAD_FENC( &m, p_fenc, 0, 8*i );
1355         l0m->cost = INT_MAX;
1356         for( int j = 0; j < i_ref8s; j++ )
1357         {
1358             const int i_ref = ref8[j];
1359             m.i_ref_cost = REF_COST( 0, i_ref );
1360
1361             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1362             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1363             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1364             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1365
1366             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1367             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1368
1369             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1370             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1371             /* We can only take this shortcut if the first search was performed on ref0. */
1372             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1373             {
1374                 /* We can just leave the MV from the previous ref search. */
1375                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1376             }
1377             else
1378                 x264_me_search( h, &m, mvc, 3 );
1379
1380             m.cost += m.i_ref_cost;
1381
1382             if( m.cost < l0m->cost )
1383                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1384         }
1385
1386         /* Early termination based on the current SATD score of partition[0]
1387            plus the estimated SATD score of partition[1] */
1388         if( !i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1389         {
1390             a->l0.i_cost16x8 = COST_MAX;
1391             return;
1392         }
1393
1394         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1395         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1396     }
1397
1398     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1399 }
1400
1401 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1402 {
1403     x264_me_t m;
1404     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1405     ALIGNED_4( int16_t mvc[3][2] );
1406
1407     /* XXX Needed for x264_mb_predict_mv */
1408     h->mb.i_partition = D_8x16;
1409
1410     for( int i = 0; i < 2; i++ )
1411     {
1412         x264_me_t *l0m = &a->l0.me8x16[i];
1413         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1414         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1415         const int ref8[2] = { minref, maxref };
1416         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1417
1418         m.i_pixel = PIXEL_8x16;
1419
1420         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1421         l0m->cost = INT_MAX;
1422         for( int j = 0; j < i_ref8s; j++ )
1423         {
1424             const int i_ref = ref8[j];
1425             m.i_ref_cost = REF_COST( 0, i_ref );
1426
1427             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1428             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1429             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1430
1431             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1432             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1433
1434             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1435             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1436             /* We can only take this shortcut if the first search was performed on ref0. */
1437             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1438             {
1439                 /* We can just leave the MV from the previous ref search. */
1440                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1441             }
1442             else
1443                 x264_me_search( h, &m, mvc, 3 );
1444
1445             m.cost += m.i_ref_cost;
1446
1447             if( m.cost < l0m->cost )
1448                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1449         }
1450
1451         /* Early termination based on the current SATD score of partition[0]
1452            plus the estimated SATD score of partition[1] */
1453         if( !i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1454         {
1455             a->l0.i_cost8x16 = COST_MAX;
1456             return;
1457         }
1458
1459         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1460         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1461     }
1462
1463     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1464 }
1465
1466 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1467 {
1468     ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1469     uint8_t *pix2 = pix1+8;
1470     const int i_stride = h->mb.pic.i_stride[1];
1471     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1472     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1473     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1474     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1475     x264_weight_t *weight = h->sh.weight[i_ref];
1476
1477 #define CHROMA4x4MC( width, height, me, x, y ) \
1478     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1479     if( weight[1].weightfn ) \
1480         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1481     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1482     if( weight[2].weightfn ) \
1483         weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1484
1485
1486     if( pixel == PIXEL_4x4 )
1487     {
1488         x264_me_t *m = a->l0.me4x4[i8x8];
1489         CHROMA4x4MC( 2,2, m[0], 0,0 );
1490         CHROMA4x4MC( 2,2, m[1], 2,0 );
1491         CHROMA4x4MC( 2,2, m[2], 0,2 );
1492         CHROMA4x4MC( 2,2, m[3], 2,2 );
1493     }
1494     else if( pixel == PIXEL_8x4 )
1495     {
1496         x264_me_t *m = a->l0.me8x4[i8x8];
1497         CHROMA4x4MC( 4,2, m[0], 0,0 );
1498         CHROMA4x4MC( 4,2, m[1], 0,2 );
1499     }
1500     else
1501     {
1502         x264_me_t *m = a->l0.me4x8[i8x8];
1503         CHROMA4x4MC( 2,4, m[0], 0,0 );
1504         CHROMA4x4MC( 2,4, m[1], 2,0 );
1505     }
1506
1507     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1508          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1509 }
1510
1511 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1512 {
1513     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1514     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1515     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1516
1517     /* XXX Needed for x264_mb_predict_mv */
1518     h->mb.i_partition = D_8x8;
1519
1520     for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1521     {
1522         const int idx = 4*i8x8 + i4x4;
1523         const int x4 = block_idx_x[idx];
1524         const int y4 = block_idx_y[idx];
1525         const int i_mvc = (i4x4 == 0);
1526
1527         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1528
1529         m->i_pixel = PIXEL_4x4;
1530
1531         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1532         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1533         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1534
1535         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1536         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1537
1538         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1539     }
1540     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1541                             a->l0.me4x4[i8x8][1].cost +
1542                             a->l0.me4x4[i8x8][2].cost +
1543                             a->l0.me4x4[i8x8][3].cost +
1544                             REF_COST( 0, i_ref ) +
1545                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1546     if( h->mb.b_chroma_me )
1547         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1548 }
1549
1550 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1551 {
1552     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1553     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1554     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1555
1556     /* XXX Needed for x264_mb_predict_mv */
1557     h->mb.i_partition = D_8x8;
1558
1559     for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1560     {
1561         const int idx = 4*i8x8 + 2*i8x4;
1562         const int x4 = block_idx_x[idx];
1563         const int y4 = block_idx_y[idx];
1564         const int i_mvc = (i8x4 == 0);
1565
1566         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1567
1568         m->i_pixel = PIXEL_8x4;
1569
1570         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1571         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1572         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1573
1574         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1575         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1576
1577         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1578     }
1579     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1580                             REF_COST( 0, i_ref ) +
1581                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1582     if( h->mb.b_chroma_me )
1583         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1584 }
1585
1586 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1587 {
1588     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1589     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1590     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1591
1592     /* XXX Needed for x264_mb_predict_mv */
1593     h->mb.i_partition = D_8x8;
1594
1595     for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1596     {
1597         const int idx = 4*i8x8 + i4x8;
1598         const int x4 = block_idx_x[idx];
1599         const int y4 = block_idx_y[idx];
1600         const int i_mvc = (i4x8 == 0);
1601
1602         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1603
1604         m->i_pixel = PIXEL_4x8;
1605
1606         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1607         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1608         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1609
1610         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1611         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1612
1613         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1614     }
1615     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1616                             REF_COST( 0, i_ref ) +
1617                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1618     if( h->mb.b_chroma_me )
1619         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1620 }
1621
1622 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1623 {
1624     /* Assumes that fdec still contains the results of
1625      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1626
1627     uint8_t *p_fenc = h->mb.pic.p_fenc[0];
1628     uint8_t *p_fdec = h->mb.pic.p_fdec[0];
1629
1630     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1631     if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1632         for( int i = 0; i < 4; i++ )
1633         {
1634             const int x = (i&1)*8;
1635             const int y = (i>>1)*8;
1636             a->i_cost16x16direct +=
1637             a->i_cost8x8direct[i] =
1638                 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1639
1640             /* mb type cost */
1641             a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1642         }
1643     else
1644         a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
1645 }
1646
1647 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1648 {
1649     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1650     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1651     uint8_t *src0, *src1;
1652     int stride0 = 16, stride1 = 16;
1653     int i_ref, i_mvc;
1654     ALIGNED_4( int16_t mvc[9][2] );
1655     int try_skip = a->b_try_skip;
1656     int list1_skipped = 0;
1657     int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1658     int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
1659                                 h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};
1660
1661     x264_me_t m;
1662     m.i_pixel = PIXEL_16x16;
1663
1664     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1665
1666     /* 16x16 Search on list 0 and list 1 */
1667     a->l0.me16x16.cost = INT_MAX;
1668     a->l1.me16x16.cost = INT_MAX;
1669     for( int l = 1; l >= 0; )
1670     {
1671         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1672
1673         /* This loop is extremely munged in order to facilitate the following order of operations,
1674          * necessary for an efficient fast skip.
1675          * 1.  Search list1 ref0.
1676          * 2.  Search list0 ref0.
1677          * 3.  Try skip.
1678          * 4.  Search the rest of list0.
1679          * 5.  Go back and finish list1.
1680          */
1681         for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1682         {
1683             if( try_skip && l == 1 && i_ref > 0 )
1684             {
1685                 list1_skipped = 1;
1686                 break;
1687             }
1688
1689             m.i_ref_cost = REF_COST( l, i_ref );
1690
1691             /* search with ref */
1692             LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1693             x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1694             x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1695             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
1696
1697             /* add ref cost */
1698             m.cost += m.i_ref_cost;
1699
1700             if( m.cost < lX->me16x16.cost )
1701                 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1702
1703             /* save mv for predicting neighbors */
1704             CP32( lX->mvc[i_ref][0], m.mv );
1705             CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1706
1707             /* Fast skip detection. */
1708             if( i_ref == 0 && try_skip )
1709             {
1710                 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
1711                     abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
1712                 {
1713                     try_skip = 0;
1714                 }
1715                 else if( !l )
1716                 {
1717                     /* We already tested skip */
1718                     h->mb.i_type = B_SKIP;
1719                     x264_analyse_update_cache( h, a );
1720                     return;
1721                 }
1722             }
1723         }
1724         if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
1725             break;
1726         if( list1_skipped && l == 0 )
1727             l = 1;
1728         else
1729             l--;
1730     }
1731
1732     /* get cost of BI mode */
1733     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1734     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1735     int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1736     src0 = h->mc.get_ref( pix0, &stride0,
1737                           h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1738                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1739     src1 = h->mc.get_ref( pix1, &stride1,
1740                           h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1741                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1742
1743     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1744
1745     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1746                      + ref_costs
1747                      + a->l0.bi16x16.cost_mv
1748                      + a->l1.bi16x16.cost_mv;
1749
1750     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1751     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1752     {
1753         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1754                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1755         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1756                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1757         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1758                                 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1759                                 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1760         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1761                    + ref_costs + l0_mv_cost + l1_mv_cost;
1762         if( cost00 < a->i_cost16x16bi )
1763         {
1764             M32( a->l0.bi16x16.mv ) = 0;
1765             M32( a->l1.bi16x16.mv ) = 0;
1766             a->l0.bi16x16.cost_mv = l0_mv_cost;
1767             a->l1.bi16x16.cost_mv = l1_mv_cost;
1768             a->i_cost16x16bi = cost00;
1769         }
1770     }
1771
1772     /* mb type cost */
1773     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1774     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1775     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1776 }
1777
1778 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1779 {
1780     const int x = 2*(i%2);
1781     const int y = 2*(i/2);
1782
1783     switch( h->mb.i_sub_partition[i] )
1784     {
1785         case D_L0_8x8:
1786             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1787             break;
1788         case D_L0_8x4:
1789             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1790             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1791             break;
1792         case D_L0_4x8:
1793             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1794             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1795             break;
1796         case D_L0_4x4:
1797             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1798             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1799             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1800             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1801             break;
1802         default:
1803             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1804             break;
1805     }
1806 }
1807
1808 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1809 {
1810     const int x = 2*(idx&1);
1811     const int y = 2*(idx>>1);
1812     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1813     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1814     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1815     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1816 }
1817
/* Store the reference index and MV of one B partition in the cache.
 *
 * x,y,dx,dy  position and size of the partition, in 4x4 block units
 * me0, me1   motion estimation results for list 0 and list 1
 * part       partition type; x264_mb_partition_listX_table[list][part]
 *            tells whether the partition uses that list
 *
 * For a list that is not used, the reference is set to -1 and the MV to 0;
 * the MVD is cleared as well when b_mvd is non-zero.
 *
 * The macro relies on `h` and `b_mvd` being in scope at the call site.
 * It is wrapped in do { } while( 0 ) so that it expands to exactly one
 * statement and can be used safely as an unbraced if/else body. */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
do { \
    if( x264_mb_partition_listX_table[0][(part)] ) \
    { \
        x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 0, (me0).i_ref ); \
        x264_macroblock_cache_mv_ptr( h, (x),(y),(dx),(dy), 0, (me0).mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 0, -1 ); \
        x264_macroblock_cache_mv(  h, (x),(y),(dx),(dy), 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, (x),(y),(dx),(dy), 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][(part)] ) \
    { \
        x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 1, (me1).i_ref ); \
        x264_macroblock_cache_mv_ptr( h, (x),(y),(dx),(dy), 1, (me1).mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, (x),(y),(dx),(dy), 1, -1 ); \
        x264_macroblock_cache_mv(  h, (x),(y),(dx),(dy), 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, (x),(y),(dx),(dy), 1, 0 ); \
    } \
} while( 0 )
1843
1844 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1845 {
1846     int x = (i%2)*2;
1847     int y = (i/2)*2;
1848     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1849     {
1850         x264_mb_load_mv_direct8x8( h, i );
1851         if( b_mvd )
1852         {
1853             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1854             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1855             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1856         }
1857     }
1858     else
1859     {
1860         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1861     }
1862 }
1863 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1864 {
1865     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1866 }
1867 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1868 {
1869     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1870 }
1871 #undef CACHE_MV_BI
1872
1873 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1874 {
1875     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1876     int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1877
1878     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1879      * than those used by the neighbors */
1880     #define CHECK_NEIGHBOUR(i)\
1881     {\
1882         int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1883         if( ref > i_maxref[l] )\
1884             i_maxref[l] = ref;\
1885     }
1886
1887     for( int l = 0; l < 2; l++ )
1888     {
1889         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1890         if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1891             h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1892         {
1893             i_maxref[l] = 0;
1894             CHECK_NEIGHBOUR(  -8 - 1 );
1895             CHECK_NEIGHBOUR(  -8 + 0 );
1896             CHECK_NEIGHBOUR(  -8 + 2 );
1897             CHECK_NEIGHBOUR(  -8 + 4 );
1898             CHECK_NEIGHBOUR(   0 - 1 );
1899             CHECK_NEIGHBOUR( 2*8 - 1 );
1900         }
1901     }
1902
1903     /* XXX Needed for x264_mb_predict_mv */
1904     h->mb.i_partition = D_8x8;
1905
1906     a->i_cost8x8bi = 0;
1907
1908     for( int i = 0; i < 4; i++ )
1909     {
1910         int x8 = i%2;
1911         int y8 = i/2;
1912         int i_part_cost;
1913         int i_part_cost_bi;
1914         int stride[2] = {8,8};
1915         uint8_t *src[2];
1916         x264_me_t m;
1917         m.i_pixel = PIXEL_8x8;
1918         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1919
1920         for( int l = 0; l < 2; l++ )
1921         {
1922             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1923
1924             lX->me8x8[i].cost = INT_MAX;
1925             for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
1926             {
1927                 m.i_ref_cost = REF_COST( l, i_ref );;
1928
1929                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1930
1931                 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1932                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1933                 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1934                 m.cost += m.i_ref_cost;
1935
1936                 if( m.cost < lX->me8x8[i].cost )
1937                 {
1938                     h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1939                     a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
1940                 }
1941
1942                 /* save mv for predicting other partitions within this MB */
1943                 CP32( lX->mvc[i_ref][i+1], m.mv );
1944             }
1945         }
1946
1947         /* BI mode */
1948         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1949                                 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1950         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1951                                 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1952         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1953                                 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1954
1955         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1956         i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
1957                          + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
1958                          + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1959
1960         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1961         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1962
1963         i_part_cost = a->l0.me8x8[i].cost;
1964         h->mb.i_sub_partition[i] = D_L0_8x8;
1965         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1966         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1967         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1968         a->i_cost8x8bi += i_part_cost;
1969
1970         /* XXX Needed for x264_mb_predict_mv */
1971         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1972     }
1973
1974     /* mb type cost */
1975     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1976 }
1977
1978 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1979 {
1980     uint8_t **p_fref[2] =
1981         { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
1982           h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
1983     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1984
1985     /* XXX Needed for x264_mb_predict_mv */
1986     h->mb.i_partition = D_8x8;
1987
1988     a->i_cost8x8bi = 0;
1989
1990     for( int i = 0; i < 4; i++ )
1991     {
1992         const int x8 = i%2;
1993         const int y8 = i/2;
1994         int i_part_cost;
1995         int i_part_cost_bi = 0;
1996         int stride[2] = {8,8};
1997         uint8_t *src[2];
1998
1999         for( int l = 0; l < 2; l++ )
2000         {
2001             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2002             x264_me_t *m = &lX->me8x8[i];
2003             m->i_pixel = PIXEL_8x8;
2004             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2005
2006             m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2007             m->i_ref = lX->me16x16.i_ref;
2008
2009             LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2010
2011             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2012             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2013             x264_me_search( h, m, &lX->me16x16.mv, 1 );
2014             a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2015             m->cost += m->i_ref_cost;
2016
2017             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2018
2019             /* save mv for predicting other partitions within this MB */
2020             CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2021
2022             /* BI mode */
2023             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2024                                     m->mv[0], m->mv[1], 8, 8, weight_none );
2025             i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2026         }
2027         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2028         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2029         i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2030         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2031         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2032
2033         i_part_cost = a->l0.me8x8[i].cost;
2034         h->mb.i_sub_partition[i] = D_L0_8x8;
2035         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2036         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2037         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2038         a->i_cost8x8bi += i_part_cost;
2039
2040         /* XXX Needed for x264_mb_predict_mv */
2041         x264_mb_cache_mv_b8x8( h, a, i, 0 );
2042     }
2043
2044     /* mb type cost */
2045     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2046 }
2047
2048 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2049 {
2050     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
2051     ALIGNED_4( int16_t mvc[3][2] );
2052
2053     h->mb.i_partition = D_16x8;
2054     a->i_cost16x8bi = 0;
2055
2056     for( int i = 0; i < 2; i++ )
2057     {
2058         int i_part_cost;
2059         int i_part_cost_bi = 0;
2060         int stride[2] = {16,16};
2061         uint8_t *src[2];
2062         x264_me_t m;
2063         m.i_pixel = PIXEL_16x8;
2064         LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2065
2066         for( int l = 0; l < 2; l++ )
2067         {
2068             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2069             int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2070             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2071             lX->me16x8[i].cost = INT_MAX;
2072             for( int j = 0; j < i_ref8s; j++ )
2073             {
2074                 int i_ref = ref8[j];
2075                 m.i_ref_cost = REF_COST( l, i_ref );;
2076
2077                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2078
2079                 CP32( mvc[0], lX->mvc[i_ref][0] );
2080                 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2081                 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2082
2083                 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2084                 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2085                 x264_me_search( h, &m, mvc, 3 );
2086                 m.cost += m.i_ref_cost;
2087
2088                 if( m.cost < lX->me16x8[i].cost )
2089                     h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2090             }
2091         }
2092
2093         /* BI mode */
2094         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2095                                 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2096         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2097                                 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2098         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2099                                 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2100
2101         i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2102                         + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2103                         + a->l1.me16x8[i].i_ref_cost;
2104
2105         i_part_cost = a->l0.me16x8[i].cost;
2106         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2107
2108         if( a->l1.me16x8[i].cost < i_part_cost )
2109         {
2110             i_part_cost = a->l1.me16x8[i].cost;
2111             a->i_mb_partition16x8[i] = D_L1_8x8;
2112         }
2113         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2114         {
2115             i_part_cost = i_part_cost_bi;
2116             a->i_mb_partition16x8[i] = D_BI_8x8;
2117         }
2118         a->i_cost16x8bi += i_part_cost;
2119
2120         /* Early termination based on the current SATD score of partition[0]
2121            plus the estimated SATD score of partition[1] */
2122         if( !i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2123             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2124         {
2125             a->i_cost16x8bi = COST_MAX;
2126             return;
2127         }
2128
2129         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2130     }
2131
2132     /* mb type cost */
2133     a->i_mb_type16x8 = B_L0_L0
2134         + (a->i_mb_partition16x8[0]>>2) * 3
2135         + (a->i_mb_partition16x8[1]>>2);
2136     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2137 }
2138
/* Analyse the 8x16 partitioning of a B macroblock.
 *
 * For each of the two partitions (i = 0, 1; source pixels loaded at x offset
 * 8*i) and for each list (l0, l1), x264_me_search() is run for the
 * reference(s) already selected by lX->me8x8[i] and lX->me8x8[i+2].  The
 * cheapest result (search cost plus reference cost) is kept in
 * lX->me8x16[i].  A bi-predicted candidate is then built by averaging the two
 * single-list predictions, and the partition is labelled D_L0_8x8, D_L1_8x8
 * or D_BI_8x8 in a->i_mb_partition8x16[i].
 *
 * Results:
 *   a->i_cost8x16bi         sum of the chosen partition costs plus the
 *                           lambda-weighted mb type cost, or COST_MAX when
 *                           the early-termination test fires
 *   a->i_mb_partition8x16[] prediction direction of each partition
 *   a->i_mb_type8x16        resulting mb type (not written on early
 *                           termination)
 * h->mb.i_partition is set to D_8x16 on entry.
 *
 * i_best_satd is only read by the early-termination test (presumably the
 * best cost found so far by the caller -- confirm against the call site). */
2139 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2140 {
2141     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2142     ALIGNED_4( int16_t mvc[3][2] );
2143
2144     h->mb.i_partition = D_8x16;
2145     a->i_cost8x16bi = 0;
2146
2147     for( int i = 0; i < 2; i++ )
2148     {
2149         int i_part_cost;
2150         int i_part_cost_bi = 0;
             /* Passed by address to get_ref() below, together with the pix buffers. */
2151         int stride[2] = {8,8};
2152         uint8_t *src[2];
2153         x264_me_t m;
2154         m.i_pixel = PIXEL_8x16;
2155         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2156
2157         for( int l = 0; l < 2; l++ )
2158         {
2159             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
                 /* Only the references picked for 8x8 blocks i and i+2 are tried;
                  * a single search is enough when both picked the same one. */
2160             int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2161             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2162             lX->me8x16[i].cost = INT_MAX;
2163             for( int j = 0; j < i_ref8s; j++ )
2164             {
2165                 int i_ref = ref8[j];
2166                 m.i_ref_cost = REF_COST( l, i_ref );
2167
2168                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2169
                     /* Candidate MVs for the search: entry 0 of this reference's mvc
                      * table plus entries i+1 and i+3. */
2170                 CP32( mvc[0], lX->mvc[i_ref][0] );
2171                 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2172                 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2173
2174                 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2175                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2176                 x264_me_search( h, &m, mvc, 3 );
2177                 m.cost += m.i_ref_cost;
2178
                     /* Keep the cheapest search result for this list. */
2179                 if( m.cost < lX->me8x16[i].cost )
2180                     h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2181             }
2182         }
2183
2184         /* BI mode */
             /* Fetch both single-list predictions, blend them into pix[0] with the
              * bipred weight for this reference pair, then score the blend against
              * the source and add both lists' MV and reference costs. */
2185         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2186                                 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2187         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2188                                 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2189         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2190
2191         i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2192                         + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2193                         + a->l1.me8x16[i].i_ref_cost;
2194
             /* Start from L0, switch to L1 if strictly cheaper. */
2195         i_part_cost = a->l0.me8x16[i].cost;
2196         a->i_mb_partition8x16[i] = D_L0_8x8;
2197
2198         if( a->l1.me8x16[i].cost < i_part_cost )
2199         {
2200             i_part_cost = a->l1.me8x16[i].cost;
2201             a->i_mb_partition8x16[i] = D_L1_8x8;
2202         }
             /* BI is chosen only when it beats the best single-list cost by more
              * than a->i_lambda. */
2203         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2204         {
2205             i_part_cost = i_part_cost_bi;
2206             a->i_mb_partition8x16[i] = D_BI_8x8;
2207         }
2208         a->i_cost8x16bi += i_part_cost;
2209
2210         /* Early termination based on the current SATD score of partition[0]
2211            plus the estimated SATD score of partition[1] */
2212         if( !i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2213             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2214         {
2215             a->i_cost8x16bi = COST_MAX;
2216             return;
2217         }
2218
2219         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2220     }
2221
2222     /* mb type cost */
         /* The type is B_L0_L0 plus an offset built from the two partition labels;
          * the cost table indexed here carries the b16x8 name but is used with the
          * 8x16 type as well. */
2223     a->i_mb_type8x16 = B_L0_L0
2224         + (a->i_mb_partition8x16[0]>>2) * 3
2225         + (a->i_mb_partition8x16[1]>>2);
2226     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2227 }
2228
2229 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2230 {
2231     int thresh = i_satd * 5/4;
2232
2233     h->mb.i_type = P_L0;
2234     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2235     {
2236         h->mb.i_partition = D_16x16;
2237         x264_analyse_update_cache( h, a );
2238         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2239     }
2240
2241     if( a->l0.i_cost16x8 <= thresh )
2242     {
2243         h->mb.i_partition = D_16x8;
2244         x264_analyse_update_cache( h, a );
2245         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2246     }
2247     else
2248         a->l0.i_cost16x8 = COST_MAX;
2249
2250     if( a->l0.i_cost8x16 <= thresh )
2251     {
2252         h->mb.i_partition = D_8x16;
2253         x264_analyse_update_cache( h, a );
2254         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2255     }
2256     else
2257         a->l0.i_cost8x16 = COST_MAX;
2258
2259     if( a->l0.i_cost8x8 <= thresh )
2260     {
2261         h->mb.i_type = P_8x8;
2262         h->mb.i_partition = D_8x8;
2263         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2264         {
2265             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2266             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2267             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2268             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2269             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2270              * for future blocks are those left over from previous RDO calls. */
2271             for( int i = 0; i < 4; i++ )
2272             {
2273                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2274                 int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2275                 int subtype, btype = D_L0_8x8;
2276                 uint64_t bcost = COST_MAX64;
2277                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2278                 {
2279                     uint64_t cost;
2280                     if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2281                         continue;
2282                     h->mb.i_sub_partition[i] = subtype;
2283                     x264_mb_cache_mv_p8x8( h, a, i );
2284                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2285                     COPY2_IF_LT( bcost, cost, btype, subtype );
2286                 }
2287                 if( h->mb.i_sub_partition[i] != btype )
2288                 {
2289                     h->mb.i_sub_partition[i] = btype;
2290                     x264_mb_cache_mv_p8x8( h, a, i );
2291                 }
2292             }
2293         }
2294         else
2295             x264_analyse_update_cache( h, a );
2296         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2297     }
2298     else
2299         a->l0.i_cost8x8 = COST_MAX;
2300 }
2301
2302 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2303 {
2304     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2305
2306     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2307     {
2308         h->mb.i_type = B_DIRECT;
2309         /* Assumes direct/skip MC is still in fdec */
2310         /* Requires b-rdo to be done before intra analysis */
2311         h->mb.b_skip_mc = 1;
2312         x264_analyse_update_cache( h, a );
2313         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2314         h->mb.b_skip_mc = 0;
2315     }
2316
2317     //FIXME not all the update_cache calls are needed
2318     h->mb.i_partition = D_16x16;
2319     /* L0 */
2320     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2321     {
2322         h->mb.i_type = B_L0_L0;
2323         x264_analyse_update_cache( h, a );
2324         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2325     }
2326
2327     /* L1 */
2328     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2329     {
2330         h->mb.i_type = B_L1_L1;
2331         x264_analyse_update_cache( h, a );
2332         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2333     }
2334
2335     /* BI */
2336     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2337     {
2338         h->mb.i_type = B_BI_BI;
2339         x264_analyse_update_cache( h, a );
2340         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2341     }
2342
2343     /* 8x8 */
2344     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2345     {
2346         h->mb.i_type = B_8x8;
2347         h->mb.i_partition = D_8x8;
2348         x264_analyse_update_cache( h, a );
2349         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2350         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2351     }
2352
2353     /* 16x8 */
2354     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2355     {
2356         h->mb.i_type = a->i_mb_type16x8;
2357         h->mb.i_partition = D_16x8;
2358         x264_analyse_update_cache( h, a );
2359         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2360     }
2361
2362     /* 8x16 */
2363     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2364     {
2365         h->mb.i_type = a->i_mb_type8x16;
2366         h->mb.i_partition = D_8x16;
2367         x264_analyse_update_cache( h, a );
2368         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2369     }
2370 }
2371
2372 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2373 {
2374     int i_biweight;
2375
2376     if( IS_INTRA(h->mb.i_type) )
2377         return;
2378
2379     switch( h->mb.i_partition )
2380     {
2381         case D_16x16:
2382             if( h->mb.i_type == B_BI_BI )
2383             {
2384                 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2385                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2386             }
2387             break;
2388         case D_16x8:
2389             for( int i = 0; i < 2; i++ )
2390                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2391                 {
2392                     i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2393                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2394                 }
2395             break;
2396         case D_8x16:
2397             for( int i = 0; i < 2; i++ )
2398                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2399                 {
2400                     i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2401                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2402                 }
2403             break;
2404         case D_8x8:
2405             for( int i = 0; i < 4; i++ )
2406                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2407                 {
2408                     i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2409                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2410                 }
2411             break;
2412     }
2413 }
2414
2415 static inline void x264_mb_analyse_transform( x264_t *h )
2416 {
2417     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2418     {
2419         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2420         x264_mb_mc( h );
2421
2422         int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2423                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2424         int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2425                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2426
2427         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2428         h->mb.b_skip_mc = 1;
2429     }
2430 }
2431
2432 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2433 {
2434     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2435     {
2436         x264_analyse_update_cache( h, a );
2437         h->mb.b_transform_8x8 ^= 1;
2438         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2439         int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2440
2441         if( *i_rd >= i_rd8 )
2442         {
2443             if( *i_rd > 0 )
2444                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2445             *i_rd = i_rd8;
2446         }
2447         else
2448             h->mb.b_transform_8x8 ^= 1;
2449     }
2450 }
2451
2452 /* Rate-distortion optimal QP selection.
2453  * FIXME: More than half of the benefit of this function seems to be
2454  * in the way it improves the coding of chroma DC (by decimating or
2455  * finding a better way to code a single DC coefficient.)
2456  * There must be a more efficient way to get that portion of the benefit
2457  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2458  * trick. */
/* Search outline, as implemented below:
 *  - Score the macroblock at its current QP with x264_rd_cost_mb(); this is
 *    the initial best (bcost / bqp).
 *  - Walk the QP upwards (direction == 1; skipped when the initial CBP is
 *    zero) and then downwards (direction == -1), one step at a time,
 *    re-scoring at each step and tracking the cheapest QP.
 *  - A walk ends when the QP leaves [i_qp_min, i_qp_max], when more than
 *    `threshold` consecutive steps fail to cost less than the step before
 *    them, or (upwards only) as soon as the CBP becomes zero.
 *  - The previous macroblock's QP is scored as well unless it was already
 *    covered (last_qp_tried).
 *  - If the selected QP differs from the starting one and the 8x8 transform
 *    is usable, the transform flag is flipped and kept unless that costs
 *    more than bcost.
 * On return h->mb.i_qp and h->mb.i_chroma_qp hold the selected QP. */
2459 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2460 {
2461     int bcost, cost, failures, prevcost, origcost;
2462     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2463     int last_qp_tried = 0;
2464     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
         /* CBP as left by the scoring call just above. */
2465     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2466
2467     /* If CBP is already zero, don't raise the quantizer any higher. */
2468     for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2469     {
2470         /* Without psy-RD, require monotonicity when moving quant away from previous
2471          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2472          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2473          * allow 2 failures when moving quant towards previous quant.
2474          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2475         int threshold = (!!h->mb.i_psy_rd);
2476         /* Raise the threshold for failures if we're moving towards the last QP. */
2477         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2478             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2479             threshold++;
             /* Each walk restarts from the original QP and its cost. */
2480         h->mb.i_qp = orig_qp;
2481         failures = 0;
2482         prevcost = origcost;
2483
2484         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2485          * (up to a point) will too.  So, jump down to where the threshold will kick in
2486          * and check the QP there.  If the CBP is still empty, skip the main loop.
2487          * If it isn't empty, we would have ended up having to check this QP anyways,
2488          * so as long as we store it for later lookup, we lose nothing. */
2489         int already_checked_qp = -1;
2490         int already_checked_cost = COST_MAX;
2491         if( direction == -1 )
2492         {
2493             if( !origcbp )
2494             {
                     /* Probe threshold+1 steps below the original QP, clamped to i_qp_min. */
2495                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2496                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2497                 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2498                 if( !h->mb.cbp[h->mb.i_mb_xy] )
2499                 {
2500                     /* If our empty-CBP block is lower QP than the last QP,
2501                      * the last QP almost surely doesn't have a CBP either. */
2502                     if( h->mb.i_last_qp > h->mb.i_qp )
2503                         last_qp_tried = 1;
                         /* Leaves the direction loop entirely; i_qp is reset to bqp after it. */
2504                     break;
2505                 }
2506                 already_checked_qp = h->mb.i_qp;
2507                 h->mb.i_qp = orig_qp;
2508             }
2509         }
2510
2511         h->mb.i_qp += direction;
2512         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2513         {
2514             if( h->mb.i_last_qp == h->mb.i_qp )
2515                 last_qp_tried = 1;
                 /* NOTE(review): a reused probe cost is not offered to COPY2_IF_LT,
                  * so bqp is never updated from this step -- confirm this is intended. */
2516             if( h->mb.i_qp == already_checked_qp )
2517                 cost = already_checked_cost;
2518             else
2519             {
2520                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2521                 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2522                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2523             }
2524
2525             /* We can't assume that the costs are monotonic over QPs.
2526              * Tie case-as-failure seems to give better results. */
2527             if( cost < prevcost )
2528                 failures = 0;
2529             else
2530                 failures++;
2531             prevcost = cost;
2532
2533             if( failures > threshold )
2534                 break;
                 /* Raising the QP further is pointless once the CBP is empty. */
2535             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2536                 break;
2537             h->mb.i_qp += direction;
2538         }
2539     }
2540
2541     /* Always try the last block's QP. */
2542     if( !last_qp_tried )
2543     {
2544         h->mb.i_qp = h->mb.i_last_qp;
2545         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2546         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2547         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2548     }
2549
         /* Commit the best QP found. */
2550     h->mb.i_qp = bqp;
2551     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2552
2553     /* Check transform again; decision from before may no longer be optimal. */
2554     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2555         x264_mb_transform_8x8_allowed( h ) )
2556     {
2557         h->mb.b_transform_8x8 ^= 1;
2558         cost = x264_rd_cost_mb( h, a->i_lambda2 );
             /* The flipped transform is kept on a tie; only a strictly higher cost reverts it. */
2559         if( cost > bcost )
2560             h->mb.b_transform_8x8 ^= 1;
2561     }
2562 }
2563
2564 /*****************************************************************************
2565  * x264_macroblock_analyse:
2566  *****************************************************************************/
2567 void x264_macroblock_analyse( x264_t *h )
2568 {
2569     x264_mb_analysis_t analysis;
2570     int i_cost = COST_MAX;
2571
2572     h->mb.i_qp = x264_ratecontrol_qp( h );
2573     if( h->param.rc.i_aq_mode )
2574     {
2575         x264_adaptive_quant( h );
2576         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2577          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2578         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2579             h->mb.i_qp = h->mb.i_last_qp;
2580     }
2581
2582     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2583
2584     /*--------------------------- Do the analysis ---------------------------*/
2585     if( h->sh.i_type == SLICE_TYPE_I )
2586     {
2587 intra_analysis:
2588         if( analysis.i_mbrd )
2589             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2590         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2591         if( analysis.i_mbrd )
2592             x264_intra_rd( h, &analysis, COST_MAX );
2593
2594         i_cost = analysis.i_satd_i16x16;
2595         h->mb.i_type = I_16x16;
2596         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2597         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2598         if( analysis.i_satd_pcm < i_cost )
2599             h->mb.i_type = I_PCM;
2600
2601         else if( analysis.i_mbrd >= 2 )
2602             x264_intra_rd_refine( h, &analysis );
2603     }
2604     else if( h->sh.i_type == SLICE_TYPE_P )
2605     {
2606         int b_skip = 0;
2607
2608         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2609
2610         analysis.b_try_skip = 0;
2611         if( analysis.b_force_intra )
2612         {
2613             if( !h->param.analyse.b_psy )
2614             {
2615                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2616                 goto intra_analysis;
2617             }
2618         }
2619         else
2620         {
2621             /* Fast P_SKIP detection */
2622             if( h->param.analyse.b_fast_pskip )
2623             {
2624                 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2625                     // FIXME don't need to check this if the reference frame is done
2626                     {}
2627                 else if( h->param.analyse.i_subpel_refine >= 3 )
2628                     analysis.b_try_skip = 1;
2629                 else if( h->mb.i_mb_type_left == P_SKIP ||
2630                          h->mb.i_mb_type_top == P_SKIP ||
2631                          h->mb.i_mb_type_topleft == P_SKIP ||
2632                          h->mb.i_mb_type_topright == P_SKIP )
2633                     b_skip = x264_macroblock_probe_pskip( h );
2634             }
2635         }
2636
2637         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2638
2639         if( b_skip )
2640         {
2641             h->mb.i_type = P_SKIP;
2642             h->mb.i_partition = D_16x16;
2643             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2644             /* Set up MVs for future predictors */
2645             if( b_skip )
2646                 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2647                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2648         }
2649         else
2650         {
2651             const unsigned int flags = h->param.analyse.inter;
2652             int i_type;
2653             int i_partition;
2654             int i_thresh16x8;
2655             int i_satd_inter, i_satd_intra;
2656
2657             x264_mb_analyse_load_costs( h, &analysis );
2658
2659             x264_mb_analyse_inter_p16x16( h, &analysis );
2660
2661             if( h->mb.i_type == P_SKIP )
2662             {
2663                 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2664                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2665                 return;
2666             }
2667
2668             if( flags & X264_ANALYSE_PSUB16x16 )
2669             {
2670                 if( h->param.analyse.b_mixed_references )
2671                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2672                 else
2673                     x264_mb_analyse_inter_p8x8( h, &analysis );
2674             }
2675
2676             /* Select best inter mode */
2677             i_type = P_L0;
2678             i_partition = D_16x16;
2679             i_cost = analysis.l0.me16x16.cost;
2680
2681             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2682                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2683             {
2684                 i_type = P_8x8;
2685                 i_partition = D_8x8;
2686                 i_cost = analysis.l0.i_cost8x8;
2687
2688                 /* Do sub 8x8 */
2689                 if( flags & X264_ANALYSE_PSUB8x8 )
2690                 {
2691                     for( int i = 0; i < 4; i++ )
2692                     {
2693                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2694                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2695                         {
2696                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2697                             h->mb.i_sub_partition[i] = D_L0_4x4;
2698
2699                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2700                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2701                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2702
2703                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2704                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2705                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2706
2707                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2708                         }
2709                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2710                     }
2711                     analysis.l0.i_cost8x8 = i_cost;
2712                 }
2713             }
2714
2715             /* Now do 16x8/8x16 */
2716             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2717             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2718                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2719             {
2720                 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
2721                                       + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2722                 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2723
2724                 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
2725                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2726
2727                 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
2728                                   + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2729                 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2730
2731                 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
2732                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2733             }
2734
2735             h->mb.i_partition = i_partition;
2736
2737             /* refine qpel */
2738             //FIXME mb_type costs?
2739             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2740             {
2741                 /* refine later */
2742             }
2743             else if( i_partition == D_16x16 )
2744             {
2745                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2746                 i_cost = analysis.l0.me16x16.cost;
2747             }
2748             else if( i_partition == D_16x8 )
2749             {
2750                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2751                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2752                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2753             }
2754             else if( i_partition == D_8x16 )
2755             {
2756                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2757                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2758                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2759             }
2760             else if( i_partition == D_8x8 )
2761             {
2762                 i_cost = 0;
2763                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2764                 {
2765                     switch( h->mb.i_sub_partition[i8x8] )
2766                     {
2767                         case D_L0_8x8:
2768                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2769                             i_cost += analysis.l0.me8x8[i8x8].cost;
2770                             break;
2771                         case D_L0_8x4:
2772                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2773                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2774                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2775                                       analysis.l0.me8x4[i8x8][1].cost;
2776                             break;
2777                         case D_L0_4x8:
2778                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2779                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2780                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2781                                       analysis.l0.me4x8[i8x8][1].cost;
2782                             break;
2783
2784                         case D_L0_4x4:
2785                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2786                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2787                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2788                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2789                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2790                                       analysis.l0.me4x4[i8x8][1].cost +
2791                                       analysis.l0.me4x4[i8x8][2].cost +
2792                                       analysis.l0.me4x4[i8x8][3].cost;
2793                             break;
2794                         default:
2795                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2796                             break;
2797                     }
2798                 }
2799             }
2800
2801             if( h->mb.b_chroma_me )
2802             {
2803                 x264_mb_analyse_intra_chroma( h, &analysis );
2804                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2805                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2806                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2807                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2808             }
2809             else
2810                 x264_mb_analyse_intra( h, &analysis, i_cost );
2811
2812             i_satd_inter = i_cost;
2813             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2814                                       analysis.i_satd_i8x8,
2815                                       analysis.i_satd_i4x4 );
2816
2817             if( analysis.i_mbrd )
2818             {
2819                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2820                 i_type = P_L0;
2821                 i_partition = D_16x16;
2822                 i_cost = analysis.l0.i_rd16x16;
2823                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2824                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2825                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2826                 h->mb.i_type = i_type;
2827                 h->mb.i_partition = i_partition;
2828                 if( i_cost < COST_MAX )
2829                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2830                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2831             }
2832
2833             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2834             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2835             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2836             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2837
2838             h->mb.i_type = i_type;
2839
2840             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2841             {
2842                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2843                  * it was an inter block. */
2844                 x264_analyse_update_cache( h, &analysis );
2845                 x264_macroblock_encode( h );
2846                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2847                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2848                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2849                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2850                 goto intra_analysis;
2851             }
2852
2853             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2854             {
2855                 if( IS_INTRA( h->mb.i_type ) )
2856                 {
2857                     x264_intra_rd_refine( h, &analysis );
2858                 }
2859                 else if( i_partition == D_16x16 )
2860                 {
2861                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2862                     analysis.l0.me16x16.cost = i_cost;
2863                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2864                 }
2865                 else if( i_partition == D_16x8 )
2866                 {
2867                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2868                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2869                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2870                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2871                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2872                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2873                 }
2874                 else if( i_partition == D_8x16 )
2875                 {
2876                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2877                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2878                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2879                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2880                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2881                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2882                 }
2883                 else if( i_partition == D_8x8 )
2884                 {
2885                     x264_analyse_update_cache( h, &analysis );
2886                     for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2887                     {
2888                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2889                         {
2890                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2891                         }
2892                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2893                         {
2894                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2895                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2896                         }
2897                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2898                         {
2899                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2900                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2901                         }
2902                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2903                         {
2904                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2905                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2906                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2907                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2908                         }
2909                     }
2910                 }
2911             }
2912         }
2913     }
2914     else if( h->sh.i_type == SLICE_TYPE_B )
2915     {
2916         int i_bskip_cost = COST_MAX;
2917         int b_skip = 0;
2918
2919         if( analysis.i_mbrd )
2920             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2921
2922         h->mb.i_type = B_SKIP;
2923         if( h->mb.b_direct_auto_write )
2924         {
2925             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2926             for( int i = 0; i < 2; i++ )
2927             {
2928                 int b_changed = 1;
2929                 h->sh.b_direct_spatial_mv_pred ^= 1;
2930                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2931                 if( analysis.b_direct_available )
2932                 {
2933                     if( b_changed )
2934                     {
2935                         x264_mb_mc( h );
2936                         b_skip = x264_macroblock_probe_bskip( h );
2937                     }
2938                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2939                 }
2940                 else
2941                     b_skip = 0;
2942             }
2943         }
2944         else
2945             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2946
2947         analysis.b_try_skip = 0;
2948         if( analysis.b_direct_available )
2949         {
2950             if( !h->mb.b_direct_auto_write )
2951                 x264_mb_mc( h );
2952             if( analysis.i_mbrd )
2953             {
2954                 i_bskip_cost = ssd_mb( h );
2955                 /* 6 = minimum cavlc cost of a non-skipped MB */
2956                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2957             }
2958             else if( !h->mb.b_direct_auto_write )
2959             {
2960                 /* Conditioning the probe on neighboring block types
2961                  * doesn't seem to help speed or quality. */
2962                 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
2963                 if( h->param.analyse.i_subpel_refine < 3 )
2964                     b_skip = analysis.b_try_skip;
2965             }
2966             /* Set up MVs for future predictors */
2967             if( b_skip )
2968             {
2969                 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2970                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2971                 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
2972                     M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2973             }
2974         }
2975
2976         if( !b_skip )
2977         {
2978             const unsigned int flags = h->param.analyse.inter;
2979             int i_type;
2980             int i_partition;
2981             int i_satd_inter;
2982             h->mb.b_skip_mc = 0;
2983             h->mb.i_type = B_DIRECT;
2984
2985             x264_mb_analyse_load_costs( h, &analysis );
2986
2987             /* select best inter mode */
2988             /* direct must be first */
2989             if( analysis.b_direct_available )
2990                 x264_mb_analyse_inter_direct( h, &analysis );
2991
2992             x264_mb_analyse_inter_b16x16( h, &analysis );
2993
2994             if( h->mb.i_type == B_SKIP )
2995             {
2996                 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2997                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2998                 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
2999                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3000                 return;
3001             }
3002
3003             i_type = B_L0_L0;
3004             i_partition = D_16x16;
3005             i_cost = analysis.l0.me16x16.cost;
3006             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3007             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3008             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
3009
3010             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
3011             {
3012                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3013                 if( i_bskip_cost < analysis.i_rd16x16direct &&
3014                     i_bskip_cost < analysis.i_rd16x16bi &&
3015                     i_bskip_cost < analysis.l0.i_rd16x16 &&
3016                     i_bskip_cost < analysis.l1.i_rd16x16 )
3017                 {
3018                     h->mb.i_type = B_SKIP;
3019                     x264_analyse_update_cache( h, &analysis );
3020                     return;
3021                 }
3022             }
3023
3024             if( flags & X264_ANALYSE_BSUB16x16 )
3025             {
3026                 if( h->param.analyse.b_mixed_references )
3027                     x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3028                 else
3029                     x264_mb_analyse_inter_b8x8( h, &analysis );
3030
3031                 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3032
3033                 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
3034                 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3035                 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3036                 for( int i = 0; i < 2; i++ )
3037                 {
3038                     int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3039                     int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3040                     // 16x8
3041                     i_best_cost = COST_MAX;
3042                     i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3043                     i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3044                     i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3045                     avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3046                                          + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3047                     avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3048                                          + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3049                     COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3050                     COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3051                     COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3052                     analysis.i_cost_est16x8[i] = i_best_cost;
3053
3054                     // 8x16
3055                     i_best_cost = COST_MAX;
3056                     i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3057                     i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3058                     i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3059                     avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3060                                          + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3061                     avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3062                                          + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3063                     COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3064                     COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3065                     COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3066                     analysis.i_cost_est8x16[i] = i_best_cost;
3067                 }
3068                 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3069                 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3070                 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3071                 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3072                 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3073                 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
3074
3075                 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3076                 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3077                 if( try_16x8_first && i_cost_est16x8bi_total < i_cost )
3078                 {
3079                     x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3080                     COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3081                 }
3082                 if( i_cost_est8x16bi_total < i_cost )
3083                 {
3084                     x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3085                     COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3086                 }
3087                 if( !try_16x8_first && i_cost_est16x8bi_total < i_cost )
3088                 {
3089                     x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3090                     COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3091                 }
3092             }
3093
3094             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3095             {
3096                 /* refine later */
3097             }
3098             /* refine qpel */
3099             else if( i_partition == D_16x16 )
3100             {
3101                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3102                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3103                 if( i_type == B_L0_L0 )
3104                 {
3105                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3106                     i_cost = analysis.l0.me16x16.cost
3107                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3108                 }
3109                 else if( i_type == B_L1_L1 )
3110                 {
3111                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3112                     i_cost = analysis.l1.me16x16.cost
3113                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3114                 }
3115                 else if( i_type == B_BI_BI )
3116                 {
3117                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3118                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3119                 }
3120             }
3121             else if( i_partition == D_16x8 )
3122             {
3123                 for( int i = 0; i < 2; i++ )
3124                 {
3125                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3126                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3127                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3128                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3129                 }
3130             }
3131             else if( i_partition == D_8x16 )
3132             {
3133                 for( int i = 0; i < 2; i++ )
3134                 {
3135                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3136                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3137                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3138                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3139                 }
3140             }
3141             else if( i_partition == D_8x8 )
3142             {
3143                 for( int i = 0; i < 4; i++ )
3144                 {
3145                     x264_me_t *m;
3146                     int i_part_cost_old;
3147                     int i_type_cost;
3148                     int i_part_type = h->mb.i_sub_partition[i];
3149                     int b_bidir = (i_part_type == D_BI_8x8);
3150
3151                     if( i_part_type == D_DIRECT_8x8 )
3152                         continue;
3153                     if( x264_mb_partition_listX_table[0][i_part_type] )
3154                     {
3155                         m = &analysis.l0.me8x8[i];
3156                         i_part_cost_old = m->cost;
3157                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3158                         m->cost -= i_type_cost;
3159                         x264_me_refine_qpel( h, m );
3160                         if( !b_bidir )
3161                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3162                     }
3163                     if( x264_mb_partition_listX_table[1][i_part_type] )
3164                     {
3165                         m = &analysis.l1.me8x8[i];
3166                         i_part_cost_old = m->cost;
3167                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3168                         m->cost -= i_type_cost;
3169                         x264_me_refine_qpel( h, m );
3170                         if( !b_bidir )
3171                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3172                     }
3173                     /* TODO: update mvp? */
3174                 }
3175             }
3176
3177             i_satd_inter = i_cost;
3178
3179             if( analysis.i_mbrd )
3180             {
3181                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3182                 i_type = B_SKIP;
3183                 i_cost = i_bskip_cost;
3184                 i_partition = D_16x16;
3185                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3186                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3187                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3188                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3189                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3190                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3191                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3192
3193                 h->mb.i_type = i_type;
3194                 h->mb.i_partition = i_partition;
3195             }
3196
3197             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3198
3199             if( analysis.i_mbrd )
3200             {
3201                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3202                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3203             }
3204
3205             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3206             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3207             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3208             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3209
3210             h->mb.i_type = i_type;
3211             h->mb.i_partition = i_partition;
3212
3213             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3214                 x264_intra_rd_refine( h, &analysis );
3215             if( h->mb.i_subpel_refine >= 5 )
3216                 x264_refine_bidir( h, &analysis );
3217
3218             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3219             {
3220                 int i_biweight;
3221                 x264_analyse_update_cache( h, &analysis );
3222
3223                 if( i_partition == D_16x16 )
3224                 {
3225                     if( i_type == B_L0_L0 )
3226                     {
3227                         analysis.l0.me16x16.cost = i_cost;
3228                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3229                     }
3230                     else if( i_type == B_L1_L1 )
3231                     {
3232                         analysis.l1.me16x16.cost = i_cost;
3233                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3234                     }
3235                     else if( i_type == B_BI_BI )
3236                     {
3237                         i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3238                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3239                     }
3240                 }
3241                 else if( i_partition == D_16x8 )
3242                 {
3243                     for( int i = 0; i < 2; i++ )
3244                     {
3245                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3246                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3247                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3248                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3249                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3250                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3251                         {
3252                             i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3253                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3254                         }
3255                     }
3256                 }
3257                 else if( i_partition == D_8x16 )
3258                 {
3259                     for( int i = 0; i < 2; i++ )
3260                     {
3261                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3262                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3263                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3264                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3265                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3266                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3267                         {
3268                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3269                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3270                         }
3271                     }
3272                 }
3273                 else if( i_partition == D_8x8 )
3274                 {
3275                     for( int i = 0; i < 4; i++ )
3276                     {
3277                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3278                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3279                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3280                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3281                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3282                         {
3283                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3284                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3285                         }
3286                     }
3287                 }
3288             }
3289         }
3290     }
3291
3292     x264_analyse_update_cache( h, &analysis );
3293
3294     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3295      * without realizing it.  Check for this and account for it if necessary. */
3296     if( analysis.i_mbrd >= 2 )
3297     {
3298         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3299         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3300         int list = check_mv_lists[h->mb.i_type] - 1;
3301         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3302             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3303             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3304                 h->mb.i_partition = D_16x16;
3305     }
3306
3307     if( !analysis.i_mbrd )
3308         x264_mb_analyse_transform( h );
3309
3310     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3311         x264_mb_analyse_qp_rd( h, &analysis );
3312
3313     h->mb.b_trellis = h->param.analyse.i_trellis;
3314     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3315     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3316         x264_psy_trellis_init( h, 0 );
3317     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3318         h->mb.i_skip_intra = 0;
3319 }
3320
3321 /*-------------------- Update MB from the analysis ----------------------*/
3322 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3323 {
3324     switch( h->mb.i_type )
3325     {
3326         case I_4x4:
3327             for( int i = 0; i < 16; i++ )
3328                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3329
3330             x264_mb_analyse_intra_chroma( h, a );
3331             break;
3332         case I_8x8:
3333             for( int i = 0; i < 4; i++ )
3334                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3335
3336             x264_mb_analyse_intra_chroma( h, a );
3337             break;
3338         case I_16x16:
3339             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3340             x264_mb_analyse_intra_chroma( h, a );
3341             break;
3342
3343         case I_PCM:
3344             break;
3345
3346         case P_L0:
3347             switch( h->mb.i_partition )
3348             {
3349                 case D_16x16:
3350                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3351                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3352                     break;
3353
3354                 case D_16x8:
3355                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3356                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3357                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3358                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3359                     break;
3360
3361                 case D_8x16:
3362                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3363                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3364                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3365                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3366                     break;
3367
3368                 default:
3369                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3370                     break;
3371             }
3372             break;
3373
3374         case P_8x8:
3375             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3376             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3377             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3378             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3379             for( int i = 0; i < 4; i++ )
3380                 x264_mb_cache_mv_p8x8( h, a, i );
3381             break;
3382
3383         case P_SKIP:
3384         {
3385             h->mb.i_partition = D_16x16;
3386             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3387             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3388             break;
3389         }
3390
3391         case B_SKIP:
3392         case B_DIRECT:
3393             h->mb.i_partition = h->mb.cache.direct_partition;
3394             x264_mb_load_mv_direct8x8( h, 0 );
3395             x264_mb_load_mv_direct8x8( h, 1 );
3396             x264_mb_load_mv_direct8x8( h, 2 );
3397             x264_mb_load_mv_direct8x8( h, 3 );
3398             break;
3399
3400         case B_8x8:
3401             /* optimize: cache might not need to be rewritten */
3402             for( int i = 0; i < 4; i++ )
3403                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3404             break;
3405
3406         default: /* the rest of the B types */
3407             switch( h->mb.i_partition )
3408             {
3409             case D_16x16:
3410                 switch( h->mb.i_type )
3411                 {
3412                 case B_L0_L0:
3413                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3414                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3415
3416                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3417                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3418                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3419                     break;
3420                 case B_L1_L1:
3421                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3422                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3423                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3424
3425                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3426                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3427                     break;
3428                 case B_BI_BI:
3429                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3430                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3431
3432                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3433                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3434                     break;
3435                 }
3436                 break;
3437             case D_16x8:
3438                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3439                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3440                 break;
3441             case D_8x16:
3442                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3443                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3444                 break;
3445             default:
3446                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3447                 break;
3448             }
3449     }
3450
3451 #ifndef NDEBUG
3452     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3453     {
3454         for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3455         {
3456             int completed;
3457             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3458             if( ref < 0 )
3459                 continue;
3460             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3461             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3462             {
3463                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3464                 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3465                 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3466                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3467                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3468                 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3469                 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3470                 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3471                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3472                 x264_mb_analyse_intra( h, a, COST_MAX );
3473                 h->mb.i_type = I_16x16;
3474                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3475                 x264_mb_analyse_intra_chroma( h, a );
3476             }
3477         }
3478     }
3479 #endif
3480 }
3481
3482 #include "slicetype.c"
3483