git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
/* Inter-prediction analysis state for a single reference list.
 * x264_mb_analysis_t holds one instance for list 0 (l0) and one for list 1 (l1).
 * Each partition shape has a cost field plus x264_me_t search-result slots. */
typedef struct
{
    /* 16x16 */
    int i_ref;
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;
  72
/* Per-macroblock analysis context: the lambda/QP in use, the cost-table
 * pointers, intra mode costs and inter (P/B) partition costs.
 * Most cost fields are reset to COST_MAX by x264_mb_analyse_init(). */
typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;     /* set from h->cost_mv[i_lambda] */
    uint16_t *p_cost_ref[2]; /* rows of x264_cost_ref, one per reference list */
    int i_mbrd;              /* 0 = off, 1 = RD mode decision, 2 = RD refinement, 3 = QPRD */


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
    int b_try_pskip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    int i_satd_i8x8_dir[12][4];
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_i8x8chroma;
    int i_satd_i8x8chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;

} x264_mb_analysis_t;
 134
/* lambda = pow(2,qp/6-2) */
/* Indexed by QP (0..51).  The largest entry is 91, so the per-lambda cost
 * arrays in this file are dimensioned with 92 slots. */
const uint8_t x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
 145
/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Indexed by QP (0..51); the trailing comment on each row gives its QP range. */
const int x264_lambda2_tab[52] = {
    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
};
 156
/* 64-entry lookup table.  The entries appear to equal
 * round((pow(2, i/64.) - 1) * 256), i.e. the fractional part of exp2 in
 * 1/256 units -- NOTE(review): users are not visible here, confirm there. */
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
 163
/* 128-entry lookup table.  The entries appear to equal log2(1 + i/128.),
 * rounded to five decimals -- NOTE(review): users are not visible here,
 * confirm there. */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
 182
/* Avoid an int/float conversion. */
/* Entry i holds the value 31-i already stored as a float.
 * NOTE(review): the name suggests the index is a leading-zero count;
 * confirm against the users of this table. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
 187
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
// First index: 0 = inter, 1 = intra.  Second index: QP (0..51);
// x264_mb_analyse_init_qp() looks it up with both the luma and the chroma QP.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {    46,      58,      73,      92,     117,     147,
        185,     233,     294,     370,     466,     587,
        740,     932,    1174,    1480,    1864,    2349,
       2959,    3728,    4697,    5918,    7457,    9395,
      11837,   14914,   18790,   23674,   29828,   37581,
      47349,   59656,   75163,   94699,  119313,  150326,
     189399,  238627,  300652,  378798,  477255,  601304,
     757596,  954511, 1202608, 1515192, 1909022, 2405217,
    3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {    27,      34,      43,      54,      68,      86,
        108,     136,     172,     216,     273,     343,
        433,     545,     687,     865,    1090,    1374,
       1731,    2180,    2747,    3461,    4361,    5494,
       6922,    8721,   10988,   13844,   17442,   21976,
      27688,   34885,   43953,   55377,   69771,   87906,
     110755,  139543,  175813,  221511,  279087,  351627,
     443023,  558174,  703255,  886046, 1116348, 1406511,
    1772093, 2232697, 2813022, 3544186 }
};
 212
/* Chroma lambda2 scale, indexed by (luma QP - chroma QP + 12).
 * Entry 12 (equal QPs) is 256, the same value x264_mb_analyse_init_qp()
 * uses when psy optimisation is disabled.  The entries roughly double
 * every three steps and the last one is clamped to fit in a uint16_t. */
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
 222
/* TODO: calculate CABAC costs */
/* Per-type costs that the analysis multiplies by lambda (e.g. the
 * "cavlc mb type prefix" added to intra costs in B slices).
 * i_mb_b_cost_table is indexed by macroblock type. */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
/* NOTE(review): presumably indexed by a B 16x8/8x16 type value; the indexing
 * code is outside this view. */
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
/* NOTE(review): presumably indexed by B sub-macroblock partition type; confirm
 * against the users of this table. */
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
/* NOTE(review): presumably indexed by P sub-macroblock partition type; confirm
 * against the users of this table. */
static const uint8_t i_sub_mb_p_cost_table[4] = {
    5, 3, 3, 1
};
 236
/* Forward declaration; the definition is later in this file. */
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

/* Reference-index costs: [lambda][clip3(num_ref_idx_active-1,0,2)][ref].
 * Filled by x264_analyse_init_costs() while holding cost_ref_mutex;
 * x264_mb_analyse_load_costs() reads it without taking the lock. */
static uint16_t x264_cost_ref[92][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 241
 242 int x264_analyse_init_costs( x264_t *h, int qp )
 243 {
 244     int i, j;
 245     int lambda = x264_lambda_tab[qp];
 246     if( h->cost_mv[lambda] )
 247         return 0;
 248     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 249     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 250     h->cost_mv[lambda] += 2*4*2048;
 251     for( i = 0; i <= 2*4*2048; i++ )
 252     {
 253         h->cost_mv[lambda][-i] =
 254         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 255     }
 256     x264_pthread_mutex_lock( &cost_ref_mutex );
 257     for( i = 0; i < 3; i++ )
 258         for( j = 0; j < 33; j++ )
 259             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 260     x264_pthread_mutex_unlock( &cost_ref_mutex );
 261     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 262     {
 263         for( j=0; j<4; j++ )
 264         {
 265             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 266             h->cost_mv_fpel[lambda][j] += 2*2048;
 267             for( i = -2*2048; i < 2*2048; i++ )
 268                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 269         }
 270     }
 271     return 0;
 272 fail:
 273     return -1;
 274 }
 275
 276 void x264_analyse_free_costs( x264_t *h )
 277 {
 278     int i, j;
 279     for( i = 0; i < 92; i++ )
 280     {
 281         if( h->cost_mv[i] )
 282             x264_free( h->cost_mv[i] - 2*4*2048 );
 283         if( h->cost_mv_fpel[i][0] )
 284             for( j = 0; j < 4; j++ )
 285                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 286     }
 287 }
 288
 289 void x264_analyse_weight_frame( x264_t *h, int end )
 290 {
 291     int j;
 292     for( j=0; j<h->i_ref0; j++ )
 293     {
 294         if( h->sh.weight[j][0].weightfn )
 295         {
 296             x264_frame_t *frame = h->fref0[j];
 297             int width = frame->i_width[0] + 2*PADH;
 298             int i_padv = PADV << h->param.b_interlaced;
 299             int offset, height;
 300             uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 301             int k;
 302             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 303             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 304             h->fenc->i_lines_weighted += height;
 305             if( height )
 306             {
 307                 for( k = j; k < h->i_ref0; k++ )
 308                     if( h->sh.weight[k][0].weightfn )
 309                     {
 310                         uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 311                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 312                                                  src + offset, frame->i_stride[0],
 313                                                  width, height, &h->sh.weight[k][0] );
 314                     }
 315             }
 316             break;
 317         }
 318     }
 319 }
 320
 321 /* initialize an array of lambda*nbits for all possible mvs */
 322 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 323 {
 324     a->p_cost_mv = h->cost_mv[a->i_lambda];
 325     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 326     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 327 }
 328
 329 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 330 {
 331     /* conduct the analysis using this lamda and QP */
 332     a->i_qp = h->mb.i_qp = i_qp;
 333     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 334
 335     a->i_lambda = x264_lambda_tab[i_qp];
 336     a->i_lambda2 = x264_lambda2_tab[i_qp];
 337
 338     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 339     if( h->param.analyse.i_trellis )
 340     {
 341         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 342         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 343         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 344         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 345     }
 346     h->mb.i_psy_rd_lambda = a->i_lambda;
 347     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 348     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 349
 350 }
 351
/* Prepare the analysis context `a` and the per-macroblock encoder state in
 * `h` for analysing the current macroblock at QP `i_qp`:
 *   - pick the RD level (a->i_mbrd) from the subpel-refine setting,
 *   - initialise the QP-dependent lambdas,
 *   - reset the intra costs and, for non-I slices, the inter costs to COST_MAX,
 *   - for non-I slices, compute the allowed motion-vector range,
 *   - decide whether the fast-intra shortcut or forced intra applies. */
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    /* Effective refinement level: one lower on B slices. */
    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);

    x264_mb_analyse_init_qp( h, a, i_qp );

    h->mb.i_me_method = h->param.analyse.i_me_method;
    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
    if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
        h->mb.i_subpel_refine--;
    h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                        && h->mb.i_subpel_refine >= 5;
    /* DCT decimation is always on for B slices and never on for I slices. */
    h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
                          (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);

    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_i8x8chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

    a->b_fast_intra = 0;
    /* 0 when lossless, 2 when RD is on, otherwise 1 only if neither trellis
     * nor noise reduction is enabled. */
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        /* Note: this `i` shadows the refinement level declared above. */
        int i, j;
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        /* Horizontal limits in quarter-pel units: the frame edge plus 24 pixels. */
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        /* Full-pel limits are the subpel limits shrunk by the search border. */
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        /* Vertical limits are only recomputed on the first macroblock of a row. */
        if( h->mb.i_mb_x == 0 )
        {
            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                /* Wait on every reference frame (list 1 first on B slices)
                 * and limit the range to the lines already completed. */
                for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                {
                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
                    int i_ref = i ? h->i_ref1 : h->i_ref0;
                    for( j=0; j<i_ref; j++ )
                    {
                        x264_frame_cond_wait( fref[j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
                    }
                }

                /* In deterministic mode the measured range is replaced by the
                 * fixed configured value. */
                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( h->mb.b_interlaced )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
            /* Unlike the other limits, the upper clip bound here is i_fmv_range
             * rather than i_fmv_range-1 (CLIP_FMV is not used). */
            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
        }
#undef CLIP_FMV

        /* Reset all list-0 inter costs. */
        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    = COST_MAX;

        for( i = 0; i < 4; i++ )
        {
            a->l0.i_cost4x4[i] =
            a->l0.i_cost8x4[i] =
            a->l0.i_cost4x8[i] = COST_MAX;
        }

        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        /* B slices additionally reset the list-1, bi-directional and direct costs. */
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    = COST_MAX;

            for( i = 0; i < 4; i++ )
            {
                a->l1.i_cost4x4[i] =
                a->l1.i_cost8x4[i] =
                a->l1.i_cost4x8[i] =
                a->i_cost8x8direct[i] = COST_MAX;
            }

            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }

        /* Fast intra decision */
        /* Only considered once more than 4 macroblocks of the slice are done.
         * Intra counts as likely if any neighbour is intra, if (P slices) the
         * co-located MB of the first reference is intra, or if the MB index
         * within the slice is below 3x the number of I4x4/I8x8/I16x16 MBs
         * counted so far in the frame statistics. */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            if(   IS_INTRA( h->mb.i_mb_type_left )
               || IS_INTRA( h->mb.i_mb_type_top )
               || IS_INTRA( h->mb.i_mb_type_topleft )
               || IS_INTRA( h->mb.i_mb_type_topright )
               || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
               || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        /* Periodic intra refresh: macroblocks inside the refresh column range
         * of a P slice are forced to intra, which also disables the fast
         * intra shortcut. */
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
        }
        else
            a->b_force_intra = 0;
    }
}
 521
/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
/* Intra 16x16 luma: rows are selected by predict_16x16_mode_available(). */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};
 533
/* Intra 8x8 chroma prediction modes allowed for each neighbor combination,
 * in the order: no neighbors, left, top, top/left, top/left/topleft.
 * Each row is terminated by -1; rows are selected by
 * predict_8x8chroma_mode_available(). */
static const int8_t i8x8chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};
 542
/* Intra 4x4 prediction modes allowed for each neighbor combination, in the
 * order: no neighbors, left, top, top/left, top/left/topleft.
 * Each row is terminated by -1; rows are selected by
 * predict_4x4_mode_available(), which the 8x8 luma search also uses. */
static const int8_t i4x4_mode_available[5][10] =
{
    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
};
 551
 552 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
 553 {
 554     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 555     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 556 }
 557
 558 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 559 {
 560     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 561     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 562 }
 563
 564 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
 565 {
 566     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 567     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 568 }
 569
 570 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 571 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 572 {
 573     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 574
 575     if( do_both_dct || h->mb.b_transform_8x8 )
 576         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 577     if( do_both_dct || !h->mb.b_transform_8x8 )
 578         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 579 }
 580
 581 /* Reset fenc satd scores cache for psy RD */
 582 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 583 {
 584     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 585         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 586     if( !h->mb.i_psy_rd )
 587         return;
 588     /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
 589     h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
 590     if( b_satd )
 591         h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 592 }
 593
 594 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 595 {
 596     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 597
 598     if( a->i_satd_i8x8chroma < COST_MAX )
 599         return;
 600
 601     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 602
 603     /* 8x8 prediction selection for chroma */
 604     if( predict_mode[3] >= 0 && b_merged_satd )
 605     {
 606         int satdu[4], satdv[4];
 607         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 608         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 609         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 610         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 611         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 612         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 613
 614         for( ; *predict_mode >= 0; predict_mode++ )
 615         {
 616             int i_mode = *predict_mode;
 617             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 618
 619             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 620             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 621         }
 622     }
 623     else
 624     {
 625         for( ; *predict_mode >= 0; predict_mode++ )
 626         {
 627             int i_satd;
 628             int i_mode = *predict_mode;
 629
 630             /* we do the prediction */
 631             if( h->mb.b_lossless )
 632                 x264_predict_lossless_8x8_chroma( h, i_mode );
 633             else
 634             {
 635                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 636                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 637             }
 638
 639             /* we calculate the cost */
 640             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 641                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 642                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 643
 644             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 645             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 646         }
 647     }
 648
 649     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 650 }
 651
 652 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 653 {
 654     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 655     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 656     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 657
 658     int i, idx;
 659     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 660
 661     /*---------------- Try all mode and calculate their score ---------------*/
 662
 663     /* 16x16 prediction selection */
 664     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 665
 666     if( b_merged_satd && predict_mode[3] >= 0 )
 667     {
 668         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 669         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 670         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 671             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 672         for( i=0; i<4; i++ )
 673         {
 674             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 675             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 676         }
 677     }
 678     else
 679     {
 680         for( ; *predict_mode >= 0; predict_mode++ )
 681         {
 682             int i_satd;
 683             int i_mode = *predict_mode;
 684
 685             if( h->mb.b_lossless )
 686                 x264_predict_lossless_16x16( h, i_mode );
 687             else
 688                 h->predict_16x16[i_mode]( p_dst );
 689
 690             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 691                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 692             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 693             a->i_satd_i16x16_dir[i_mode] = i_satd;
 694         }
 695     }
 696
 697     if( h->sh.i_type == SLICE_TYPE_B )
 698         /* cavlc mb type prefix */
 699         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 700     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 701         return;
 702
 703     /* 8x8 prediction selection */
 704     if( flags & X264_ANALYSE_I8x8 )
 705     {
 706         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 707         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 708         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 709         int i_cost = 0;
 710         h->mb.i_cbp_luma = 0;
 711         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 712
 713         // FIXME some bias like in i4x4?
 714         if( h->sh.i_type == SLICE_TYPE_B )
 715             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 716
 717         for( idx = 0;; idx++ )
 718         {
 719             int x = idx&1;
 720             int y = idx>>1;
 721             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 722             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 723             int i_best = COST_MAX;
 724             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 725
 726             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 727             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 728
 729             if( b_merged_satd && predict_mode[8] >= 0 )
 730             {
 731                 int satd[9];
 732                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 733                 satd[i_pred_mode] -= 3 * a->i_lambda;
 734                 for( i=2; i>=0; i-- )
 735                 {
 736                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 737                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 738                 }
 739                 predict_mode += 3;
 740             }
 741
 742             for( ; *predict_mode >= 0; predict_mode++ )
 743             {
 744                 int i_satd;
 745                 int i_mode = *predict_mode;
 746
 747                 if( h->mb.b_lossless )
 748                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 749                 else
 750                     h->predict_8x8[i_mode]( p_dst_by, edge );
 751
 752                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
 753                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 754                     i_satd -= a->i_lambda * 3;
 755
 756                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 757                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 758             }
 759             i_cost += i_best;
 760
 761             if( idx == 3 || i_cost > i_satd_thresh )
 762                 break;
 763
 764             /* we need to encode this block now (for next ones) */
 765             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 766             x264_mb_encode_i8x8( h, idx, a->i_qp );
 767
 768             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 769         }
 770
 771         if( idx == 3 )
 772         {
 773             a->i_satd_i8x8 = i_cost;
 774             if( h->mb.i_skip_intra )
 775             {
 776                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 777                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 778                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 779                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 780                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 781                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 782                 if( h->mb.i_skip_intra == 2 )
 783                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 784             }
 785         }
 786         else
 787         {
 788             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 789             a->i_satd_i8x8 = COST_MAX;
 790             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 791         }
 792         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 793             return;
 794     }
 795
 796     /* 4x4 prediction selection */
 797     if( flags & X264_ANALYSE_I4x4 )
 798     {
 799         int i_cost;
 800         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 801         h->mb.i_cbp_luma = 0;
 802         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 803         if( a->i_mbrd )
 804             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 805
 806         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 807         if( h->sh.i_type == SLICE_TYPE_B )
 808             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 809
 810         for( idx = 0;; idx++ )
 811         {
 812             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 813             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 814             int i_best = COST_MAX;
 815             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 816
 817             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 818
 819             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 820                 /* emulate missing topright samples */
 821                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 822
 823             if( b_merged_satd && predict_mode[5] >= 0 )
 824             {
 825                 int satd[9];
 826                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 827                 satd[i_pred_mode] -= 3 * a->i_lambda;
 828                 for( i=2; i>=0; i-- )
 829                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 830                 predict_mode += 3;
 831             }
 832
 833             for( ; *predict_mode >= 0; predict_mode++ )
 834             {
 835                 int i_satd;
 836                 int i_mode = *predict_mode;
 837
 838                 if( h->mb.b_lossless )
 839                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 840                 else
 841                     h->predict_4x4[i_mode]( p_dst_by );
 842
 843                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 844                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 845                     i_satd -= a->i_lambda * 3;
 846
 847                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 848             }
 849             i_cost += i_best + 4 * a->i_lambda;
 850
 851             if( i_cost > i_satd_thresh || idx == 15 )
 852                 break;
 853
 854             /* we need to encode this block now (for next ones) */
 855             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 856             x264_mb_encode_i4x4( h, idx, a->i_qp );
 857
 858             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 859         }
 860         if( idx == 15 )
 861         {
 862             a->i_satd_i4x4 = i_cost;
 863             if( h->mb.i_skip_intra )
 864             {
 865                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 866                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 867                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 868                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 869                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 870                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 871                 if( h->mb.i_skip_intra == 2 )
 872                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 873             }
 874         }
 875         else
 876             a->i_satd_i4x4 = COST_MAX;
 877     }
 878 }
 879
 880 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 881 {
 882     if( a->i_satd_i16x16 <= i_satd_thresh )
 883     {
 884         h->mb.i_type = I_16x16;
 885         x264_analyse_update_cache( h, a );
 886         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 887     }
 888     else
 889         a->i_satd_i16x16 = COST_MAX;
 890
 891     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 892     {
 893         h->mb.i_type = I_4x4;
 894         x264_analyse_update_cache( h, a );
 895         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 896     }
 897     else
 898         a->i_satd_i4x4 = COST_MAX;
 899
 900     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 901     {
 902         h->mb.i_type = I_8x8;
 903         x264_analyse_update_cache( h, a );
 904         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 905         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 906     }
 907     else
 908         a->i_satd_i8x8 = COST_MAX;
 909 }
 910
 /* Refine the intra prediction modes of the current macroblock using RD cost.
  *
  * Depending on h->mb.i_type this re-scores candidate modes with
  * x264_rd_cost_mb() (I_16x16), x264_rd_cost_i4x4() (I_4x4) or
  * x264_rd_cost_i8x8() (I_8x8) and stores the winners in a->i_predict16x16,
  * a->i_predict4x4[] or a->i_predict8x8[].  Independently of the luma type,
  * the chroma mode is re-scored with x264_rd_cost_i8x8_chroma().
  *
  * Side effects visible here: clears h->mb.i_skip_intra, writes prediction /
  * reconstruction pixels into h->mb.pic.p_fdec[], and updates the
  * intra4x4_pred_mode and non_zero_count caches, h->mb.i_chroma_pred_mode,
  * h->mb.i_cbp_chroma and a->i_cbp_i8x8_luma. */
 911 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 912 {
 913     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 914
 915     int i, idx, x, y;
 916     int i_mode, i_thresh;
 917     uint64_t i_satd, i_best;
 918     h->mb.i_skip_intra = 0;
 919
         /* I_16x16: try the other available modes with a whole-macroblock RD cost. */
 920     if( h->mb.i_type == I_16x16 )
 921     {
 922         int old_pred_mode = a->i_predict16x16;
 923         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
             /* Only modes whose SATD is within 9/8 of the current mode's SATD are tried;
              * the current mode's RD cost (a->i_satd_i16x16) is the score to beat. */
 924         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
 925         i_best = a->i_satd_i16x16;
 926         for( ; *predict_mode >= 0; predict_mode++ )
 927         {
 928             int i_mode = *predict_mode;
 929             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 930                 continue;
 931             h->mb.i_intra16x16_pred_mode = i_mode;
 932             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 933             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 934         }
 935     }
 936
 937     /* RD selection for chroma prediction */
 938     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
         /* Skipped entirely when the mode list holds a single entry. */
 939     if( predict_mode[1] >= 0 )
 940     {
 941         int8_t predict_mode_sorted[4];
 942         int i_max;
 943         i_thresh = a->i_satd_i8x8chroma * 5/4;
 944
             /* Collect the candidates: every available mode other than the current one
              * whose SATD is below the threshold.  Despite the array's name, nothing
              * here sorts it; entries stay in the order of the availability list. */
 945         for( i_max = 0; *predict_mode >= 0; predict_mode++ )
 946         {
 947             i_mode = *predict_mode;
 948             if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
 949                 predict_mode_sorted[i_max++] = i_mode;
 950         }
 951
 952         if( i_max > 0 )
 953         {
 954             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
 955             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
 956             /* the previous thing encoded was x264_intra_rd(), so the pixels and
 957              * coefs for the current chroma mode are still around, so we only
 958              * have to recount the bits. */
 959             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
 960             for( i = 0; i < i_max; i++ )
 961             {
 962                 i_mode = predict_mode_sorted[i];
 963                 if( h->mb.b_lossless )
 964                     x264_predict_lossless_8x8_chroma( h, i_mode );
 965                 else
 966                 {
 967                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 968                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 969                 }
 970                 /* if we've already found a mode that needs no residual, then
 971                  * probably any mode with a residual will be worse.
 972                  * so avoid dct on the remaining modes to improve speed. */
 973                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
 974                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
 975             }
                 /* Commit the winning chroma mode and the cbp recorded with it. */
 976             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 977             h->mb.i_cbp_chroma = i_cbp_chroma_best;
 978         }
 979     }
 980
         /* I_4x4: every available mode of every 4x4 block is RD-scored (no SATD threshold). */
 981     if( h->mb.i_type == I_4x4 )
 982     {
 983         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
 984         int i_nnz = 0;
 985         for( idx = 0; idx < 16; idx++ )
 986         {
 987             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 988             i_best = COST_MAX64;
 989
 990             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 991
 992             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 993                 /* emulate missing topright samples */
 994                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 995
 996             for( ; *predict_mode >= 0; predict_mode++ )
 997             {
 998                 i_mode = *predict_mode;
 999                 if( h->mb.b_lossless )
1000                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1001                 else
1002                     h->predict_4x4[i_mode]( p_dst_by );
1003                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1004
                     /* Remember the four rows of the block and its nnz entry as they stand
                      * after scoring the best mode so far.
                      * NOTE(review): presumably x264_rd_cost_i4x4 leaves the reconstructed
                      * block in p_dst_by and updates the nnz cache - confirm. */
1005                 if( i_best > i_satd )
1006                 {
1007                     a->i_predict4x4[idx] = i_mode;
1008                     i_best = i_satd;
1009                     pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1010                     pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1011                     pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1012                     pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1013                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1014                 }
1015             }
1016
                 /* Put back the pixels and nnz saved for the winning mode, since later
                  * trials overwrote them. */
1017             M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1018             M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1019             M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1020             M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1021             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1022
1023             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1024         }
1025     }
         /* I_8x8: modes are pre-filtered by their stored SATD before being RD-scored. */
1026     else if( h->mb.i_type == I_8x8 )
1027     {
1028         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1029         for( idx = 0; idx < 4; idx++ )
1030         {
1031             uint64_t pels_h = 0;
1032             uint8_t pels_v[7];
1033             uint16_t i_nnz[2] = {0}; //shut up gcc
1034             uint8_t *p_dst_by;
1035             int j;
1036             int cbp_luma_new = 0;
                 /* Candidates must have SATD within 11/8 of the current mode's SATD. */
1037             i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1038
1039             i_best = COST_MAX64;
1040             x = idx&1;
1041             y = idx>>1;
1042
1043             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1044             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1045             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1046
1047             for( ; *predict_mode >= 0; predict_mode++ )
1048             {
1049                 i_mode = *predict_mode;
1050                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1051                     continue;
1052
1053                 if( h->mb.b_lossless )
1054                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1055                 else
1056                     h->predict_8x8[i_mode]( p_dst_by, edge );
                     /* Every trial starts from the cbp accumulated over the previous blocks;
                      * the value read back below is kept only for the best mode. */
1057                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1058                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1059
1060                 if( i_best > i_satd )
1061                 {
1062                     a->i_predict8x8[idx] = i_mode;
1063                     cbp_luma_new = h->mb.i_cbp_luma;
1064                     i_best = i_satd;
1065
                         /* Only border pixels are saved: the 8 bytes of row 7 and, for
                          * blocks with even idx (x == 0), byte 7 of rows 0..6.
                          * NOTE(review): presumably these are the samples later blocks
                          * predict from - confirm. */
1066                     pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1067                     if( !(idx&1) )
1068                         for( j=0; j<7; j++ )
1069                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1070                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1071                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1072                 }
1073             }
                 /* Commit the cbp, border pixels and nnz saved for the winning mode. */
1074             a->i_cbp_i8x8_luma = cbp_luma_new;
1075             M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1076             if( !(idx&1) )
1077                 for( j=0; j<7; j++ )
1078                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1079             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1080             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1081
1082             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1083         }
1084     }
1085 }
1086
/* Helper macros for filling an x264_me_t before a motion search.
 *
 * All of them expect `h` (x264_t *) in the enclosing scope; LOAD_FENC and
 * REF_COST additionally expect `a` (x264_mb_analysis_t *).  Each
 * multi-statement macro is wrapped in do { } while( 0 ) so that it behaves as
 * a single statement (e.g. as the body of an unbraced if). */

/* Point `m` at the source (fenc) planes in `src`, offset by (xoff, yoff) in
 * plane 0; planes 1 and 2 use the halved offsets.  Also copies the mv cost
 * table from `a` and the plane strides from `h`. */
#define LOAD_FENC( m, src, xoff, yoff) \
    do { \
        (m)->p_cost_mv = a->p_cost_mv; \
        (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
        (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
        (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
        (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
        (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    } while( 0 )

/* Point `m` at the reference planes in `src` for reference `ref` of list
 * `list`: planes 0..3 at the full offset with stride[0], planes 4..5 at the
 * halved offset with stride[1], plus the matching integral plane.  Resets the
 * weight to weight_none and p_fref_w to plane 0; (m)->i_stride[] must already
 * be set (LOAD_FENC does that). */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    do { \
        (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
        (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
        (m)->integral = &h->mb.pic.p_integral[(list)][(ref)][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->weight = weight_none; \
        (m)->i_ref = (ref); \
    } while( 0 )

/* Override what LOAD_HPELS set up with the weighted plane `src` and the
 * slice weights of reference `ref`.  The weights are indexed by the `ref`
 * argument itself rather than by whatever variable named i_ref happens to be
 * in the caller's scope.  `list` is accepted for symmetry with LOAD_HPELS
 * but is not used. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    do { \
        (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->weight = h->sh.weight[(ref)]; \
    } while( 0 )

/* Cost of signalling reference `ref` of list `list`, looked up in `a`. */
#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
1112
1113 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1114 {
1115     x264_me_t m;
1116     int i_ref, i_mvc;
1117     ALIGNED_4( int16_t mvc[8][2] );
1118     int i_halfpel_thresh = INT_MAX;
1119     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1120
1121     /* 16x16 Search on all ref frame */
1122     m.i_pixel = PIXEL_16x16;
1123     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1124
1125     a->l0.me16x16.cost = INT_MAX;
1126     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1127     {
1128         const int i_ref_cost = REF_COST( 0, i_ref );
1129         i_halfpel_thresh -= i_ref_cost;
1130         m.i_ref_cost = i_ref_cost;
1131
1132         /* search with ref */
1133         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1134         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1135
1136         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1137         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1138
1139         if( h->mb.ref_blind_dupe == i_ref )
1140         {
1141             CP32( m.mv, a->l0.mvc[0][0] );
1142             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1143         }
1144         else
1145             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1146
1147         /* early termination
1148          * SSD threshold would probably be better than SATD */
1149         if( i_ref == 0
1150             && a->b_try_pskip
1151             && m.cost-m.cost_mv < 300*a->i_lambda
1152             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1153               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1154             && x264_macroblock_probe_pskip( h ) )
1155         {
1156             h->mb.i_type = P_SKIP;
1157             x264_analyse_update_cache( h, a );
1158             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1159             return;
1160         }
1161
1162         m.cost += i_ref_cost;
1163         i_halfpel_thresh += i_ref_cost;
1164
1165         if( m.cost < a->l0.me16x16.cost )
1166             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1167
1168         /* save mv for predicting neighbors */
1169         CP32( a->l0.mvc[i_ref][0], m.mv );
1170         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1171     }
1172
1173     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1174     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1175
1176     h->mb.i_type = P_L0;
1177     if( a->i_mbrd )
1178     {
1179         x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1180         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1181         {
1182             h->mb.i_partition = D_16x16;
1183             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1184             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1185             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1186                 h->mb.i_type = P_SKIP;
1187         }
1188     }
1189 }
1190
1191 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1192 {
1193     x264_me_t m;
1194     int i_ref;
1195     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1196     int i_halfpel_thresh = INT_MAX;
1197     int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1198     int i;
1199     int i_maxref = h->mb.pic.i_fref[0]-1;
1200
1201     h->mb.i_partition = D_8x8;
1202
1203     #define CHECK_NEIGHBOUR(i)\
1204     {\
1205         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1206         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1207             i_maxref = ref;\
1208     }
1209
1210     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1211      * than those used by the neighbors */
1212     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1213         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1214     {
1215         i_maxref = 0;
1216         CHECK_NEIGHBOUR(  -8 - 1 );
1217         CHECK_NEIGHBOUR(  -8 + 0 );
1218         CHECK_NEIGHBOUR(  -8 + 2 );
1219         CHECK_NEIGHBOUR(  -8 + 4 );
1220         CHECK_NEIGHBOUR(   0 - 1 );
1221         CHECK_NEIGHBOUR( 2*8 - 1 );
1222     }
1223
1224     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1225         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1226
1227     for( i = 0; i < 4; i++ )
1228     {
1229         x264_me_t *l0m = &a->l0.me8x8[i];
1230         const int x8 = i%2;
1231         const int y8 = i/2;
1232
1233         m.i_pixel = PIXEL_8x8;
1234
1235         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1236         l0m->cost = INT_MAX;
1237         for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1238         {
1239             const int i_ref_cost = REF_COST( 0, i_ref );
1240             m.i_ref_cost = i_ref_cost;
1241
1242             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1243             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1244
1245             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1246             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1247             if( h->mb.ref_blind_dupe == i_ref )
1248             {
1249                 CP32( m.mv, a->l0.mvc[0][i+1] );
1250                 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1251             }
1252             else
1253                 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1254
1255             m.cost += i_ref_cost;
1256             i_halfpel_thresh += i_ref_cost;
1257             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1258
1259             if( m.cost < l0m->cost )
1260                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1261             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1262                 i_ref = h->mb.ref_blind_dupe;
1263             else
1264                 i_ref++;
1265         }
1266         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1267         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1268
1269         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1270            are effectively zero. */
1271         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1272             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1273     }
1274
1275     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1276                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1277     /* P_8x8 ref0 has no ref cost */
1278     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1279                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1280         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1281     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1282     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1283 }
1284
1285 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1286 {
1287     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1288      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1289      * don't bother analysing the dupes. */
1290     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1291     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1292     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1293     int i_mvc;
1294     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1295     int i;
1296
1297     /* XXX Needed for x264_mb_predict_mv */
1298     h->mb.i_partition = D_8x8;
1299
1300     i_mvc = 1;
1301     CP32( mvc[0], a->l0.me16x16.mv );
1302
1303     for( i = 0; i < 4; i++ )
1304     {
1305         x264_me_t *m = &a->l0.me8x8[i];
1306         const int x8 = i%2;
1307         const int y8 = i/2;
1308
1309         m->i_pixel = PIXEL_8x8;
1310         m->i_ref_cost = i_ref_cost;
1311
1312         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1313         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1314         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1315
1316         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1317         x264_me_search( h, m, mvc, i_mvc );
1318
1319         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1320
1321         CP32( mvc[i_mvc], m->mv );
1322         i_mvc++;
1323
1324         /* mb type cost */
1325         m->cost += i_ref_cost;
1326         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1327             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1328     }
1329
1330     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1331                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1332     /* theoretically this should include 4*ref_cost,
1333      * but 3 seems a better approximation of cabac. */
1334     if( h->param.b_cabac )
1335         a->l0.i_cost8x8 -= i_ref_cost;
1336     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1337     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1338 }
1339
1340 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1341 {
1342     x264_me_t m;
1343     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1344     ALIGNED_4( int16_t mvc[3][2] );
1345     int i, j;
1346
1347     /* XXX Needed for x264_mb_predict_mv */
1348     h->mb.i_partition = D_16x8;
1349
1350     for( i = 0; i < 2; i++ )
1351     {
1352         x264_me_t *l0m = &a->l0.me16x8[i];
1353         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1354         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1355         const int ref8[2] = { minref, maxref };
1356         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1357
1358         m.i_pixel = PIXEL_16x8;
1359
1360         LOAD_FENC( &m, p_fenc, 0, 8*i );
1361         l0m->cost = INT_MAX;
1362         for( j = 0; j < i_ref8s; j++ )
1363         {
1364             const int i_ref = ref8[j];
1365             const int i_ref_cost = REF_COST( 0, i_ref );
1366             m.i_ref_cost = i_ref_cost;
1367
1368             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1369             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1370             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1371             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1372
1373             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1374             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1375
1376             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1377             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1378             /* We can only take this shortcut if the first search was performed on ref0. */
1379             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1380             {
1381                 /* We can just leave the MV from the previous ref search. */
1382                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1383             }
1384             else
1385                 x264_me_search( h, &m, mvc, 3 );
1386
1387             m.cost += i_ref_cost;
1388
1389             if( m.cost < l0m->cost )
1390                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1391         }
1392         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1393         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1394     }
1395
1396     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1397 }
1398
1399 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1400 {
1401     x264_me_t m;
1402     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1403     ALIGNED_4( int16_t mvc[3][2] );
1404     int i, j;
1405
1406     /* XXX Needed for x264_mb_predict_mv */
1407     h->mb.i_partition = D_8x16;
1408
1409     for( i = 0; i < 2; i++ )
1410     {
1411         x264_me_t *l0m = &a->l0.me8x16[i];
1412         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1413         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1414         const int ref8[2] = { minref, maxref };
1415         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1416
1417         m.i_pixel = PIXEL_8x16;
1418
1419         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1420         l0m->cost = INT_MAX;
1421         for( j = 0; j < i_ref8s; j++ )
1422         {
1423             const int i_ref = ref8[j];
1424             const int i_ref_cost = REF_COST( 0, i_ref );
1425             m.i_ref_cost = i_ref_cost;
1426
1427             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1428             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1429             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1430
1431             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1432             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1433
1434             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1435             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1436             /* We can only take this shortcut if the first search was performed on ref0. */
1437             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1438             {
1439                 /* We can just leave the MV from the previous ref search. */
1440                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1441             }
1442             else
1443                 x264_me_search( h, &m, mvc, 3 );
1444
1445             m.cost += i_ref_cost;
1446
1447             if( m.cost < l0m->cost )
1448                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1449         }
1450         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1451         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1452     }
1453
1454     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1455 }
1456
1457 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1458 {
1459     ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1460     uint8_t *pix2 = pix1+8;
1461     const int i_stride = h->mb.pic.i_stride[1];
1462     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1463     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1464     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1465     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1466     x264_weight_t *weight = h->sh.weight[i_ref];
1467
1468 #define CHROMA4x4MC( width, height, me, x, y ) \
1469     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1470     if( weight[1].weightfn ) \
1471         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1472     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1473     if( weight[2].weightfn ) \
1474         weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1475
1476
1477     if( pixel == PIXEL_4x4 )
1478     {
1479         x264_me_t *m = a->l0.me4x4[i8x8];
1480         CHROMA4x4MC( 2,2, m[0], 0,0 );
1481         CHROMA4x4MC( 2,2, m[1], 2,0 );
1482         CHROMA4x4MC( 2,2, m[2], 0,2 );
1483         CHROMA4x4MC( 2,2, m[3], 2,2 );
1484     }
1485     else if( pixel == PIXEL_8x4 )
1486     {
1487         x264_me_t *m = a->l0.me8x4[i8x8];
1488         CHROMA4x4MC( 4,2, m[0], 0,0 );
1489         CHROMA4x4MC( 4,2, m[1], 0,2 );
1490     }
1491     else
1492     {
1493         x264_me_t *m = a->l0.me4x8[i8x8];
1494         CHROMA4x4MC( 2,4, m[0], 0,0 );
1495         CHROMA4x4MC( 2,4, m[1], 2,0 );
1496     }
1497
1498     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1499          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1500 }
1501
1502 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1503 {
1504     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1505     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1506     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1507     int i4x4;
1508
1509     /* XXX Needed for x264_mb_predict_mv */
1510     h->mb.i_partition = D_8x8;
1511
1512     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1513     {
1514         const int idx = 4*i8x8 + i4x4;
1515         const int x4 = block_idx_x[idx];
1516         const int y4 = block_idx_y[idx];
1517         const int i_mvc = (i4x4 == 0);
1518
1519         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1520
1521         m->i_pixel = PIXEL_4x4;
1522
1523         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1524         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1525         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1526
1527         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1528         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1529
1530         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1531     }
1532     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1533                             a->l0.me4x4[i8x8][1].cost +
1534                             a->l0.me4x4[i8x8][2].cost +
1535                             a->l0.me4x4[i8x8][3].cost +
1536                             REF_COST( 0, i_ref ) +
1537                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1538     if( h->mb.b_chroma_me )
1539         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1540 }
1541
1542 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1543 {
1544     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1545     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1546     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1547     int i8x4;
1548
1549     /* XXX Needed for x264_mb_predict_mv */
1550     h->mb.i_partition = D_8x8;
1551
1552     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1553     {
1554         const int idx = 4*i8x8 + 2*i8x4;
1555         const int x4 = block_idx_x[idx];
1556         const int y4 = block_idx_y[idx];
1557         const int i_mvc = (i8x4 == 0);
1558
1559         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1560
1561         m->i_pixel = PIXEL_8x4;
1562
1563         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1564         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1565         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1566
1567         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1568         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1569
1570         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1571     }
1572     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1573                             REF_COST( 0, i_ref ) +
1574                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1575     if( h->mb.b_chroma_me )
1576         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1577 }
1578
1579 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1580 {
1581     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1582     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1583     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1584     int i4x8;
1585
1586     /* XXX Needed for x264_mb_predict_mv */
1587     h->mb.i_partition = D_8x8;
1588
1589     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1590     {
1591         const int idx = 4*i8x8 + i4x8;
1592         const int x4 = block_idx_x[idx];
1593         const int y4 = block_idx_y[idx];
1594         const int i_mvc = (i4x8 == 0);
1595
1596         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1597
1598         m->i_pixel = PIXEL_4x8;
1599
1600         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1601         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1602         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1603
1604         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1605         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1606
1607         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1608     }
1609     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1610                             REF_COST( 0, i_ref ) +
1611                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1612     if( h->mb.b_chroma_me )
1613         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1614 }
1615
1616 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1617 {
1618     /* Assumes that fdec still contains the results of
1619      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1620
1621     uint8_t **p_fenc = h->mb.pic.p_fenc;
1622     uint8_t **p_fdec = h->mb.pic.p_fdec;
1623     int i;
1624
1625     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1626     for( i = 0; i < 4; i++ )
1627     {
1628         const int x = (i&1)*8;
1629         const int y = (i>>1)*8;
1630         a->i_cost16x16direct +=
1631         a->i_cost8x8direct[i] =
1632             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1633
1634         /* mb type cost */
1635         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1636     }
1637 }
1638
1639 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1640 {
1641     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1642     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1643     uint8_t *src0, *src1;
1644     int stride0 = 16, stride1 = 16;
1645
1646     x264_me_t m;
1647     int i_ref, i_mvc;
1648     ALIGNED_4( int16_t mvc[9][2] );
1649     int i_halfpel_thresh = INT_MAX;
1650     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1651
1652     /* 16x16 Search on all ref frame */
1653     m.i_pixel = PIXEL_16x16;
1654     m.weight = weight_none;
1655
1656     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1657
1658     /* ME for List 0 */
1659     a->l0.me16x16.cost = INT_MAX;
1660     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1661     {
1662         const int i_ref_cost = REF_COST( 0, i_ref );
1663         m.i_ref_cost = i_ref_cost;
1664         /* search with ref */
1665         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1666         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1667         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1668         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1669
1670         /* add ref cost */
1671         m.cost += i_ref_cost;
1672
1673         if( m.cost < a->l0.me16x16.cost )
1674         {
1675             a->l0.i_ref = i_ref;
1676             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1677         }
1678
1679         /* save mv for predicting neighbors */
1680         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1681     }
1682     a->l0.me16x16.i_ref = a->l0.i_ref;
1683
1684     /* ME for list 1 */
1685     i_halfpel_thresh = INT_MAX;
1686     p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1687     a->l1.me16x16.cost = INT_MAX;
1688     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1689     {
1690         const int i_ref_cost = REF_COST( 0, i_ref );
1691         m.i_ref_cost = i_ref_cost;
1692         /* search with ref */
1693         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1694         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1695         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1696         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1697
1698         /* add ref cost */
1699         m.cost += i_ref_cost;
1700
1701         if( m.cost < a->l1.me16x16.cost )
1702         {
1703             a->l1.i_ref = i_ref;
1704             h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1705         }
1706
1707         /* save mv for predicting neighbors */
1708         CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1709     }
1710     a->l1.me16x16.i_ref = a->l1.i_ref;
1711
1712     /* get cost of BI mode */
1713     int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
1714     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1715     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1716     src0 = h->mc.get_ref( pix0, &stride0,
1717                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1718                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1719     src1 = h->mc.get_ref( pix1, &stride1,
1720                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1721                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1722
1723     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1724
1725     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1726                      + ref_costs
1727                      + a->l0.bi16x16.cost_mv
1728                      + a->l1.bi16x16.cost_mv;
1729
1730
1731     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1732     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1733     {
1734         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1735                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1736         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1737                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1738         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
1739                                 h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
1740                                 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1741         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1742                    + ref_costs + l0_mv_cost + l1_mv_cost;
1743         if( cost00 < a->i_cost16x16bi )
1744         {
1745             M32( a->l0.bi16x16.mv ) = 0;
1746             M32( a->l1.bi16x16.mv ) = 0;
1747             a->l0.bi16x16.cost_mv = l0_mv_cost;
1748             a->l1.bi16x16.cost_mv = l1_mv_cost;
1749             a->i_cost16x16bi = cost00;
1750         }
1751     }
1752
1753     /* mb type cost */
1754     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1755     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1756     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1757 }
1758
1759 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1760 {
1761     const int x = 2*(i%2);
1762     const int y = 2*(i/2);
1763
1764     switch( h->mb.i_sub_partition[i] )
1765     {
1766         case D_L0_8x8:
1767             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1768             break;
1769         case D_L0_8x4:
1770             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1771             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1772             break;
1773         case D_L0_4x8:
1774             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1775             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1776             break;
1777         case D_L0_4x4:
1778             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1779             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1780             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1781             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1782             break;
1783         default:
1784             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1785             break;
1786     }
1787 }
1788
1789 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1790 {
1791     const int x = 2*(idx&1);
1792     const int y = 2*(idx>>1);
1793     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1794     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1795     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1796     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1797 }
1798
1799 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1800     if( x264_mb_partition_listX_table[0][part] ) \
1801     { \
1802         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1803         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1804     } \
1805     else \
1806     { \
1807         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1808         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
1809         if( b_mvd ) \
1810             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1811     } \
1812     if( x264_mb_partition_listX_table[1][part] ) \
1813     { \
1814         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1815         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1816     } \
1817     else \
1818     { \
1819         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1820         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
1821         if( b_mvd ) \
1822             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1823     }
1824
1825 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1826 {
1827     int x = (i%2)*2;
1828     int y = (i/2)*2;
1829     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1830     {
1831         x264_mb_load_mv_direct8x8( h, i );
1832         if( b_mvd )
1833         {
1834             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1835             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1836             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1837         }
1838     }
1839     else
1840     {
1841         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1842     }
1843 }
1844 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1845 {
1846     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1847 }
1848 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1849 {
1850     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1851 }
1852 #undef CACHE_MV_BI
1853
1854 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1855 {
1856     uint8_t **p_fref[2] =
1857         { h->mb.pic.p_fref[0][a->l0.i_ref],
1858           h->mb.pic.p_fref[1][a->l1.i_ref] };
1859     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1860     int i, l;
1861
1862     /* XXX Needed for x264_mb_predict_mv */
1863     h->mb.i_partition = D_8x8;
1864
1865     a->i_cost8x8bi = 0;
1866
1867     for( i = 0; i < 4; i++ )
1868     {
1869         const int x8 = i%2;
1870         const int y8 = i/2;
1871         int i_part_cost;
1872         int i_part_cost_bi = 0;
1873         int stride[2] = {8,8};
1874         uint8_t *src[2];
1875
1876         for( l = 0; l < 2; l++ )
1877         {
1878             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1879             const int i_ref_cost = REF_COST( l, lX->i_ref );
1880             x264_me_t *m = &lX->me8x8[i];
1881
1882             m->i_pixel = PIXEL_8x8;
1883             m->i_ref_cost = i_ref_cost;
1884
1885             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1886             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1887
1888             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1889             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1890             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1891             m->cost += i_ref_cost;
1892
1893             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1894
1895             /* BI mode */
1896             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1897                                     m->mv[0], m->mv[1], 8, 8, weight_none );
1898             i_part_cost_bi += m->cost_mv + i_ref_cost;
1899         }
1900         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1901         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1902                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1903         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1904         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1905
1906         i_part_cost = a->l0.me8x8[i].cost;
1907         h->mb.i_sub_partition[i] = D_L0_8x8;
1908         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1909         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1910         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1911         a->i_cost8x8bi += i_part_cost;
1912
1913         /* XXX Needed for x264_mb_predict_mv */
1914         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1915     }
1916
1917     /* mb type cost */
1918     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1919 }
1920
1921 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1922 {
1923     uint8_t **p_fref[2] =
1924         { h->mb.pic.p_fref[0][a->l0.i_ref],
1925           h->mb.pic.p_fref[1][a->l1.i_ref] };
1926     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1927     ALIGNED_4( int16_t mvc[2][2] );
1928     int i, l;
1929
1930     h->mb.i_partition = D_16x8;
1931     a->i_cost16x8bi = 0;
1932
1933     for( i = 0; i < 2; i++ )
1934     {
1935         int i_part_cost;
1936         int i_part_cost_bi = 0;
1937         int stride[2] = {16,16};
1938         uint8_t *src[2];
1939
1940         /* TODO: check only the list(s) that were used in b8x8? */
1941         for( l = 0; l < 2; l++ )
1942         {
1943             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1944             const int i_ref_cost = REF_COST( l, lX->i_ref );
1945             x264_me_t *m = &lX->me16x8[i];
1946
1947             m->i_pixel = PIXEL_16x8;
1948             m->i_ref_cost = i_ref_cost;
1949
1950             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1951             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1952
1953             CP32( mvc[0], lX->me8x8[2*i].mv );
1954             CP32( mvc[1], lX->me8x8[2*i+1].mv );
1955
1956             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1957             x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1958             x264_me_search( h, m, mvc, 2 );
1959             m->cost += i_ref_cost;
1960
1961             /* BI mode */
1962             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1963                                     m->mv[0], m->mv[1], 16, 8, weight_none );
1964             i_part_cost_bi += m->cost_mv + i_ref_cost;
1965         }
1966         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1967         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1968
1969         i_part_cost = a->l0.me16x8[i].cost;
1970         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1971         if( a->l1.me16x8[i].cost < i_part_cost )
1972         {
1973             i_part_cost = a->l1.me16x8[i].cost;
1974             a->i_mb_partition16x8[i] = D_L1_8x8;
1975         }
1976         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1977         {
1978             i_part_cost = i_part_cost_bi;
1979             a->i_mb_partition16x8[i] = D_BI_8x8;
1980         }
1981         a->i_cost16x8bi += i_part_cost;
1982
1983         x264_mb_cache_mv_b16x8( h, a, i, 0 );
1984     }
1985
1986     /* mb type cost */
1987     a->i_mb_type16x8 = B_L0_L0
1988         + (a->i_mb_partition16x8[0]>>2) * 3
1989         + (a->i_mb_partition16x8[1]>>2);
1990     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1991 }
1992
1993 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1994 {
1995     uint8_t **p_fref[2] =
1996         { h->mb.pic.p_fref[0][a->l0.i_ref],
1997           h->mb.pic.p_fref[1][a->l1.i_ref] };
1998     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
1999     ALIGNED_4( int16_t mvc[2][2] );
2000     int i, l;
2001
2002     h->mb.i_partition = D_8x16;
2003     a->i_cost8x16bi = 0;
2004
2005     for( i = 0; i < 2; i++ )
2006     {
2007         int i_part_cost;
2008         int i_part_cost_bi = 0;
2009         int stride[2] = {8,8};
2010         uint8_t *src[2];
2011
2012         for( l = 0; l < 2; l++ )
2013         {
2014             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2015             const int i_ref_cost = REF_COST( l, lX->i_ref );
2016             x264_me_t *m = &lX->me8x16[i];
2017
2018             m->i_pixel = PIXEL_8x16;
2019             m->i_ref_cost = i_ref_cost;
2020
2021             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
2022             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
2023
2024             CP32( mvc[0], lX->me8x8[i].mv );
2025             CP32( mvc[1], lX->me8x8[i+2].mv );
2026
2027             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2028             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2029             x264_me_search( h, m, mvc, 2 );
2030             m->cost += i_ref_cost;
2031
2032             /* BI mode */
2033             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
2034                                     m->mv[0], m->mv[1], 8, 16, weight_none );
2035             i_part_cost_bi += m->cost_mv + i_ref_cost;
2036         }
2037
2038         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2039         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2040
2041         i_part_cost = a->l0.me8x16[i].cost;
2042         a->i_mb_partition8x16[i] = D_L0_8x8;
2043         if( a->l1.me8x16[i].cost < i_part_cost )
2044         {
2045             i_part_cost = a->l1.me8x16[i].cost;
2046             a->i_mb_partition8x16[i] = D_L1_8x8;
2047         }
2048         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2049         {
2050             i_part_cost = i_part_cost_bi;
2051             a->i_mb_partition8x16[i] = D_BI_8x8;
2052         }
2053         a->i_cost8x16bi += i_part_cost;
2054
2055         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2056     }
2057
2058     /* mb type cost */
2059     a->i_mb_type8x16 = B_L0_L0
2060         + (a->i_mb_partition8x16[0]>>2) * 3
2061         + (a->i_mb_partition8x16[1]>>2);
2062     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2063 }
2064
2065 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2066 {
2067     int thresh = i_satd * 5/4;
2068
2069     h->mb.i_type = P_L0;
2070     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2071     {
2072         h->mb.i_partition = D_16x16;
2073         x264_analyse_update_cache( h, a );
2074         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2075     }
2076
2077     if( a->l0.i_cost16x8 <= thresh )
2078     {
2079         h->mb.i_partition = D_16x8;
2080         x264_analyse_update_cache( h, a );
2081         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2082     }
2083     else
2084         a->l0.i_cost16x8 = COST_MAX;
2085
2086     if( a->l0.i_cost8x16 <= thresh )
2087     {
2088         h->mb.i_partition = D_8x16;
2089         x264_analyse_update_cache( h, a );
2090         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2091     }
2092     else
2093         a->l0.i_cost8x16 = COST_MAX;
2094
2095     if( a->l0.i_cost8x8 <= thresh )
2096     {
2097         h->mb.i_type = P_8x8;
2098         h->mb.i_partition = D_8x8;
2099         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2100         {
2101             int i;
2102             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2103             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2104             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2105             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2106             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2107              * for future blocks are those left over from previous RDO calls. */
2108             for( i = 0; i < 4; i++ )
2109             {
2110                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2111                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2112                 int subtype, btype = D_L0_8x8;
2113                 uint64_t bcost = COST_MAX64;
2114                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2115                 {
2116                     uint64_t cost;
2117                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2118                         continue;
2119                     h->mb.i_sub_partition[i] = subtype;
2120                     x264_mb_cache_mv_p8x8( h, a, i );
2121                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2122                     COPY2_IF_LT( bcost, cost, btype, subtype );
2123                 }
2124                 if( h->mb.i_sub_partition[i] != btype )
2125                 {
2126                     h->mb.i_sub_partition[i] = btype;
2127                     x264_mb_cache_mv_p8x8( h, a, i );
2128                 }
2129             }
2130         }
2131         else
2132             x264_analyse_update_cache( h, a );
2133         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2134     }
2135     else
2136         a->l0.i_cost8x8 = COST_MAX;
2137 }
2138
2139 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2140 {
2141     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2142
2143     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2144     {
2145         h->mb.i_type = B_DIRECT;
2146         /* Assumes direct/skip MC is still in fdec */
2147         /* Requires b-rdo to be done before intra analysis */
2148         h->mb.b_skip_mc = 1;
2149         x264_analyse_update_cache( h, a );
2150         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2151         h->mb.b_skip_mc = 0;
2152     }
2153
2154     //FIXME not all the update_cache calls are needed
2155     h->mb.i_partition = D_16x16;
2156     /* L0 */
2157     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2158     {
2159         h->mb.i_type = B_L0_L0;
2160         x264_analyse_update_cache( h, a );
2161         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2162     }
2163
2164     /* L1 */
2165     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2166     {
2167         h->mb.i_type = B_L1_L1;
2168         x264_analyse_update_cache( h, a );
2169         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2170     }
2171
2172     /* BI */
2173     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2174     {
2175         h->mb.i_type = B_BI_BI;
2176         x264_analyse_update_cache( h, a );
2177         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2178     }
2179
2180     /* 8x8 */
2181     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2182     {
2183         h->mb.i_type = B_8x8;
2184         h->mb.i_partition = D_8x8;
2185         x264_analyse_update_cache( h, a );
2186         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2187         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2188     }
2189
2190     /* 16x8 */
2191     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2192     {
2193         h->mb.i_type = a->i_mb_type16x8;
2194         h->mb.i_partition = D_16x8;
2195         x264_analyse_update_cache( h, a );
2196         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2197     }
2198
2199     /* 8x16 */
2200     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2201     {
2202         h->mb.i_type = a->i_mb_type8x16;
2203         h->mb.i_partition = D_8x16;
2204         x264_analyse_update_cache( h, a );
2205         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2206     }
2207 }
2208
2209 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2210 {
2211     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2212     int i;
2213
2214     if( IS_INTRA(h->mb.i_type) )
2215         return;
2216
2217     switch( h->mb.i_partition )
2218     {
2219         case D_16x16:
2220             if( h->mb.i_type == B_BI_BI )
2221                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2222             break;
2223         case D_16x8:
2224             for( i=0; i<2; i++ )
2225                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2226                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2227             break;
2228         case D_8x16:
2229             for( i=0; i<2; i++ )
2230                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2231                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2232             break;
2233         case D_8x8:
2234             for( i=0; i<4; i++ )
2235                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2236                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2237             break;
2238     }
2239 }
2240
2241 static inline void x264_mb_analyse_transform( x264_t *h )
2242 {
2243     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2244     {
2245         int i_cost4, i_cost8;
2246         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2247         x264_mb_mc( h );
2248
2249         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2250                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2251         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2252                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2253
2254         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2255         h->mb.b_skip_mc = 1;
2256     }
2257 }
2258
2259 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2260 {
2261     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2262     {
2263         int i_rd8;
2264         x264_analyse_update_cache( h, a );
2265         h->mb.b_transform_8x8 ^= 1;
2266         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2267         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2268
2269         if( *i_rd >= i_rd8 )
2270         {
2271             if( *i_rd > 0 )
2272                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2273             *i_rd = i_rd8;
2274         }
2275         else
2276             h->mb.b_transform_8x8 ^= 1;
2277     }
2278 }
2279
2280 /* Rate-distortion optimal QP selection.
2281  * FIXME: More than half of the benefit of this function seems to be
2282  * in the way it improves the coding of chroma DC (by decimating or
2283  * finding a better way to code a single DC coefficient.)
2284  * There must be a more efficient way to get that portion of the benefit
2285  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2286  * trick. */
2287 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2288 {
2289     int bcost, cost, direction, failures, prevcost, origcost;
2290     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2291     int last_qp_tried = 0;
2292     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2293     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2294
2295     /* If CBP is already zero, don't raise the quantizer any higher. */
2296     for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2297     {
2298         /* Without psy-RD, require monotonicity when moving quant away from previous
2299          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2300          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2301          * allow 2 failures when moving quant towards previous quant.
2302          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2303         int threshold = (!!h->mb.i_psy_rd);
2304         /* Raise the threshold for failures if we're moving towards the last QP. */
2305         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2306             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2307             threshold++;
2308         h->mb.i_qp = orig_qp;
2309         failures = 0;
2310         prevcost = origcost;
2311
2312         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2313          * (up to a point) will too.  So, jump down to where the threshold will kick in
2314          * and check the QP there.  If the CBP is still empty, skip the main loop.
2315          * If it isn't empty, we would have ended up having to check this QP anyways,
2316          * so as long as we store it for later lookup, we lose nothing. */
2317         int already_checked_qp = -1;
2318         int already_checked_cost = COST_MAX;
2319         if( direction == -1 )
2320         {
2321             if( !origcbp )
2322             {
2323                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2324                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2325                 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2326                 if( !h->mb.cbp[h->mb.i_mb_xy] )
2327                 {
2328                     /* If our empty-CBP block is lower QP than the last QP,
2329                      * the last QP almost surely doesn't have a CBP either. */
2330                     if( h->mb.i_last_qp > h->mb.i_qp )
2331                         last_qp_tried = 1;
2332                     break;
2333                 }
2334                 already_checked_qp = h->mb.i_qp;
2335                 h->mb.i_qp = orig_qp;
2336             }
2337         }
2338
2339         h->mb.i_qp += direction;
2340         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2341         {
2342             if( h->mb.i_last_qp == h->mb.i_qp )
2343                 last_qp_tried = 1;
2344             if( h->mb.i_qp == already_checked_qp )
2345                 cost = already_checked_cost;
2346             else
2347             {
2348                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2349                 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2350                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2351             }
2352
2353             /* We can't assume that the costs are monotonic over QPs.
2354              * Tie case-as-failure seems to give better results. */
2355             if( cost < prevcost )
2356                 failures = 0;
2357             else
2358                 failures++;
2359             prevcost = cost;
2360
2361             if( failures > threshold )
2362                 break;
2363             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2364                 break;
2365             h->mb.i_qp += direction;
2366         }
2367     }
2368
2369     /* Always try the last block's QP. */
2370     if( !last_qp_tried )
2371     {
2372         h->mb.i_qp = h->mb.i_last_qp;
2373         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2374         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2375         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2376     }
2377
2378     h->mb.i_qp = bqp;
2379     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2380
2381     /* Check transform again; decision from before may no longer be optimal. */
2382     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2383         x264_mb_transform_8x8_allowed( h ) )
2384     {
2385         h->mb.b_transform_8x8 ^= 1;
2386         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2387         if( cost > bcost )
2388             h->mb.b_transform_8x8 ^= 1;
2389     }
2390 }
2391
2392 /*****************************************************************************
2393  * x264_macroblock_analyse:
2394  *****************************************************************************/
2395 void x264_macroblock_analyse( x264_t *h )
2396 {
2397     x264_mb_analysis_t analysis;
2398     int i_cost = COST_MAX;
2399     int i;
2400
2401     h->mb.i_qp = x264_ratecontrol_qp( h );
2402     if( h->param.rc.i_aq_mode )
2403     {
2404         x264_adaptive_quant( h );
2405         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2406          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2407         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2408             h->mb.i_qp = h->mb.i_last_qp;
2409     }
2410
2411     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2412
2413     /*--------------------------- Do the analysis ---------------------------*/
2414     if( h->sh.i_type == SLICE_TYPE_I )
2415     {
2416 intra_analysis:
2417         if( analysis.i_mbrd )
2418             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2419         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2420         if( analysis.i_mbrd )
2421             x264_intra_rd( h, &analysis, COST_MAX );
2422
2423         i_cost = analysis.i_satd_i16x16;
2424         h->mb.i_type = I_16x16;
2425         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2426         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2427         if( analysis.i_satd_pcm < i_cost )
2428             h->mb.i_type = I_PCM;
2429
2430         else if( analysis.i_mbrd >= 2 )
2431             x264_intra_rd_refine( h, &analysis );
2432     }
2433     else if( h->sh.i_type == SLICE_TYPE_P )
2434     {
2435         int b_skip = 0;
2436
2437         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2438
2439         analysis.b_try_pskip = 0;
2440         if( analysis.b_force_intra )
2441         {
2442             if( !h->param.analyse.b_psy )
2443             {
2444                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2445                 goto intra_analysis;
2446             }
2447         }
2448         else
2449         {
2450             /* Fast P_SKIP detection */
2451             if( h->param.analyse.b_fast_pskip )
2452             {
2453                 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2454                     // FIXME don't need to check this if the reference frame is done
2455                     {}
2456                 else if( h->param.analyse.i_subpel_refine >= 3 )
2457                     analysis.b_try_pskip = 1;
2458                 else if( h->mb.i_mb_type_left == P_SKIP ||
2459                          h->mb.i_mb_type_top == P_SKIP ||
2460                          h->mb.i_mb_type_topleft == P_SKIP ||
2461                          h->mb.i_mb_type_topright == P_SKIP )
2462                     b_skip = x264_macroblock_probe_pskip( h );
2463             }
2464         }
2465
2466         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2467
2468         if( b_skip )
2469         {
2470             h->mb.i_type = P_SKIP;
2471             h->mb.i_partition = D_16x16;
2472             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2473         }
2474         else
2475         {
2476             const unsigned int flags = h->param.analyse.inter;
2477             int i_type;
2478             int i_partition;
2479             int i_thresh16x8;
2480             int i_satd_inter, i_satd_intra;
2481
2482             x264_mb_analyse_load_costs( h, &analysis );
2483
2484             x264_mb_analyse_inter_p16x16( h, &analysis );
2485
2486             if( h->mb.i_type == P_SKIP )
2487                 return;
2488
2489             if( flags & X264_ANALYSE_PSUB16x16 )
2490             {
2491                 if( h->param.analyse.b_mixed_references )
2492                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2493                 else
2494                     x264_mb_analyse_inter_p8x8( h, &analysis );
2495             }
2496
2497             /* Select best inter mode */
2498             i_type = P_L0;
2499             i_partition = D_16x16;
2500             i_cost = analysis.l0.me16x16.cost;
2501
2502             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2503                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2504             {
2505                 i_type = P_8x8;
2506                 i_partition = D_8x8;
2507                 i_cost = analysis.l0.i_cost8x8;
2508
2509                 /* Do sub 8x8 */
2510                 if( flags & X264_ANALYSE_PSUB8x8 )
2511                 {
2512                     for( i = 0; i < 4; i++ )
2513                     {
2514                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2515                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2516                         {
2517                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2518                             h->mb.i_sub_partition[i] = D_L0_4x4;
2519
2520                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2521                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2522                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2523
2524                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2525                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2526                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2527
2528                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2529                         }
2530                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2531                     }
2532                     analysis.l0.i_cost8x8 = i_cost;
2533                 }
2534             }
2535
2536             /* Now do 16x8/8x16 */
2537             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2538             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2539                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2540             {
2541                 x264_mb_analyse_inter_p16x8( h, &analysis );
2542                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2543
2544                 x264_mb_analyse_inter_p8x16( h, &analysis );
2545                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2546             }
2547
2548             h->mb.i_partition = i_partition;
2549
2550             /* refine qpel */
2551             //FIXME mb_type costs?
2552             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2553             {
2554                 /* refine later */
2555             }
2556             else if( i_partition == D_16x16 )
2557             {
2558                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2559                 i_cost = analysis.l0.me16x16.cost;
2560             }
2561             else if( i_partition == D_16x8 )
2562             {
2563                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2564                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2565                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2566             }
2567             else if( i_partition == D_8x16 )
2568             {
2569                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2570                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2571                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2572             }
2573             else if( i_partition == D_8x8 )
2574             {
2575                 int i8x8;
2576                 i_cost = 0;
2577                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2578                 {
2579                     switch( h->mb.i_sub_partition[i8x8] )
2580                     {
2581                         case D_L0_8x8:
2582                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2583                             i_cost += analysis.l0.me8x8[i8x8].cost;
2584                             break;
2585                         case D_L0_8x4:
2586                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2587                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2588                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2589                                       analysis.l0.me8x4[i8x8][1].cost;
2590                             break;
2591                         case D_L0_4x8:
2592                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2593                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2594                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2595                                       analysis.l0.me4x8[i8x8][1].cost;
2596                             break;
2597
2598                         case D_L0_4x4:
2599                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2600                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2601                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2602                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2603                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2604                                       analysis.l0.me4x4[i8x8][1].cost +
2605                                       analysis.l0.me4x4[i8x8][2].cost +
2606                                       analysis.l0.me4x4[i8x8][3].cost;
2607                             break;
2608                         default:
2609                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2610                             break;
2611                     }
2612                 }
2613             }
2614
2615             if( h->mb.b_chroma_me )
2616             {
2617                 x264_mb_analyse_intra_chroma( h, &analysis );
2618                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2619                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2620                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2621                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2622             }
2623             else
2624                 x264_mb_analyse_intra( h, &analysis, i_cost );
2625
2626             i_satd_inter = i_cost;
2627             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2628                                       analysis.i_satd_i8x8,
2629                                       analysis.i_satd_i4x4 );
2630
2631             if( analysis.i_mbrd )
2632             {
2633                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2634                 i_type = P_L0;
2635                 i_partition = D_16x16;
2636                 i_cost = analysis.l0.i_rd16x16;
2637                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2638                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2639                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2640                 h->mb.i_type = i_type;
2641                 h->mb.i_partition = i_partition;
2642                 if( i_cost < COST_MAX )
2643                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2644                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2645             }
2646
2647             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2648             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2649             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2650             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2651
2652             h->mb.i_type = i_type;
2653
2654             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2655             {
2656                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2657                  * it was an inter block. */
2658                 x264_analyse_update_cache( h, &analysis );
2659                 x264_macroblock_encode( h );
2660                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2661                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2662                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2663                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2664                 goto intra_analysis;
2665             }
2666
2667             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2668             {
2669                 if( IS_INTRA( h->mb.i_type ) )
2670                 {
2671                     x264_intra_rd_refine( h, &analysis );
2672                 }
2673                 else if( i_partition == D_16x16 )
2674                 {
2675                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2676                     analysis.l0.me16x16.cost = i_cost;
2677                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2678                 }
2679                 else if( i_partition == D_16x8 )
2680                 {
2681                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2682                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2683                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2684                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2685                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2686                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2687                 }
2688                 else if( i_partition == D_8x16 )
2689                 {
2690                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2691                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2692                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2693                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2694                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2695                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2696                 }
2697                 else if( i_partition == D_8x8 )
2698                 {
2699                     int i8x8;
2700                     x264_analyse_update_cache( h, &analysis );
2701                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2702                     {
2703                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2704                         {
2705                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2706                         }
2707                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2708                         {
2709                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2710                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2711                         }
2712                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2713                         {
2714                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2715                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2716                         }
2717                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2718                         {
2719                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2720                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2721                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2722                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2723                         }
2724                     }
2725                 }
2726             }
2727         }
2728     }
2729     else if( h->sh.i_type == SLICE_TYPE_B )
2730     {
2731         int i_bskip_cost = COST_MAX;
2732         int b_skip = 0;
2733
2734         if( analysis.i_mbrd )
2735             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2736
2737         h->mb.i_type = B_SKIP;
2738         if( h->mb.b_direct_auto_write )
2739         {
2740             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2741             for( i = 0; i < 2; i++ )
2742             {
2743                 int b_changed = 1;
2744                 h->sh.b_direct_spatial_mv_pred ^= 1;
2745                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2746                 if( analysis.b_direct_available )
2747                 {
2748                     if( b_changed )
2749                     {
2750                         x264_mb_mc( h );
2751                         b_skip = x264_macroblock_probe_bskip( h );
2752                     }
2753                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2754                 }
2755                 else
2756                     b_skip = 0;
2757             }
2758         }
2759         else
2760             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2761
2762         if( analysis.b_direct_available )
2763         {
2764             if( !h->mb.b_direct_auto_write )
2765                 x264_mb_mc( h );
2766             if( analysis.i_mbrd )
2767             {
2768                 i_bskip_cost = ssd_mb( h );
2769                 /* 6 = minimum cavlc cost of a non-skipped MB */
2770                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2771             }
2772             else if( !h->mb.b_direct_auto_write )
2773             {
2774                 /* Conditioning the probe on neighboring block types
2775                  * doesn't seem to help speed or quality. */
2776                 b_skip = x264_macroblock_probe_bskip( h );
2777             }
2778         }
2779
2780         if( !b_skip )
2781         {
2782             const unsigned int flags = h->param.analyse.inter;
2783             int i_type;
2784             int i_partition;
2785             int i_satd_inter;
2786             h->mb.b_skip_mc = 0;
2787
2788             x264_mb_analyse_load_costs( h, &analysis );
2789
2790             /* select best inter mode */
2791             /* direct must be first */
2792             if( analysis.b_direct_available )
2793                 x264_mb_analyse_inter_direct( h, &analysis );
2794
2795             x264_mb_analyse_inter_b16x16( h, &analysis );
2796
2797             i_type = B_L0_L0;
2798             i_partition = D_16x16;
2799             i_cost = analysis.l0.me16x16.cost;
2800             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2801             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2802             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2803
2804             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2805             {
2806                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2807                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2808                     i_bskip_cost < analysis.i_rd16x16bi &&
2809                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2810                     i_bskip_cost < analysis.l1.i_rd16x16 )
2811                 {
2812                     h->mb.i_type = B_SKIP;
2813                     x264_analyse_update_cache( h, &analysis );
2814                     return;
2815                 }
2816             }
2817
2818             if( flags & X264_ANALYSE_BSUB16x16 )
2819             {
2820                 x264_mb_analyse_inter_b8x8( h, &analysis );
2821                 if( analysis.i_cost8x8bi < i_cost )
2822                 {
2823                     i_type = B_8x8;
2824                     i_partition = D_8x8;
2825                     i_cost = analysis.i_cost8x8bi;
2826
2827                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2828                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2829                     {
2830                         x264_mb_analyse_inter_b16x8( h, &analysis );
2831                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2832                                      i_type, analysis.i_mb_type16x8,
2833                                      i_partition, D_16x8 );
2834                     }
2835                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2836                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2837                     {
2838                         x264_mb_analyse_inter_b8x16( h, &analysis );
2839                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2840                                      i_type, analysis.i_mb_type8x16,
2841                                      i_partition, D_8x16 );
2842                     }
2843                 }
2844             }
2845
2846             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2847             {
2848                 /* refine later */
2849             }
2850             /* refine qpel */
2851             else if( i_partition == D_16x16 )
2852             {
2853                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2854                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2855                 if( i_type == B_L0_L0 )
2856                 {
2857                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2858                     i_cost = analysis.l0.me16x16.cost
2859                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2860                 }
2861                 else if( i_type == B_L1_L1 )
2862                 {
2863                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2864                     i_cost = analysis.l1.me16x16.cost
2865                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2866                 }
2867                 else if( i_type == B_BI_BI )
2868                 {
2869                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2870                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2871                 }
2872             }
2873             else if( i_partition == D_16x8 )
2874             {
2875                 for( i=0; i<2; i++ )
2876                 {
2877                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2878                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2879                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2880                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2881                 }
2882             }
2883             else if( i_partition == D_8x16 )
2884             {
2885                 for( i=0; i<2; i++ )
2886                 {
2887                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2888                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2889                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2890                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2891                 }
2892             }
2893             else if( i_partition == D_8x8 )
2894             {
2895                 for( i=0; i<4; i++ )
2896                 {
2897                     x264_me_t *m;
2898                     int i_part_cost_old;
2899                     int i_type_cost;
2900                     int i_part_type = h->mb.i_sub_partition[i];
2901                     int b_bidir = (i_part_type == D_BI_8x8);
2902
2903                     if( i_part_type == D_DIRECT_8x8 )
2904                         continue;
2905                     if( x264_mb_partition_listX_table[0][i_part_type] )
2906                     {
2907                         m = &analysis.l0.me8x8[i];
2908                         i_part_cost_old = m->cost;
2909                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2910                         m->cost -= i_type_cost;
2911                         x264_me_refine_qpel( h, m );
2912                         if( !b_bidir )
2913                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2914                     }
2915                     if( x264_mb_partition_listX_table[1][i_part_type] )
2916                     {
2917                         m = &analysis.l1.me8x8[i];
2918                         i_part_cost_old = m->cost;
2919                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2920                         m->cost -= i_type_cost;
2921                         x264_me_refine_qpel( h, m );
2922                         if( !b_bidir )
2923                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2924                     }
2925                     /* TODO: update mvp? */
2926                 }
2927             }
2928
2929             i_satd_inter = i_cost;
2930
2931             if( analysis.i_mbrd )
2932             {
2933                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2934                 i_type = B_SKIP;
2935                 i_cost = i_bskip_cost;
2936                 i_partition = D_16x16;
2937                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2938                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2939                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2940                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2941                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2942                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2943                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2944
2945                 h->mb.i_type = i_type;
2946                 h->mb.i_partition = i_partition;
2947             }
2948
2949             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2950
2951             if( analysis.i_mbrd )
2952             {
2953                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2954                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2955             }
2956
2957             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2958             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2959             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2960             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2961
2962             h->mb.i_type = i_type;
2963             h->mb.i_partition = i_partition;
2964
2965             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2966                 x264_intra_rd_refine( h, &analysis );
2967             if( h->mb.i_subpel_refine >= 5 )
2968                 x264_refine_bidir( h, &analysis );
2969
2970             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2971             {
2972                 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2973                 x264_analyse_update_cache( h, &analysis );
2974
2975                 if( i_partition == D_16x16 )
2976                 {
2977                     if( i_type == B_L0_L0 )
2978                     {
2979                         analysis.l0.me16x16.cost = i_cost;
2980                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2981                     }
2982                     else if( i_type == B_L1_L1 )
2983                     {
2984                         analysis.l1.me16x16.cost = i_cost;
2985                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2986                     }
2987                     else if( i_type == B_BI_BI )
2988                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
2989                 }
2990                 else if( i_partition == D_16x8 )
2991                 {
2992                     for( i = 0; i < 2; i++ )
2993                     {
2994                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2995                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2996                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2997                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2998                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2999                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3000                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3001                     }
3002                 }
3003                 else if( i_partition == D_8x16 )
3004                 {
3005                     for( i = 0; i < 2; i++ )
3006                     {
3007                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3008                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3009                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3010                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3011                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3012                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3013                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3014                     }
3015                 }
3016                 else if( i_partition == D_8x8 )
3017                 {
3018                     for( i = 0; i < 4; i++ )
3019                     {
3020                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3021                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3022                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3023                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3024                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3025                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3026                     }
3027                 }
3028             }
3029         }
3030     }
3031
3032     x264_analyse_update_cache( h, &analysis );
3033
3034     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3035      * without realizing it.  Check for this and account for it if necessary. */
3036     if( analysis.i_mbrd >= 2 )
3037     {
3038         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3039         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3040         int list = check_mv_lists[h->mb.i_type] - 1;
3041         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3042             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3043             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3044                 h->mb.i_partition = D_16x16;
3045     }
3046
3047     if( !analysis.i_mbrd )
3048         x264_mb_analyse_transform( h );
3049
3050     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3051         x264_mb_analyse_qp_rd( h, &analysis );
3052
3053     h->mb.b_trellis = h->param.analyse.i_trellis;
3054     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3055     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3056         x264_psy_trellis_init( h, 0 );
3057     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3058         h->mb.i_skip_intra = 0;
3059 }
3060
3061 /*-------------------- Update MB from the analysis ----------------------*/
3062 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3063 {
3064     int i;
3065
3066     switch( h->mb.i_type )
3067     {
3068         case I_4x4:
3069             for( i = 0; i < 16; i++ )
3070                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3071
3072             x264_mb_analyse_intra_chroma( h, a );
3073             break;
3074         case I_8x8:
3075             for( i = 0; i < 4; i++ )
3076                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3077
3078             x264_mb_analyse_intra_chroma( h, a );
3079             break;
3080         case I_16x16:
3081             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3082             x264_mb_analyse_intra_chroma( h, a );
3083             break;
3084
3085         case I_PCM:
3086             break;
3087
3088         case P_L0:
3089             switch( h->mb.i_partition )
3090             {
3091                 case D_16x16:
3092                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3093                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3094                     break;
3095
3096                 case D_16x8:
3097                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3098                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3099                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3100                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3101                     break;
3102
3103                 case D_8x16:
3104                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3105                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3106                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3107                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3108                     break;
3109
3110                 default:
3111                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3112                     break;
3113             }
3114             break;
3115
3116         case P_8x8:
3117             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3118             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3119             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3120             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3121             for( i = 0; i < 4; i++ )
3122                 x264_mb_cache_mv_p8x8( h, a, i );
3123             break;
3124
3125         case P_SKIP:
3126         {
3127             h->mb.i_partition = D_16x16;
3128             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3129             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3130             break;
3131         }
3132
3133         case B_SKIP:
3134         case B_DIRECT:
3135             h->mb.i_partition = h->mb.cache.direct_partition;
3136             x264_mb_load_mv_direct8x8( h, 0 );
3137             x264_mb_load_mv_direct8x8( h, 1 );
3138             x264_mb_load_mv_direct8x8( h, 2 );
3139             x264_mb_load_mv_direct8x8( h, 3 );
3140             break;
3141
3142         case B_8x8:
3143             /* optimize: cache might not need to be rewritten */
3144             for( i = 0; i < 4; i++ )
3145                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3146             break;
3147
3148         default: /* the rest of the B types */
3149             switch( h->mb.i_partition )
3150             {
3151             case D_16x16:
3152                 switch( h->mb.i_type )
3153                 {
3154                 case B_L0_L0:
3155                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3156                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3157
3158                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3159                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3160                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3161                     break;
3162                 case B_L1_L1:
3163                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3164                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3165                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3166
3167                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3168                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3169                     break;
3170                 case B_BI_BI:
3171                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3172                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3173
3174                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3175                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3176                     break;
3177                 }
3178                 break;
3179             case D_16x8:
3180                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3181                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3182                 break;
3183             case D_8x16:
3184                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3185                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3186                 break;
3187             default:
3188                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3189                 break;
3190             }
3191     }
3192
3193 #ifndef NDEBUG
3194     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3195     {
3196         int l;
3197         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3198         {
3199             int completed;
3200             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3201             if( ref < 0 )
3202                 continue;
3203             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3204             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3205             {
3206                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3207                 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3208                 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3209                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3210                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3211                 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3212                 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3213                 fprintf(stderr, "completed: %d \n", completed );
3214                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3215                 x264_mb_analyse_intra( h, a, COST_MAX );
3216                 h->mb.i_type = I_16x16;
3217                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3218                 x264_mb_analyse_intra_chroma( h, a );
3219             }
3220         }
3221     }
3222 #endif
3223 }
3224
3225 #include "slicetype.c"
3226