git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
   37 typedef struct
      /* Inter analysis state for one reference list; x264_mb_analysis_t holds
       * one instance for list 0 (l0) and one for list 1 (l1).
       * The i_cost* / i_rd* fields start out as COST_MAX (see x264_mb_analyse_init). */
   38 {
   39     /* 16x16 */
   40     int i_ref;
   41     int       i_rd16x16;
   42     x264_me_t me16x16;
   43     x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
   44
   45     /* 8x8 */
   46     int       i_cost8x8;
   47     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
   48     ALIGNED_4( int16_t mvc[32][5][2] );
   49     x264_me_t me8x8[4];
   50
   51     /* Sub 4x4 */
   52     int       i_cost4x4[4]; /* cost per 8x8 partition */
   53     x264_me_t me4x4[4][4];
   54
   55     /* Sub 8x4 */
   56     int       i_cost8x4[4]; /* cost per 8x8 partition */
   57     x264_me_t me8x4[4][2];
   58
   59     /* Sub 4x8 */
   60     int       i_cost4x8[4]; /* cost per 8x8 partition */
   61     x264_me_t me4x8[4][2];
   62
   63     /* 16x8 */
   64     int       i_cost16x8;
   65     x264_me_t me16x8[2];
   66
   67     /* 8x16 */
   68     int       i_cost8x16;
   69     x264_me_t me8x16[2];
   70
   71 } x264_mb_analysis_list_t;
  72
   73 typedef struct
      /* Per-macroblock analysis context: the lambda/QP in use, intra candidate
       * costs, and inter candidate costs for both reference lists. */
   74 {
   75     /* conduct the analysis using this lambda and QP */
   76     int i_lambda;
   77     int i_lambda2;
   78     int i_qp;
   79     uint16_t *p_cost_mv;
   80     uint16_t *p_cost_ref[2];
          /* RD level: 0 = none, 1 = RD mode decision, 2 = RD refinement, 3 = QPRD
           * (derived from subpel_refine in x264_mb_analyse_init). */
   81     int i_mbrd;
   82
   83
   84     /* I: Intra part */
   85     /* Take some shortcuts in intra search if intra is deemed unlikely */
   86     int b_fast_intra;
   87     int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
   88     int b_try_pskip;
   89
   90     /* Luma part */
   91     int i_satd_i16x16;
   92     int i_satd_i16x16_dir[7];
   93     int i_predict16x16;
   94
   95     int i_satd_i8x8;
   96     int i_cbp_i8x8_luma;
   97     int i_satd_i8x8_dir[12][4];
   98     int i_predict8x8[4];
   99
  100     int i_satd_i4x4;
  101     int i_predict4x4[16];
  102
          /* Cost assigned to PCM coding; COST_MAX unless RD is enabled and psy-rd is off. */
  103     int i_satd_pcm;
  104
  105     /* Chroma part */
  106     int i_satd_i8x8chroma;
  107     int i_satd_i8x8chroma_dir[7];
  108     int i_predict8x8chroma;
  109
  110     /* II: Inter part P/B frame */
  111     x264_mb_analysis_list_t l0;
  112     x264_mb_analysis_list_t l1;
  113
  114     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
  115     int i_cost16x16direct;
  116     int i_cost8x8bi;
  117     int i_cost8x8direct[4];
  118     int i_cost16x8bi;
  119     int i_cost8x16bi;
  120     int i_rd16x16bi;
  121     int i_rd16x16direct;
  122     int i_rd16x8bi;
  123     int i_rd8x16bi;
  124     int i_rd8x8bi;
  125
  126     int i_mb_partition16x8[2]; /* mb_partition_e */
  127     int i_mb_partition8x16[2];
  128     int i_mb_type16x8; /* mb_class_e */
  129     int i_mb_type8x16;
  130
  131     int b_direct_available;
  132
  133 } x264_mb_analysis_t;
 134
  135 /* lambda = pow(2,qp/6-2), rounded and floored at 1, indexed by QP 0..51.
       * The largest entry is 91, so arrays indexed by lambda in this file have 92 slots. */
  136 const uint8_t x264_lambda_tab[52] = {
  137    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
  138    1, 1, 1, 1,              /*  8-11 */
  139    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
  140    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
  141    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  142   16,18,20,23,25,29,32,36,  /* 36-43 */
  143   40,45,51,57,64,72,81,91   /* 44-51 */
  144 };
 145
  146 /* lambda2 = pow(lambda,2) * .9 * 256, indexed by QP 0..51.
       * The entries follow the unrounded lambda (e.g. QP 0 gives 14, not 230),
       * i.e. an 8-bit fixed-point value. */
  147 const int x264_lambda2_tab[52] = {
  148     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
  149     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
  150    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
  151   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
  152  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
  153 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
  154 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
  155 };
 156
  157 const uint8_t x264_exp2_lut[64] = {
          /* 64-entry fractional exp2 table.  The values are consistent with
           * round((2^(i/64) - 1) * 256); TODO confirm against the code that consumes it. */
  158       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
  159      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
  160     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
  161     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
  162 };
 163
  164 const float x264_log2_lut[128] = {
          /* 128-entry fractional log2 table.  The values are consistent with
           * log2(1 + i/128) to five decimals (e.g. entry 64 is log2(1.5) = 0.58496);
           * TODO confirm against the code that consumes it. */
  165     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
  166     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
  167     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
  168     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
  169     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
  170     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
  171     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
  172     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
  173     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
  174     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
  175     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
  176     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
  177     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
  178     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
  179     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
  180     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
  181 };
 182
  183 /* Avoid an int/float conversion: entry i holds the value 31-i already
       * stored as a float.  Presumably indexed by a leading-zero count (going by
       * the name); verify against the caller. */
  184 const float x264_log2_lz_lut[32] = {
  185     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  186 };
 187
  188 // should the intra and inter lambdas be different?
  189 // I'm just matching the behaviour of deadzone quant.
      // Layout: row 0 is the inter table, row 1 the intra table; each row is
      // indexed by QP 0..51 (luma or chroma QP, see x264_mb_analyse_init_qp).
  190 static const int x264_trellis_lambda2_tab[2][52] = {
  191     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
  192     {    46,      58,      73,      92,     117,     147,
  193         185,     233,     294,     370,     466,     587,
  194         740,     932,    1174,    1480,    1864,    2349,
  195        2959,    3728,    4697,    5918,    7457,    9395,
  196       11837,   14914,   18790,   23674,   29828,   37581,
  197       47349,   59656,   75163,   94699,  119313,  150326,
  198      189399,  238627,  300652,  378798,  477255,  601304,
  199      757596,  954511, 1202608, 1515192, 1909022, 2405217,
  200     3030384, 3818045, 4810435, 6060769 },
  201     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
  202     {    27,      34,      43,      54,      68,      86,
  203         108,     136,     172,     216,     273,     343,
  204         433,     545,     687,     865,    1090,    1374,
  205        1731,    2180,    2747,    3461,    4361,    5494,
  206        6922,    8721,   10988,   13844,   17442,   21976,
  207       27688,   34885,   43953,   55377,   69771,   87906,
  208      110755,  139543,  175813,  221511,  279087,  351627,
  209      443023,  558174,  703255,  886046, 1116348, 1406511,
  210     1772093, 2232697, 2813022, 3544186 }
  211 };
 212
  213 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
          /* Chroma lambda2 scale, indexed by (luma QP - chroma QP + 12); see
           * x264_mb_analyse_init_qp.  Entry 12 (equal QPs) is 256, and the value
           * doubles every 3 entries.  The last entry is 65535 rather than 65536
           * so that it fits in a uint16_t. */
  214        16,    20,    25,    32,    40,    50,
  215        64,    80,   101,   128,   161,   203,
  216       256,   322,   406,   512,   645,   812,
  217      1024,  1290,  1625,  2048,  2580,  3250,
  218      4096,  5160,  6501,  8192, 10321, 13003,
  219     16384, 20642, 26007, 32768, 41285, 52015,
  220     65535
  221 };
 222
  223 /* TODO: calculate CABAC costs */
      /* Macroblock-type cost in B-slices, indexed by mb type (e.g. I_16x16, I_8x8);
       * callers multiply the entry by lambda. */
  224 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
  225     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
  226 };
      /* Cost table for B 16x8/8x16 partitions; the index meaning is not visible
       * in this part of the file -- TODO confirm against the user of the table. */
  227 static const uint8_t i_mb_b16x8_cost_table[17] = {
  228     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
  229 };
      /* Sub-macroblock type cost in B-slices (13 entries); presumably indexed by
       * sub-partition type -- confirm against the user of the table. */
  230 static const uint8_t i_sub_mb_b_cost_table[13] = {
  231     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
  232 };
      /* Sub-macroblock type cost in P-slices (4 entries); presumably indexed by
       * sub-partition type -- confirm against the user of the table. */
  233 static const uint8_t i_sub_mb_p_cost_table[4] = {
  234     5, 3, 3, 1
  235 };
 236
  237 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
  238
      /* Reference-index cost, shared by all encoder instances:
       * [lambda][clip(number of active refs - 1, 0, 2)][ref index].
       * Filled in by x264_analyse_init_costs while holding cost_ref_mutex. */
  239 static uint16_t x264_cost_ref[92][3][33];
  240 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 241
 242 int x264_analyse_init_costs( x264_t *h, int qp )
 243 {
 244     int i, j;
 245     int lambda = x264_lambda_tab[qp];
 246     if( h->cost_mv[lambda] )
 247         return 0;
 248     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 249     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 250     h->cost_mv[lambda] += 2*4*2048;
 251     for( i = 0; i <= 2*4*2048; i++ )
 252     {
 253         h->cost_mv[lambda][-i] =
 254         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 255     }
 256     x264_pthread_mutex_lock( &cost_ref_mutex );
 257     for( i = 0; i < 3; i++ )
 258         for( j = 0; j < 33; j++ )
 259             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 260     x264_pthread_mutex_unlock( &cost_ref_mutex );
 261     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 262     {
 263         for( j=0; j<4; j++ )
 264         {
 265             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 266             h->cost_mv_fpel[lambda][j] += 2*2048;
 267             for( i = -2*2048; i < 2*2048; i++ )
 268                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 269         }
 270     }
 271     return 0;
 272 fail:
 273     return -1;
 274 }
 275
 276 void x264_analyse_free_costs( x264_t *h )
 277 {
 278     int i, j;
 279     for( i = 0; i < 92; i++ )
 280     {
 281         if( h->cost_mv[i] )
 282             x264_free( h->cost_mv[i] - 2*4*2048 );
 283         if( h->cost_mv_fpel[i][0] )
 284             for( j = 0; j < 4; j++ )
 285                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 286     }
 287 }
 288
 289 void x264_analyse_weight_frame( x264_t *h, int end )
 290 {
 291     int j;
 292     for( j=0; j<h->i_ref0; j++ )
 293     {
 294         if( h->sh.weight[j][0].weightfn )
 295         {
 296             x264_frame_t *frame = h->fref0[j];
 297             int width = frame->i_width[0] + 2*PADH;
 298             int i_padv = PADV << h->param.b_interlaced;
 299             int offset, height;
 300             uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 301             int k;
 302             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 303             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 304             h->fenc->i_lines_weighted += height;
 305             if( height )
 306             {
 307                 for( k = j; k < h->i_ref0; k++ )
 308                     if( h->sh.weight[k][0].weightfn )
 309                     {
 310                         uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 311                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 312                                                  src + offset, frame->i_stride[0],
 313                                                  width, height, &h->sh.weight[k][0] );
 314                     }
 315             }
 316             break;
 317         }
 318     }
 319 }
 320
 321 /* initialize an array of lambda*nbits for all possible mvs */
 322 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 323 {
 324     a->p_cost_mv = h->cost_mv[a->i_lambda];
 325     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 326     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 327 }
 328
 329 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 330 {
 331     /* conduct the analysis using this lamda and QP */
 332     a->i_qp = h->mb.i_qp = i_qp;
 333     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 334
 335     a->i_lambda = x264_lambda_tab[i_qp];
 336     a->i_lambda2 = x264_lambda2_tab[i_qp];
 337
 338     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 339     if( h->param.analyse.i_trellis )
 340     {
 341         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 342         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 343         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 344         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 345     }
 346     h->mb.i_psy_rd_lambda = a->i_lambda;
 347     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 348     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 349
 350 }
 351
  352 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  353 {
          /* Prepare the analysis context and the per-macroblock encoder state
           * before mode decision: RD level, QP-derived lambdas, motion vector
           * limits, COST_MAX sentinels, and the fast/forced intra flags. */

          /* B-slices count as one subpel_refine level lower when the RD level is derived. */
  354     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
  355
  356     /* mbrd == 1 -> RD mode decision */
  357     /* mbrd == 2 -> RD refinement */
  358     /* mbrd == 3 -> QPRD */
  359     a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
  360
  361     x264_mb_analyse_init_qp( h, a, i_qp );
  362
  363     h->mb.i_me_method = h->param.analyse.i_me_method;
  364     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
  365     if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
  366         h->mb.i_subpel_refine--;
  367     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
  368                         && h->mb.i_subpel_refine >= 5;
  369     h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
  370                           (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
  371
  372     h->mb.b_transform_8x8 = 0;
  373     h->mb.b_noise_reduction = 0;
  374
  375     /* I: Intra part */
          /* COST_MAX marks "not analysed yet". */
  376     a->i_satd_i16x16 =
  377     a->i_satd_i8x8   =
  378     a->i_satd_i4x4   =
  379     a->i_satd_i8x8chroma = COST_MAX;
  380
  381     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
  382     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
  383
  384     a->b_fast_intra = 0;
          /* i_skip_intra: 0 when lossless, 2 with RD, otherwise 1 only if
           * neither trellis nor noise reduction is enabled. */
  385     h->mb.i_skip_intra =
  386         h->mb.b_lossless ? 0 :
  387         a->i_mbrd ? 2 :
  388         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
  389
  390     /* II: Inter part P/B frame */
  391     if( h->sh.i_type != SLICE_TYPE_I )
  392     {
              /* Note: this `i` shadows the function-level `i` declared above. */
  393         int i, j;
  394         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
  395         // limit motion search to a slightly smaller range than the theoretical limit,
  396         // since the search may go a few iterations past its given range
  397         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
  398
  399         /* Calculate max allowed MV range */
  400 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
              /* Horizontal limits (qpel units): up to 24 pixels beyond the picture edge. */
  401         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
  402         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
  403         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
  404         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
  405         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
  406         {
  407             int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
  408             int max_mv = max_x - 4*16*h->mb.i_mb_x;
  409             /* If we're left of the refresh bar, don't reference right of it. */
  410             if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
  411                 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
  412         }
  413         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
  414         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
              /* Vertical limits are recomputed only for the first macroblock of each row. */
  415         if( h->mb.i_mb_x == 0 )
  416         {
  417             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
  418             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
  419             int thread_mvy_range = i_fmv_range;
  420
  421             if( h->i_thread_frames > 1 )
  422             {
  423                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
  424                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                      /* Visit list 1 (B-slices only) and then list 0. */
  425                 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
  426                 {
  427                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
  428                     int i_ref = i ? h->i_ref1 : h->i_ref0;
  429                     for( j=0; j<i_ref; j++ )
  430                     {
                              /* Presumably blocks until the reference has `thresh` lines
                               * completed -- confirm in x264_frame_cond_wait.  The downward
                               * range is then limited to the lines actually completed. */
  431                         x264_frame_cond_wait( fref[j]->orig, thresh );
  432                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
  433                     }
  434                 }
  435
                      /* In deterministic mode the range must not depend on thread timing. */
  436                 if( h->param.b_deterministic )
  437                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
  438                 if( h->mb.b_interlaced )
  439                     thread_mvy_range >>= 1;
  440
  441                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
  442             }
  443
  444             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
  445             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
                  /* NOTE(review): unlike the other three spel bounds this one does not use
                   * CLIP_FMV, so its upper clamp is i_fmv_range rather than i_fmv_range-1;
                   * confirm this is intended. */
  446             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
  447             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
  448             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
  449             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
  450             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
  451         }
  452 #undef CLIP_FMV
  453
              /* Reset all list-0 inter costs to "not analysed". */
  454         a->l0.me16x16.cost =
  455         a->l0.i_rd16x16    =
  456         a->l0.i_cost8x8    = COST_MAX;
  457
  458         for( i = 0; i < 4; i++ )
  459         {
  460             a->l0.i_cost4x4[i] =
  461             a->l0.i_cost8x4[i] =
  462             a->l0.i_cost4x8[i] = COST_MAX;
  463         }
  464
  465         a->l0.i_cost16x8   =
  466         a->l0.i_cost8x16   = COST_MAX;
              /* List-1, bidirectional and direct costs only exist for B-slices. */
  467         if( h->sh.i_type == SLICE_TYPE_B )
  468         {
  469             a->l1.me16x16.cost =
  470             a->l1.i_rd16x16    =
  471             a->l1.i_cost8x8    = COST_MAX;
  472
  473             for( i = 0; i < 4; i++ )
  474             {
  475                 a->l1.i_cost4x4[i] =
  476                 a->l1.i_cost8x4[i] =
  477                 a->l1.i_cost4x8[i] =
  478                 a->i_cost8x8direct[i] = COST_MAX;
  479             }
  480
  481             a->l1.i_cost16x8   =
  482             a->l1.i_cost8x16   =
  483             a->i_rd16x16bi     =
  484             a->i_rd16x16direct =
  485             a->i_rd8x8bi       =
  486             a->i_rd16x8bi      =
  487             a->i_rd8x16bi      =
  488             a->i_cost16x16bi   =
  489             a->i_cost16x16direct =
  490             a->i_cost8x8bi     =
  491             a->i_cost16x8bi    =
  492             a->i_cost8x16bi    = COST_MAX;
  493         }
  494
  495         /* Fast intra decision */
              /* Skipped for the first few macroblocks of the slice.  Fast intra is
               * enabled only when no neighbour (nor, in P-slices, the co-located
               * macroblock of the first reference) is intra, and fewer than a third
               * of the macroblocks coded so far were intra. */
  496         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
  497         {
  498             if(   IS_INTRA( h->mb.i_mb_type_left )
  499                || IS_INTRA( h->mb.i_mb_type_top )
  500                || IS_INTRA( h->mb.i_mb_type_topleft )
  501                || IS_INTRA( h->mb.i_mb_type_topright )
  502                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
  503                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
  504             { /* intra is likely */ }
  505             else
  506             {
  507                 a->b_fast_intra = 1;
  508             }
  509         }
  510         h->mb.b_skip_mc = 0;
              /* Periodic intra refresh: macroblocks inside the refresh column range
               * of a P-slice are forced intra, with the full intra search. */
  511         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
  512             h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
  513         {
  514             a->b_force_intra = 1;
  515             a->b_fast_intra = 0;
  516         }
  517         else
  518             a->b_force_intra = 0;
  519     }
  520 }
 521
  522 /* Prediction modes allowed for various combinations of neighbors. */
  523 /* Each row is terminated by a -1. */
  524 /* Rows, in order: no neighbors, left, top, top/left, top/left/topleft.
       * The predict_*_mode_available() helpers pick the row from the neighbour flags. */
  525 static const int8_t i16x16_mode_available[5][5] =
  526 {
  527     {I_PRED_16x16_DC_128, -1, -1, -1, -1},
  528     {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
  529     {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
  530     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
  531     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
  532 };
  533
      /* Same layout as above, for 8x8 chroma prediction. */
  534 static const int8_t i8x8chroma_mode_available[5][5] =
  535 {
  536     {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
  537     {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
  538     {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
  539     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
  540     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
  541 };
  542
      /* Same row layout, for 4x4 prediction; x264_mb_analyse_intra also uses it for 8x8 luma blocks. */
  543 static const int8_t i4x4_mode_available[5][10] =
  544 {
  545     {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  546     {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
  547     {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
  548     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
  549     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
  550 };
 551
 552 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
 553 {
 554     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 555     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 556 }
 557
 558 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 559 {
 560     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 561     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 562 }
 563
 564 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
 565 {
 566     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 567     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 568 }
 569
 570 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 571 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 572 {
 573     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 574
 575     if( do_both_dct || h->mb.b_transform_8x8 )
 576         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 577     if( do_both_dct || !h->mb.b_transform_8x8 )
 578         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 579 }
 580
 581 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
 582 static inline void x264_mb_cache_fenc_satd( x264_t *h )
 583 {
 584     ALIGNED_16( static uint8_t zero[16] ) = {0};
 585     uint8_t *fenc;
 586     int x, y, satd_sum = 0, sa8d_sum = 0;
 587     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 588         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 589     if( !h->mb.i_psy_rd )
 590         return;
 591     for( y = 0; y < 4; y++ )
 592         for( x = 0; x < 4; x++ )
 593         {
 594             fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
 595             h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
 596                                       - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
 597             satd_sum += h->mb.pic.fenc_satd[y][x];
 598         }
 599     for( y = 0; y < 2; y++ )
 600         for( x = 0; x < 2; x++ )
 601         {
 602             fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
 603             h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
 604                                       - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
 605             sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
 606         }
 607     h->mb.pic.fenc_satd_sum = satd_sum;
 608     h->mb.pic.fenc_sa8d_sum = sa8d_sum;
 609 }
 610
 611 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 612 {
 613     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 614
 615     if( a->i_satd_i8x8chroma < COST_MAX )
 616         return;
 617
 618     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 619
 620     /* 8x8 prediction selection for chroma */
 621     if( predict_mode[3] >= 0 && b_merged_satd )
 622     {
 623         int satdu[4], satdv[4];
 624         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 625         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 626         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 627         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 628         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 629         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 630
 631         for( ; *predict_mode >= 0; predict_mode++ )
 632         {
 633             int i_mode = *predict_mode;
 634             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 635
 636             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 637             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 638         }
 639     }
 640     else
 641     {
 642         for( ; *predict_mode >= 0; predict_mode++ )
 643         {
 644             int i_satd;
 645             int i_mode = *predict_mode;
 646
 647             /* we do the prediction */
 648             if( h->mb.b_lossless )
 649                 x264_predict_lossless_8x8_chroma( h, i_mode );
 650             else
 651             {
 652                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 653                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 654             }
 655
 656             /* we calculate the cost */
 657             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 658                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 659                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 660
 661             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 662             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 663         }
 664     }
 665
 666     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 667 }
 668
 669 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 670 {
 671     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 672     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 673     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 674
 675     int i, idx;
 676     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 677
 678     /*---------------- Try all mode and calculate their score ---------------*/
 679
 680     /* 16x16 prediction selection */
 681     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 682
 683     if( b_merged_satd && predict_mode[3] >= 0 )
 684     {
 685         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 686         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 687         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 688             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 689         for( i=0; i<4; i++ )
 690         {
 691             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 692             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 693         }
 694     }
 695     else
 696     {
 697         for( ; *predict_mode >= 0; predict_mode++ )
 698         {
 699             int i_satd;
 700             int i_mode = *predict_mode;
 701
 702             if( h->mb.b_lossless )
 703                 x264_predict_lossless_16x16( h, i_mode );
 704             else
 705                 h->predict_16x16[i_mode]( p_dst );
 706
 707             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 708                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 709             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 710             a->i_satd_i16x16_dir[i_mode] = i_satd;
 711         }
 712     }
 713
 714     if( h->sh.i_type == SLICE_TYPE_B )
 715         /* cavlc mb type prefix */
 716         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 717     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 718         return;
 719
 720     /* 8x8 prediction selection */
 721     if( flags & X264_ANALYSE_I8x8 )
 722     {
 723         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 724         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 725         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 726         int i_cost = 0;
 727         h->mb.i_cbp_luma = 0;
 728         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 729
 730         // FIXME some bias like in i4x4?
 731         if( h->sh.i_type == SLICE_TYPE_B )
 732             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 733
 734         for( idx = 0;; idx++ )
 735         {
 736             int x = idx&1;
 737             int y = idx>>1;
 738             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 739             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 740             int i_best = COST_MAX;
 741             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 742
 743             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 744             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 745
 746             if( b_merged_satd && predict_mode[8] >= 0 )
 747             {
 748                 int satd[9];
 749                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 750                 satd[i_pred_mode] -= 3 * a->i_lambda;
 751                 for( i=2; i>=0; i-- )
 752                 {
 753                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 754                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 755                 }
 756                 predict_mode += 3;
 757             }
 758
 759             for( ; *predict_mode >= 0; predict_mode++ )
 760             {
 761                 int i_satd;
 762                 int i_mode = *predict_mode;
 763
 764                 if( h->mb.b_lossless )
 765                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 766                 else
 767                     h->predict_8x8[i_mode]( p_dst_by, edge );
 768
 769                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
 770                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 771                     i_satd -= a->i_lambda * 3;
 772
 773                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 774                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 775             }
 776             i_cost += i_best;
 777
 778             if( idx == 3 || i_cost > i_satd_thresh )
 779                 break;
 780
 781             /* we need to encode this block now (for next ones) */
 782             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 783             x264_mb_encode_i8x8( h, idx, a->i_qp );
 784
 785             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 786         }
 787
 788         if( idx == 3 )
 789         {
 790             a->i_satd_i8x8 = i_cost;
 791             if( h->mb.i_skip_intra )
 792             {
 793                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 794                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 795                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 796                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 797                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 798                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 799                 if( h->mb.i_skip_intra == 2 )
 800                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 801             }
 802         }
 803         else
 804         {
 805             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 806             a->i_satd_i8x8 = COST_MAX;
 807             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 808         }
 809         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 810             return;
 811     }
 812
 813     /* 4x4 prediction selection */
 814     if( flags & X264_ANALYSE_I4x4 )
 815     {
 816         int i_cost;
 817         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 818         h->mb.i_cbp_luma = 0;
 819         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 820         if( a->i_mbrd )
 821             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 822
 823         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 824         if( h->sh.i_type == SLICE_TYPE_B )
 825             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 826
 827         for( idx = 0;; idx++ )
 828         {
 829             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 830             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 831             int i_best = COST_MAX;
 832             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 833
 834             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 835
 836             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 837                 /* emulate missing topright samples */
 838                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 839
 840             if( b_merged_satd && predict_mode[5] >= 0 )
 841             {
 842                 int satd[9];
 843                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 844                 satd[i_pred_mode] -= 3 * a->i_lambda;
 845                 for( i=2; i>=0; i-- )
 846                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 847                 predict_mode += 3;
 848             }
 849
 850             for( ; *predict_mode >= 0; predict_mode++ )
 851             {
 852                 int i_satd;
 853                 int i_mode = *predict_mode;
 854
 855                 if( h->mb.b_lossless )
 856                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 857                 else
 858                     h->predict_4x4[i_mode]( p_dst_by );
 859
 860                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 861                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 862                     i_satd -= a->i_lambda * 3;
 863
 864                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 865             }
 866             i_cost += i_best + 4 * a->i_lambda;
 867
 868             if( i_cost > i_satd_thresh || idx == 15 )
 869                 break;
 870
 871             /* we need to encode this block now (for next ones) */
 872             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 873             x264_mb_encode_i4x4( h, idx, a->i_qp );
 874
 875             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 876         }
 877         if( idx == 15 )
 878         {
 879             a->i_satd_i4x4 = i_cost;
 880             if( h->mb.i_skip_intra )
 881             {
 882                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 883                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 884                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 885                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 886                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 887                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 888                 if( h->mb.i_skip_intra == 2 )
 889                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 890             }
 891         }
 892         else
 893             a->i_satd_i4x4 = COST_MAX;
 894     }
 895 }
 896
 897 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 898 {
         /* For each intra MB type (16x16, 4x4, 8x8) whose stored cost is within
          * i_satd_thresh, overwrite that cost with the value returned by
          * x264_rd_cost_mb(); every other type is set to COST_MAX.
          * Side effects: h->mb.i_type is left at the last type that was evaluated,
          * and x264_analyse_update_cache() is called once per evaluated type. */
 899     if( a->i_satd_i16x16 <= i_satd_thresh )
 900     {
 901         h->mb.i_type = I_16x16;
 902         x264_analyse_update_cache( h, a );
 903         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 904     }
 905     else
 906         a->i_satd_i16x16 = COST_MAX;
 907
         /* 4x4 and 8x8 are additionally required to be below COST_MAX, so a size
          * already marked COST_MAX is never evaluated even with a huge threshold. */
 908     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 909     {
 910         h->mb.i_type = I_4x4;
 911         x264_analyse_update_cache( h, a );
 912         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 913     }
 914     else
 915         a->i_satd_i4x4 = COST_MAX;
 916
 917     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 918     {
 919         h->mb.i_type = I_8x8;
 920         x264_analyse_update_cache( h, a );
 921         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
             /* keep the luma CBP left by this evaluation; x264_intra_rd_refine()
              * reloads h->mb.i_cbp_luma from it for each 8x8 candidate */
 922         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 923     }
 924     else
 925         a->i_satd_i8x8 = COST_MAX;
 926 }
 927
 928 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 929 {
         /* Refine the intra prediction modes of the MB type already stored in
          * h->mb.i_type using RD cost instead of the earlier per-mode costs:
          *   1. I_16x16 only: the 16x16 luma mode (a->i_predict16x16),
          *   2. always: the chroma mode (a->i_predict8x8chroma, h->mb.i_cbp_chroma),
          *   3. I_4x4 or I_8x8: the mode of every luma block (a->i_predict4x4[] /
          *      a->i_predict8x8[]), updating the decoded pixels, the non-zero-count
          *      cache and the prediction-mode cache as it goes. */
 930     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 931
 932     int i, idx, x, y;
 933     int i_mode, i_thresh;
 934     uint64_t i_satd, i_best;
         /* NOTE(review): presumably stops the RD helpers from reusing buffers saved
          * by the earlier analysis pass -- confirm against the users of i_skip_intra */
 935     h->mb.i_skip_intra = 0;
 936
 937     if( h->mb.i_type == I_16x16 )
 938     {
 939         int old_pred_mode = a->i_predict16x16;
 940         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
             /* only retry modes whose earlier cost is within 9/8 of the chosen mode's */
 941         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
 942         i_best = a->i_satd_i16x16;
 943         for( ; *predict_mode >= 0; predict_mode++ )
 944         {
                 /* this declaration shadows the function-scope i_mode */
 945             int i_mode = *predict_mode;
 946             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 947                 continue;
 948             h->mb.i_intra16x16_pred_mode = i_mode;
 949             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 950             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 951         }
             /* NOTE(review): h->mb.i_intra16x16_pred_mode is left at the last mode
              * tried, not necessarily a->i_predict16x16; the caller presumably
              * re-syncs it -- confirm */
 952     }
 953
 954     /* RD selection for chroma prediction */
 955     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
         /* the list ends with a negative entry, so [1] >= 0 means at least two modes exist */
 956     if( predict_mode[1] >= 0 )
 957     {
 958         int8_t predict_mode_sorted[4];
 959         int i_max;
 960         i_thresh = a->i_satd_i8x8chroma * 5/4;
 961
             /* candidates: every available mode except the current one whose
              * earlier cost is below 5/4 of the current chroma cost */
 962         for( i_max = 0; *predict_mode >= 0; predict_mode++ )
 963         {
 964             i_mode = *predict_mode;
 965             if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
 966                 predict_mode_sorted[i_max++] = i_mode;
 967         }
 968
 969         if( i_max > 0 )
 970         {
 971             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
 972             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
 973             /* the previous thing encoded was x264_intra_rd(), so the pixels and
 974              * coefs for the current chroma mode are still around, so we only
 975              * have to recount the bits. */
 976             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
 977             for( i = 0; i < i_max; i++ )
 978             {
 979                 i_mode = predict_mode_sorted[i];
 980                 if( h->mb.b_lossless )
 981                     x264_predict_lossless_8x8_chroma( h, i_mode );
 982                 else
 983                 {
                         /* build the prediction in both chroma planes */
 984                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 985                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 986                 }
 987                 /* if we've already found a mode that needs no residual, then
 988                  * probably any mode with a residual will be worse.
 989                  * so avoid dct on the remaining modes to improve speed. */
 990                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                     /* on improvement remember both the mode and the chroma CBP it produced */
 991                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
 992             }
                 /* commit the winner; h->mb.i_cbp_chroma may hold the value of a losing candidate */
 993             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 994             h->mb.i_cbp_chroma = i_cbp_chroma_best;
 995         }
 996     }
 997
 998     if( h->mb.i_type == I_4x4 )
 999     {
1000         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1001         int i_nnz = 0;
1002         for( idx = 0; idx < 16; idx++ )
1003         {
1004             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1005             i_best = COST_MAX64;
1006
1007             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1008
1009             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1010                 /* emulate missing topright samples */
1011                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1012
                 /* try every available mode; each trial overwrites the block's pixels
                  * and nnz entry, so snapshot them whenever a new best is found */
1013             for( ; *predict_mode >= 0; predict_mode++ )
1014             {
1015                 i_mode = *predict_mode;
1016                 if( h->mb.b_lossless )
1017                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1018                 else
1019                     h->predict_4x4[i_mode]( p_dst_by );
1020                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1021
1022                 if( i_best > i_satd )
1023                 {
1024                     a->i_predict4x4[idx] = i_mode;
1025                     i_best = i_satd;
                         /* four rows of four bytes = the whole 4x4 block */
1026                     pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1027                     pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1028                     pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1029                     pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1030                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1031                 }
1032             }
1033
                 /* restore the winner's pixels and nnz before moving to the next block */
1034             M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1035             M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1036             M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1037             M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1038             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1039
1040             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1041         }
1042     }
1043     else if( h->mb.i_type == I_8x8 )
1044     {
1045         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1046         for( idx = 0; idx < 4; idx++ )
1047         {
1048             uint64_t pels_h = 0;
1049             uint8_t pels_v[7];
1050             uint16_t i_nnz[2] = {0}; //shut up gcc
1051             uint8_t *p_dst_by;
1052             int j;
1053             int cbp_luma_new = 0;
                 /* only retry modes whose earlier cost is within 11/8 of the chosen mode's */
1054             i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1055
1056             i_best = COST_MAX64;
1057             x = idx&1;
1058             y = idx>>1;
1059
1060             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1061             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1062             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1063
1064             for( ; *predict_mode >= 0; predict_mode++ )
1065             {
1066                 i_mode = *predict_mode;
1067                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1068                     continue;
1069
1070                 if( h->mb.b_lossless )
1071                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1072                 else
1073                     h->predict_8x8[i_mode]( p_dst_by, edge );
                     /* every candidate starts from the same running luma CBP */
1074                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1075                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1076
1077                 if( i_best > i_satd )
1078                 {
1079                     a->i_predict8x8[idx] = i_mode;
1080                     cbp_luma_new = h->mb.i_cbp_luma;
1081                     i_best = i_satd;
1082
                         /* Snapshot only the bottom row (8 bytes) and, for the left-hand
                          * blocks (even idx), the upper 7 pixels of the right-most column.
                          * NOTE(review): presumably these are the only pixels later
                          * blocks read as neighbours -- confirm. */
1083                     pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1084                     if( !(idx&1) )
1085                         for( j=0; j<7; j++ )
1086                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1087                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1088                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1089                 }
1090             }
                 /* NOTE(review): pels_v is only written when some mode improves on
                  * i_best; the restore below relies on at least one mode passing the
                  * threshold (the previously chosen mode should, for non-negative costs). */
1091             a->i_cbp_i8x8_luma = cbp_luma_new;
1092             M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1093             if( !(idx&1) )
1094                 for( j=0; j<7; j++ )
1095                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1096             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1097             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1098
1099             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1100         }
1101     }
1102 }
1103
1104 #define LOAD_FENC( m, src, xoff, yoff) /* init search context m for the block at (xoff,yoff); uses `a` and `h` from the expansion site */ \
1105     (m)->p_cost_mv = a->p_cost_mv; \
1106     (m)->i_stride[0] = h->mb.pic.i_stride[0]; /* strides are read later by LOAD_HPELS / LOAD_WPELS */ \
1107     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1108     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; /* plane 0 at the full offset */ \
1109     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; /* planes 1 and 2 at halved offsets */ \
1110     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1111
1112 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) /* point m at reference planes src[0..5]; needs (m)->i_stride[] set first; uses `h` from the expansion site */ \
1113     (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; /* p_fref_w defaults to plane 0; LOAD_WPELS may override it */ \
1114     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1115     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1116     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1117     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; /* planes 4 and 5: halved offsets, second stride */ \
1118     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1119     (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1120     (m)->weight = weight_none; /* no weighting unless LOAD_WPELS follows */ \
1121     (m)->i_ref = ref;
1122
1123 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) /* override p_fref_w/weight of m with the weighted plane `src` and the weights of reference `ref`; `list` is unused; uses `h` from the expansion site */ \
1124     (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1125     (m)->weight = h->sh.weight[ref]; /* index by the macro argument, not by a caller-local named i_ref */
1126
1127 #define REF_COST(list, ref) /* lookup in the analysis context's reference-cost table; uses `a` from the expansion site */ \
1128     (a->p_cost_ref[list][ref])
1129
1130 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1131 {
         /* 16x16 motion search for a P macroblock over every list-0 reference.
          * The cheapest result (search cost + reference cost) ends up in
          * a->l0.me16x16, and each reference's MV is stored in a->l0.mvc[ref][0] and
          * h->mb.mvr[0][ref][mb_xy] for later use as a predictor.
          * May return early with h->mb.i_type == P_SKIP; otherwise the type is P_L0,
          * or P_SKIP when the RD check at the end finds no coded residual. */
1132     x264_me_t m;
1133     int i_ref, i_mvc;
1134     ALIGNED_4( int16_t mvc[8][2] );
1135     int i_halfpel_thresh = INT_MAX;
         /* the threshold is only passed to the search when more than one reference exists */
1136     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1137
1138     /* 16x16 Search on all ref frame */
1139     m.i_pixel = PIXEL_16x16;
1140     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1141
1142     a->l0.me16x16.cost = INT_MAX;
1143     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1144     {
1145         const int i_ref_cost = REF_COST( 0, i_ref );
             /* the threshold is lowered by this ref's cost for the duration of the
              * search and raised again afterwards (see the += below) */
1146         i_halfpel_thresh -= i_ref_cost;
1147         m.i_ref_cost = i_ref_cost;
1148
1149         /* search with ref */
1150         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1151         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1152
1153         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1154         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1155
1156         if( h->mb.ref_blind_dupe == i_ref )
1157         {
                 /* start from the MV saved for ref 0 (a->l0.mvc[0][0]) and only refine;
                  * assumes ref 0 was searched earlier in this loop */
1158             CP32( m.mv, a->l0.mvc[0][0] );
1159             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1160         }
1161         else
1162             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1163
1164         /* early termination
1165          * SSD threshold would probably be better than SATD */
             /* conditions: first reference, skip allowed, residual cost (cost minus MV
              * cost) below 300*lambda, MV within 1 unit (summed over x and y) of the
              * skip MV, and the skip probe succeeds */
1166         if( i_ref == 0
1167             && a->b_try_pskip
1168             && m.cost-m.cost_mv < 300*a->i_lambda
1169             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1170               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1171             && x264_macroblock_probe_pskip( h ) )
1172         {
1173             h->mb.i_type = P_SKIP;
1174             x264_analyse_update_cache( h, a );
1175             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1176             return;
1177         }
1178
1179         m.cost += i_ref_cost;
1180         i_halfpel_thresh += i_ref_cost;
1181
1182         if( m.cost < a->l0.me16x16.cost )
1183             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1184
1185         /* save mv for predicting neighbors */
1186         CP32( a->l0.mvc[i_ref][0], m.mv );
1187         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1188     }
1189
1190     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1191     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1192
1193     h->mb.i_type = P_L0;
1194     if( a->i_mbrd )
1195     {
1196         x264_mb_cache_fenc_satd( h );
             /* If the best 16x16 result is exactly the skip MV on ref 0 (and intra is
              * not forced), compute its RD cost now; with both CBPs zero the
              * macroblock is marked P_SKIP. */
1197         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1198         {
1199             h->mb.i_partition = D_16x16;
1200             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1201             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1202             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1203                 h->mb.i_type = P_SKIP;
1204         }
1205     }
1206 }
1207
1208 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1209 {
         /* 8x8 P-partition search in which every 8x8 block may use its own list-0
          * reference. For each block, every candidate reference is searched and the
          * cheapest result (search cost + reference cost) is kept in
          * a->l0.me8x8[i]; the sum over the four blocks goes to a->l0.i_cost8x8.
          * Also sets h->mb.i_partition to D_8x8, all four sub-partitions to D_L0_8x8,
          * and leaves the winning ref/MV of each block in the macroblock cache. */
1210     x264_me_t m;
1211     int i_ref;
1212     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1213     /* No half-pel threshold is tracked by this search: the pointer handed to the searches is NULL. */
1214     int *p_halfpel_thresh = NULL;
1215     int i;
1216     int i_maxref = h->mb.pic.i_fref[0]-1;
1217
1218     h->mb.i_partition = D_8x8;
1219
         /* raise i_maxref to the ref cached at offset i from X264_SCAN8_0, ignoring the blind duplicate */
1220     #define CHECK_NEIGHBOUR(i)\
1221     {\
1222         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1223         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1224             i_maxref = ref;\
1225     }
1226
1227     /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1228      * than those used by the neighbors */
1229     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1230         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1231     {
1232         i_maxref = 0;
1233         CHECK_NEIGHBOUR(  -8 - 1 );
1234         CHECK_NEIGHBOUR(  -8 + 0 );
1235         CHECK_NEIGHBOUR(  -8 + 2 );
1236         CHECK_NEIGHBOUR(  -8 + 4 );
1237         CHECK_NEIGHBOUR(   0 - 1 );
1238         CHECK_NEIGHBOUR( 2*8 - 1 );
1239     }
1240
         /* candidate slot 0 of every ref that will be searched: the MV stored for this MB in h->mb.mvr */
1241     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1242         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1243
1244     for( i = 0; i < 4; i++ )
1245     {
1246         x264_me_t *l0m = &a->l0.me8x8[i];
1247         const int x8 = i%2;
1248         const int y8 = i/2;
1249
1250         m.i_pixel = PIXEL_8x8;
1251
1252         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1253         l0m->cost = INT_MAX;
             /* visits refs 0..i_maxref, then the blind duplicate if it lies beyond
              * i_maxref (i_ref is advanced at the bottom of the body) */
1254         for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1255         {
1256             const int i_ref_cost = REF_COST( 0, i_ref );
1257             m.i_ref_cost = i_ref_cost;
1258
1259             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1260             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1261
                 /* the candidate ref is written to the cache before the MV is predicted */
1262             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1263             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1264             if( h->mb.ref_blind_dupe == i_ref )
1265             {
                     /* start from the MV found for this block on ref 0 and only refine */
1266                 CP32( m.mv, a->l0.mvc[0][i+1] );
1267                 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1268             }
1269             else
1270                 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1271
1272             m.cost += i_ref_cost;
1273             /* remember this block's MV for this ref: candidate for later blocks and for the 16x8/8x16 searches */
1274             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1275
1276             if( m.cost < l0m->cost )
1277                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1278             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1279                 i_ref = h->mb.ref_blind_dupe;
1280             else
1281                 i_ref++;
1282         }
             /* commit the winning MV and ref of this block to the cache */
1283         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1284         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1285
1286         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1287            are effectively zero. */
1288         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1289             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1290     }
1291
1292     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1293                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1294     /* P_8x8 ref0 has no ref cost */
1295     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1296                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1297         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1298     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1299     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1300 }
1301
1302 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1303 {
         /* 8x8 P-partition search with a single reference shared by all four blocks:
          * the reference chosen by the 16x16 search (mapped to 0 when that was the
          * blind duplicate). Fills a->l0.me8x8[0..3] and a->l0.i_cost8x8, sets
          * h->mb.i_partition to D_8x8 and all sub-partitions to D_L0_8x8. */
1304     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1305      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1306      * don't bother analysing the dupes. */
1307     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
         /* without CABAC, ref 0 is given no reference cost here */
1308     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1309     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1310     int i_mvc;
         /* mvc aliases this ref's candidate row in the analysis context, so the
          * writes below also update a->l0.mvc[i_ref][] */
1311     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1312     int i;
1313
1314     /* XXX Needed for x264_mb_predict_mv */
1315     h->mb.i_partition = D_8x8;
1316
         /* the first candidate is the 16x16 MV */
1317     i_mvc = 1;
1318     CP32( mvc[0], a->l0.me16x16.mv );
1319
1320     for( i = 0; i < 4; i++ )
1321     {
1322         x264_me_t *m = &a->l0.me8x8[i];
1323         const int x8 = i%2;
1324         const int y8 = i/2;
1325
1326         m->i_pixel = PIXEL_8x8;
1327         m->i_ref_cost = i_ref_cost;
1328
1329         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1330         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1331         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1332
1333         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1334         x264_me_search( h, m, mvc, i_mvc );
1335
1336         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1337
             /* append this block's MV so the following blocks get it as a candidate */
1338         CP32( mvc[i_mvc], m->mv );
1339         i_mvc++;
1340
1341         /* mb type cost */
1342         m->cost += i_ref_cost;
1343         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1344             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1345     }
1346
1347     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1348                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1349     /* theoretically this should include 4*ref_cost,
1350      * but 3 seems a better approximation of cabac. */
1351     if( h->param.b_cabac )
1352         a->l0.i_cost8x8 -= i_ref_cost;
1353     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1354     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1355 }
1356
1357 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1358 {
         /* 16x8 P-partition search (i = 0 and 1 are the two halves, at row offsets 0
          * and 8). Each half is searched only on the reference(s) picked by the two
          * 8x8 blocks it covers, so the 8x8 results in a->l0.me8x8[] must exist.
          * Fills a->l0.me16x8[0..1] and a->l0.i_cost16x8, sets h->mb.i_partition to
          * D_16x8 and leaves each half's winning ref/MV in the macroblock cache. */
1359     x264_me_t m;
1360     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1361     ALIGNED_4( int16_t mvc[3][2] );
1362     int i, j;
1363
1364     /* XXX Needed for x264_mb_predict_mv */
1365     h->mb.i_partition = D_16x8;
1366
1367     for( i = 0; i < 2; i++ )
1368     {
1369         x264_me_t *l0m = &a->l0.me16x8[i];
             /* refs of 8x8 blocks 2*i and 2*i+1, ordered low to high; searched once if they are equal */
1370         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1371         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1372         const int ref8[2] = { minref, maxref };
1373         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1374
1375         m.i_pixel = PIXEL_16x8;
1376
1377         LOAD_FENC( &m, p_fenc, 0, 8*i );
1378         l0m->cost = INT_MAX;
1379         for( j = 0; j < i_ref8s; j++ )
1380         {
1381             const int i_ref = ref8[j];
1382             const int i_ref_cost = REF_COST( 0, i_ref );
1383             m.i_ref_cost = i_ref_cost;
1384
1385             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
                 /* candidates for this ref: slot 0 plus the slots of the two 8x8 blocks in this half */
1386             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1387             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1388             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1389
1390             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1391             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1392
1393             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1394             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1395             /* We can only take this shortcut if the first search was performed on ref0. */
1396             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1397             {
1398                 /* We can just leave the MV from the previous ref search. */
1399                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1400             }
1401             else
1402                 x264_me_search( h, &m, mvc, 3 );
1403
1404             m.cost += i_ref_cost;
1405
1406             if( m.cost < l0m->cost )
1407                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1408         }
             /* commit the winner; the cached ref may still hold the last candidate tried */
1409         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1410         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1411     }
1412
1413     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1414 }
1415
1416 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1417 {
         /* 8x16 P-partition search (i = 0 and 1 are the two halves, at column offsets
          * 0 and 8). Each half is searched only on the reference(s) picked by the
          * two 8x8 blocks it covers, so the 8x8 results in a->l0.me8x8[] must exist.
          * Fills a->l0.me8x16[0..1] and a->l0.i_cost8x16, sets h->mb.i_partition to
          * D_8x16 and leaves each half's winning ref/MV in the macroblock cache. */
1418     x264_me_t m;
1419     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1420     ALIGNED_4( int16_t mvc[3][2] );
1421     int i, j;
1422
1423     /* XXX Needed for x264_mb_predict_mv */
1424     h->mb.i_partition = D_8x16;
1425
1426     for( i = 0; i < 2; i++ )
1427     {
1428         x264_me_t *l0m = &a->l0.me8x16[i];
             /* refs of 8x8 blocks i and i+2, ordered low to high; searched once if they are equal */
1429         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1430         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1431         const int ref8[2] = { minref, maxref };
1432         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1433
1434         m.i_pixel = PIXEL_8x16;
1435
1436         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1437         l0m->cost = INT_MAX;
1438         for( j = 0; j < i_ref8s; j++ )
1439         {
1440             const int i_ref = ref8[j];
1441             const int i_ref_cost = REF_COST( 0, i_ref );
1442             m.i_ref_cost = i_ref_cost;
1443
                 /* candidates for this ref: slot 0 plus the slots of the two 8x8 blocks in this half */
1444             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1445             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1446             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1447
1448             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1449             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1450
1451             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1452             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1453             /* We can only take this shortcut if the first search was performed on ref0. */
1454             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1455             {
1456                 /* We can just leave the MV from the previous ref search. */
1457                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1458             }
1459             else
1460                 x264_me_search( h, &m, mvc, 3 );
1461
1462             m.cost += i_ref_cost;
1463
1464             if( m.cost < l0m->cost )
1465                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1466         }
             /* commit the winner; the cached ref may still hold the last candidate tried */
1467         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1468         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1469     }
1470
1471     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1472 }
1473
1474 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1475 {
1476     ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1477     uint8_t *pix2 = pix1+8;
1478     const int i_stride = h->mb.pic.i_stride[1];
1479     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1480     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1481     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1482     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1483     x264_weight_t *weight = h->sh.weight[i_ref];
1484
1485 #define CHROMA4x4MC( width, height, me, x, y ) \
1486     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1487     if( weight[1].weightfn ) \
1488         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1489     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1490     if( weight[2].weightfn ) \
1491         weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1492
1493
1494     if( pixel == PIXEL_4x4 )
1495     {
1496         x264_me_t *m = a->l0.me4x4[i8x8];
1497         CHROMA4x4MC( 2,2, m[0], 0,0 );
1498         CHROMA4x4MC( 2,2, m[1], 2,0 );
1499         CHROMA4x4MC( 2,2, m[2], 0,2 );
1500         CHROMA4x4MC( 2,2, m[3], 2,2 );
1501     }
1502     else if( pixel == PIXEL_8x4 )
1503     {
1504         x264_me_t *m = a->l0.me8x4[i8x8];
1505         CHROMA4x4MC( 4,2, m[0], 0,0 );
1506         CHROMA4x4MC( 4,2, m[1], 0,2 );
1507     }
1508     else
1509     {
1510         x264_me_t *m = a->l0.me4x8[i8x8];
1511         CHROMA4x4MC( 2,4, m[0], 0,0 );
1512         CHROMA4x4MC( 2,4, m[1], 2,0 );
1513     }
1514
1515     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1516          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1517 }
1518
1519 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1520 {
1521     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1522     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1523     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1524     int i4x4;
1525
1526     /* XXX Needed for x264_mb_predict_mv */
1527     h->mb.i_partition = D_8x8;
1528
1529     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1530     {
1531         const int idx = 4*i8x8 + i4x4;
1532         const int x4 = block_idx_x[idx];
1533         const int y4 = block_idx_y[idx];
1534         const int i_mvc = (i4x4 == 0);
1535
1536         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1537
1538         m->i_pixel = PIXEL_4x4;
1539
1540         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1541         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1542         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1543
1544         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1545         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1546
1547         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1548     }
1549     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1550                             a->l0.me4x4[i8x8][1].cost +
1551                             a->l0.me4x4[i8x8][2].cost +
1552                             a->l0.me4x4[i8x8][3].cost +
1553                             REF_COST( 0, i_ref ) +
1554                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1555     if( h->mb.b_chroma_me )
1556         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1557 }
1558
1559 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1560 {
1561     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1562     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1563     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1564     int i8x4;
1565
1566     /* XXX Needed for x264_mb_predict_mv */
1567     h->mb.i_partition = D_8x8;
1568
1569     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1570     {
1571         const int idx = 4*i8x8 + 2*i8x4;
1572         const int x4 = block_idx_x[idx];
1573         const int y4 = block_idx_y[idx];
1574         const int i_mvc = (i8x4 == 0);
1575
1576         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1577
1578         m->i_pixel = PIXEL_8x4;
1579
1580         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1581         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1582         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1583
1584         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1585         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1586
1587         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1588     }
1589     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1590                             REF_COST( 0, i_ref ) +
1591                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1592     if( h->mb.b_chroma_me )
1593         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1594 }
1595
1596 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1597 {
1598     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1599     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1600     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1601     int i4x8;
1602
1603     /* XXX Needed for x264_mb_predict_mv */
1604     h->mb.i_partition = D_8x8;
1605
1606     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1607     {
1608         const int idx = 4*i8x8 + i4x8;
1609         const int x4 = block_idx_x[idx];
1610         const int y4 = block_idx_y[idx];
1611         const int i_mvc = (i4x8 == 0);
1612
1613         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1614
1615         m->i_pixel = PIXEL_4x8;
1616
1617         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1618         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1619         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1620
1621         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1622         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1623
1624         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1625     }
1626     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1627                             REF_COST( 0, i_ref ) +
1628                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1629     if( h->mb.b_chroma_me )
1630         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1631 }
1632
1633 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1634 {
1635     /* Assumes that fdec still contains the results of
1636      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1637
1638     uint8_t **p_fenc = h->mb.pic.p_fenc;
1639     uint8_t **p_fdec = h->mb.pic.p_fdec;
1640     int i;
1641
1642     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1643     for( i = 0; i < 4; i++ )
1644     {
1645         const int x = (i&1)*8;
1646         const int y = (i>>1)*8;
1647         a->i_cost16x16direct +=
1648         a->i_cost8x8direct[i] =
1649             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1650
1651         /* mb type cost */
1652         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1653     }
1654 }
1655
1656 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1657 {
1658     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1659     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1660     uint8_t *src0, *src1;
1661     int stride0 = 16, stride1 = 16;
1662
1663     x264_me_t m;
1664     int i_ref, i_mvc;
1665     ALIGNED_4( int16_t mvc[9][2] );
1666     int i_halfpel_thresh = INT_MAX;
1667     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1668
1669     /* 16x16 Search on all ref frame */
1670     m.i_pixel = PIXEL_16x16;
1671     m.weight = weight_none;
1672
1673     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1674
1675     /* ME for List 0 */
1676     a->l0.me16x16.cost = INT_MAX;
1677     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1678     {
1679         const int i_ref_cost = REF_COST( 0, i_ref );
1680         m.i_ref_cost = i_ref_cost;
1681         /* search with ref */
1682         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1683         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1684         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1685         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1686
1687         /* add ref cost */
1688         m.cost += i_ref_cost;
1689
1690         if( m.cost < a->l0.me16x16.cost )
1691         {
1692             a->l0.i_ref = i_ref;
1693             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1694         }
1695
1696         /* save mv for predicting neighbors */
1697         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1698     }
1699     a->l0.me16x16.i_ref = a->l0.i_ref;
1700
1701     /* ME for list 1 */
1702     i_halfpel_thresh = INT_MAX;
1703     p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1704     a->l1.me16x16.cost = INT_MAX;
1705     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1706     {
1707         const int i_ref_cost = REF_COST( 0, i_ref );
1708         m.i_ref_cost = i_ref_cost;
1709         /* search with ref */
1710         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1711         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1712         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1713         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1714
1715         /* add ref cost */
1716         m.cost += i_ref_cost;
1717
1718         if( m.cost < a->l1.me16x16.cost )
1719         {
1720             a->l1.i_ref = i_ref;
1721             h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1722         }
1723
1724         /* save mv for predicting neighbors */
1725         CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1726     }
1727     a->l1.me16x16.i_ref = a->l1.i_ref;
1728
1729     /* get cost of BI mode */
1730     int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
1731     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1732     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1733     src0 = h->mc.get_ref( pix0, &stride0,
1734                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1735                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1736     src1 = h->mc.get_ref( pix1, &stride1,
1737                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1738                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1739
1740     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1741
1742     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1743                      + ref_costs
1744                      + a->l0.bi16x16.cost_mv
1745                      + a->l1.bi16x16.cost_mv;
1746
1747
1748     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1749     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1750     {
1751         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1752                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1753         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1754                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1755         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
1756                                 h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
1757                                 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1758         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1759                    + ref_costs + l0_mv_cost + l1_mv_cost;
1760         if( cost00 < a->i_cost16x16bi )
1761         {
1762             M32( a->l0.bi16x16.mv ) = 0;
1763             M32( a->l1.bi16x16.mv ) = 0;
1764             a->l0.bi16x16.cost_mv = l0_mv_cost;
1765             a->l1.bi16x16.cost_mv = l1_mv_cost;
1766             a->i_cost16x16bi = cost00;
1767         }
1768     }
1769
1770     /* mb type cost */
1771     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1772     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1773     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1774 }
1775
1776 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1777 {
1778     const int x = 2*(i%2);
1779     const int y = 2*(i/2);
1780
1781     switch( h->mb.i_sub_partition[i] )
1782     {
1783         case D_L0_8x8:
1784             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1785             break;
1786         case D_L0_8x4:
1787             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1788             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1789             break;
1790         case D_L0_4x8:
1791             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1792             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1793             break;
1794         case D_L0_4x4:
1795             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1796             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1797             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1798             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1799             break;
1800         default:
1801             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1802             break;
1803     }
1804 }
1805
1806 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1807 {
1808     const int x = 2*(idx&1);
1809     const int y = 2*(idx>>1);
1810     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1811     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1812     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1813     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1814 }
1815
/* Fill the ref/MV caches of the dx x dy area at (x,y) for both lists.
 * For each list that `part` uses (per x264_mb_partition_listX_table) the
 * list's i_ref and the given me's MV are cached; for an unused list the ref
 * is set to -1, the MV to 0 and, when b_mvd is set, the MVD to 0 as well.
 * Expects `h`, `a` and `b_mvd` in the enclosing scope.
 * Wrapped in do/while(0) so that it behaves as a single statement. */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    do \
    { \
        if( x264_mb_partition_listX_table[0][part] ) \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
            x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, (me0).mv ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
        } \
        if( x264_mb_partition_listX_table[1][part] ) \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
            x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, (me1).mv ); \
        } \
        else \
        { \
            x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
            x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
            if( b_mvd ) \
                x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
        } \
    } while( 0 )
1841
1842 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1843 {
1844     int x = (i%2)*2;
1845     int y = (i/2)*2;
1846     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1847     {
1848         x264_mb_load_mv_direct8x8( h, i );
1849         if( b_mvd )
1850         {
1851             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1852             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1853             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1854         }
1855     }
1856     else
1857     {
1858         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1859     }
1860 }
1861 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1862 {
1863     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1864 }
1865 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1866 {
1867     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1868 }
1869 #undef CACHE_MV_BI
1870
1871 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1872 {
1873     uint8_t **p_fref[2] =
1874         { h->mb.pic.p_fref[0][a->l0.i_ref],
1875           h->mb.pic.p_fref[1][a->l1.i_ref] };
1876     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1877     int i, l;
1878
1879     /* XXX Needed for x264_mb_predict_mv */
1880     h->mb.i_partition = D_8x8;
1881
1882     a->i_cost8x8bi = 0;
1883
1884     for( i = 0; i < 4; i++ )
1885     {
1886         const int x8 = i%2;
1887         const int y8 = i/2;
1888         int i_part_cost;
1889         int i_part_cost_bi = 0;
1890         int stride[2] = {8,8};
1891         uint8_t *src[2];
1892
1893         for( l = 0; l < 2; l++ )
1894         {
1895             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1896             const int i_ref_cost = REF_COST( l, lX->i_ref );
1897             x264_me_t *m = &lX->me8x8[i];
1898
1899             m->i_pixel = PIXEL_8x8;
1900             m->i_ref_cost = i_ref_cost;
1901
1902             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1903             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1904
1905             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1906             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1907             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1908             m->cost += i_ref_cost;
1909
1910             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1911
1912             /* BI mode */
1913             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1914                                     m->mv[0], m->mv[1], 8, 8, weight_none );
1915             i_part_cost_bi += m->cost_mv + i_ref_cost;
1916         }
1917         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1918         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1919                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1920         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1921         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1922
1923         i_part_cost = a->l0.me8x8[i].cost;
1924         h->mb.i_sub_partition[i] = D_L0_8x8;
1925         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1926         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1927         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1928         a->i_cost8x8bi += i_part_cost;
1929
1930         /* XXX Needed for x264_mb_predict_mv */
1931         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1932     }
1933
1934     /* mb type cost */
1935     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1936 }
1937
1938 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1939 {
1940     uint8_t **p_fref[2] =
1941         { h->mb.pic.p_fref[0][a->l0.i_ref],
1942           h->mb.pic.p_fref[1][a->l1.i_ref] };
1943     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1944     ALIGNED_4( int16_t mvc[2][2] );
1945     int i, l;
1946
1947     h->mb.i_partition = D_16x8;
1948     a->i_cost16x8bi = 0;
1949
1950     for( i = 0; i < 2; i++ )
1951     {
1952         int i_part_cost;
1953         int i_part_cost_bi = 0;
1954         int stride[2] = {16,16};
1955         uint8_t *src[2];
1956
1957         /* TODO: check only the list(s) that were used in b8x8? */
1958         for( l = 0; l < 2; l++ )
1959         {
1960             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1961             const int i_ref_cost = REF_COST( l, lX->i_ref );
1962             x264_me_t *m = &lX->me16x8[i];
1963
1964             m->i_pixel = PIXEL_16x8;
1965             m->i_ref_cost = i_ref_cost;
1966
1967             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1968             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1969
1970             CP32( mvc[0], lX->me8x8[2*i].mv );
1971             CP32( mvc[1], lX->me8x8[2*i+1].mv );
1972
1973             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1974             x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1975             x264_me_search( h, m, mvc, 2 );
1976             m->cost += i_ref_cost;
1977
1978             /* BI mode */
1979             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1980                                     m->mv[0], m->mv[1], 16, 8, weight_none );
1981             i_part_cost_bi += m->cost_mv + i_ref_cost;
1982         }
1983         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1984         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1985
1986         i_part_cost = a->l0.me16x8[i].cost;
1987         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1988         if( a->l1.me16x8[i].cost < i_part_cost )
1989         {
1990             i_part_cost = a->l1.me16x8[i].cost;
1991             a->i_mb_partition16x8[i] = D_L1_8x8;
1992         }
1993         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1994         {
1995             i_part_cost = i_part_cost_bi;
1996             a->i_mb_partition16x8[i] = D_BI_8x8;
1997         }
1998         a->i_cost16x8bi += i_part_cost;
1999
2000         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2001     }
2002
2003     /* mb type cost */
2004     a->i_mb_type16x8 = B_L0_L0
2005         + (a->i_mb_partition16x8[0]>>2) * 3
2006         + (a->i_mb_partition16x8[1]>>2);
2007     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2008 }
2009
2010 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2011 {
2012     uint8_t **p_fref[2] =
2013         { h->mb.pic.p_fref[0][a->l0.i_ref],
2014           h->mb.pic.p_fref[1][a->l1.i_ref] };
2015     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2016     ALIGNED_4( int16_t mvc[2][2] );
2017     int i, l;
2018
2019     h->mb.i_partition = D_8x16;
2020     a->i_cost8x16bi = 0;
2021
2022     for( i = 0; i < 2; i++ )
2023     {
2024         int i_part_cost;
2025         int i_part_cost_bi = 0;
2026         int stride[2] = {8,8};
2027         uint8_t *src[2];
2028
2029         for( l = 0; l < 2; l++ )
2030         {
2031             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2032             const int i_ref_cost = REF_COST( l, lX->i_ref );
2033             x264_me_t *m = &lX->me8x16[i];
2034
2035             m->i_pixel = PIXEL_8x16;
2036             m->i_ref_cost = i_ref_cost;
2037
2038             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
2039             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
2040
2041             CP32( mvc[0], lX->me8x8[i].mv );
2042             CP32( mvc[1], lX->me8x8[i+2].mv );
2043
2044             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2045             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2046             x264_me_search( h, m, mvc, 2 );
2047             m->cost += i_ref_cost;
2048
2049             /* BI mode */
2050             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
2051                                     m->mv[0], m->mv[1], 8, 16, weight_none );
2052             i_part_cost_bi += m->cost_mv + i_ref_cost;
2053         }
2054
2055         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2056         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2057
2058         i_part_cost = a->l0.me8x16[i].cost;
2059         a->i_mb_partition8x16[i] = D_L0_8x8;
2060         if( a->l1.me8x16[i].cost < i_part_cost )
2061         {
2062             i_part_cost = a->l1.me8x16[i].cost;
2063             a->i_mb_partition8x16[i] = D_L1_8x8;
2064         }
2065         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2066         {
2067             i_part_cost = i_part_cost_bi;
2068             a->i_mb_partition8x16[i] = D_BI_8x8;
2069         }
2070         a->i_cost8x16bi += i_part_cost;
2071
2072         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2073     }
2074
2075     /* mb type cost */
2076     a->i_mb_type8x16 = B_L0_L0
2077         + (a->i_mb_partition8x16[0]>>2) * 3
2078         + (a->i_mb_partition8x16[1]>>2);
2079     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2080 }
2081
2082 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2083 {
2084     int thresh = i_satd * 5/4;
2085
2086     h->mb.i_type = P_L0;
2087     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2088     {
2089         h->mb.i_partition = D_16x16;
2090         x264_analyse_update_cache( h, a );
2091         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2092     }
2093
2094     if( a->l0.i_cost16x8 <= thresh )
2095     {
2096         h->mb.i_partition = D_16x8;
2097         x264_analyse_update_cache( h, a );
2098         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2099     }
2100     else
2101         a->l0.i_cost16x8 = COST_MAX;
2102
2103     if( a->l0.i_cost8x16 <= thresh )
2104     {
2105         h->mb.i_partition = D_8x16;
2106         x264_analyse_update_cache( h, a );
2107         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2108     }
2109     else
2110         a->l0.i_cost8x16 = COST_MAX;
2111
2112     if( a->l0.i_cost8x8 <= thresh )
2113     {
2114         h->mb.i_type = P_8x8;
2115         h->mb.i_partition = D_8x8;
2116         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2117         {
2118             int i;
2119             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2120             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2121             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2122             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2123             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2124              * for future blocks are those left over from previous RDO calls. */
2125             for( i = 0; i < 4; i++ )
2126             {
2127                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2128                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2129                 int subtype, btype = D_L0_8x8;
2130                 uint64_t bcost = COST_MAX64;
2131                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2132                 {
2133                     uint64_t cost;
2134                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2135                         continue;
2136                     h->mb.i_sub_partition[i] = subtype;
2137                     x264_mb_cache_mv_p8x8( h, a, i );
2138                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2139                     COPY2_IF_LT( bcost, cost, btype, subtype );
2140                 }
2141                 if( h->mb.i_sub_partition[i] != btype )
2142                 {
2143                     h->mb.i_sub_partition[i] = btype;
2144                     x264_mb_cache_mv_p8x8( h, a, i );
2145                 }
2146             }
2147         }
2148         else
2149             x264_analyse_update_cache( h, a );
2150         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2151     }
2152     else
2153         a->l0.i_cost8x8 = COST_MAX;
2154 }
2155
2156 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2157 {
2158     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2159
2160     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2161     {
2162         h->mb.i_type = B_DIRECT;
2163         /* Assumes direct/skip MC is still in fdec */
2164         /* Requires b-rdo to be done before intra analysis */
2165         h->mb.b_skip_mc = 1;
2166         x264_analyse_update_cache( h, a );
2167         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2168         h->mb.b_skip_mc = 0;
2169     }
2170
2171     //FIXME not all the update_cache calls are needed
2172     h->mb.i_partition = D_16x16;
2173     /* L0 */
2174     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2175     {
2176         h->mb.i_type = B_L0_L0;
2177         x264_analyse_update_cache( h, a );
2178         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2179     }
2180
2181     /* L1 */
2182     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2183     {
2184         h->mb.i_type = B_L1_L1;
2185         x264_analyse_update_cache( h, a );
2186         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2187     }
2188
2189     /* BI */
2190     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2191     {
2192         h->mb.i_type = B_BI_BI;
2193         x264_analyse_update_cache( h, a );
2194         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2195     }
2196
2197     /* 8x8 */
2198     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2199     {
2200         h->mb.i_type = B_8x8;
2201         h->mb.i_partition = D_8x8;
2202         x264_analyse_update_cache( h, a );
2203         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2204         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2205     }
2206
2207     /* 16x8 */
2208     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2209     {
2210         h->mb.i_type = a->i_mb_type16x8;
2211         h->mb.i_partition = D_16x8;
2212         x264_analyse_update_cache( h, a );
2213         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2214     }
2215
2216     /* 8x16 */
2217     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2218     {
2219         h->mb.i_type = a->i_mb_type8x16;
2220         h->mb.i_partition = D_8x16;
2221         x264_analyse_update_cache( h, a );
2222         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2223     }
2224 }
2225
2226 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2227 {
2228     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2229     int i;
2230
2231     if( IS_INTRA(h->mb.i_type) )
2232         return;
2233
2234     switch( h->mb.i_partition )
2235     {
2236         case D_16x16:
2237             if( h->mb.i_type == B_BI_BI )
2238                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2239             break;
2240         case D_16x8:
2241             for( i=0; i<2; i++ )
2242                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2243                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2244             break;
2245         case D_8x16:
2246             for( i=0; i<2; i++ )
2247                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2248                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2249             break;
2250         case D_8x8:
2251             for( i=0; i<4; i++ )
2252                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2253                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2254             break;
2255     }
2256 }
2257
2258 static inline void x264_mb_analyse_transform( x264_t *h )
2259 {
2260     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2261     {
2262         int i_cost4, i_cost8;
2263         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2264         x264_mb_mc( h );
2265
2266         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2267                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2268         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2269                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2270
2271         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2272         h->mb.b_skip_mc = 1;
2273     }
2274 }
2275
2276 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2277 {
2278     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2279     {
2280         int i_rd8;
2281         x264_analyse_update_cache( h, a );
2282         h->mb.b_transform_8x8 ^= 1;
2283         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2284         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2285
2286         if( *i_rd >= i_rd8 )
2287         {
2288             if( *i_rd > 0 )
2289                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2290             *i_rd = i_rd8;
2291         }
2292         else
2293             h->mb.b_transform_8x8 ^= 1;
2294     }
2295 }
2296
2297 /* Rate-distortion optimal QP selection.
2298  * FIXME: More than half of the benefit of this function seems to be
2299  * in the way it improves the coding of chroma DC (by decimating or
2300  * finding a better way to code a single DC coefficient.)
2301  * There must be a more efficient way to get that portion of the benefit
2302  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2303  * trick. */
2304 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2305 {
2306     int bcost, cost, direction, failures, prevcost, origcost;
2307     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2308     int last_qp_tried = 0;
2309     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2310     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2311
2312     /* If CBP is already zero, don't raise the quantizer any higher. */
2313     for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2314     {
2315         /* Without psy-RD, require monotonicity when moving quant away from previous
2316          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2317          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2318          * allow 2 failures when moving quant towards previous quant.
2319          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2320         int threshold = (!!h->mb.i_psy_rd);
2321         /* Raise the threshold for failures if we're moving towards the last QP. */
2322         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2323             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2324             threshold++;
2325         h->mb.i_qp = orig_qp;
2326         failures = 0;
2327         prevcost = origcost;
2328
2329         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2330          * (up to a point) will too.  So, jump down to where the threshold will kick in
2331          * and check the QP there.  If the CBP is still empty, skip the main loop.
2332          * If it isn't empty, we would have ended up having to check this QP anyways,
2333          * so as long as we store it for later lookup, we lose nothing. */
2334         int already_checked_qp = -1;
2335         int already_checked_cost = COST_MAX;
2336         if( direction == -1 )
2337         {
2338             if( !origcbp )
2339             {
2340                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2341                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2342                 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2343                 if( !h->mb.cbp[h->mb.i_mb_xy] )
2344                 {
2345                     /* If our empty-CBP block is lower QP than the last QP,
2346                      * the last QP almost surely doesn't have a CBP either. */
2347                     if( h->mb.i_last_qp > h->mb.i_qp )
2348                         last_qp_tried = 1;
2349                     break;
2350                 }
2351                 already_checked_qp = h->mb.i_qp;
2352                 h->mb.i_qp = orig_qp;
2353             }
2354         }
2355
2356         h->mb.i_qp += direction;
2357         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2358         {
2359             if( h->mb.i_last_qp == h->mb.i_qp )
2360                 last_qp_tried = 1;
2361             if( h->mb.i_qp == already_checked_qp )
2362                 cost = already_checked_cost;
2363             else
2364             {
2365                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2366                 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2367                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2368             }
2369
2370             /* We can't assume that the costs are monotonic over QPs.
2371              * Tie case-as-failure seems to give better results. */
2372             if( cost < prevcost )
2373                 failures = 0;
2374             else
2375                 failures++;
2376             prevcost = cost;
2377
2378             if( failures > threshold )
2379                 break;
2380             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2381                 break;
2382             h->mb.i_qp += direction;
2383         }
2384     }
2385
2386     /* Always try the last block's QP. */
2387     if( !last_qp_tried )
2388     {
2389         h->mb.i_qp = h->mb.i_last_qp;
2390         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2391         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2392         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2393     }
2394
2395     h->mb.i_qp = bqp;
2396     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2397
2398     /* Check transform again; decision from before may no longer be optimal. */
2399     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2400         x264_mb_transform_8x8_allowed( h ) )
2401     {
2402         h->mb.b_transform_8x8 ^= 1;
2403         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2404         if( cost > bcost )
2405             h->mb.b_transform_8x8 ^= 1;
2406     }
2407 }
2408
2409 /*****************************************************************************
2410  * x264_macroblock_analyse:
2411  *****************************************************************************/
2412 void x264_macroblock_analyse( x264_t *h )
2413 {
2414     x264_mb_analysis_t analysis;
2415     int i_cost = COST_MAX;
2416     int i;
2417
2418     h->mb.i_qp = x264_ratecontrol_qp( h );
2419     if( h->param.rc.i_aq_mode )
2420     {
2421         x264_adaptive_quant( h );
2422         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2423          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2424         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2425             h->mb.i_qp = h->mb.i_last_qp;
2426     }
2427
2428     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2429
2430     /*--------------------------- Do the analysis ---------------------------*/
2431     if( h->sh.i_type == SLICE_TYPE_I )
2432     {
2433 intra_analysis:
2434         if( analysis.i_mbrd )
2435             x264_mb_cache_fenc_satd( h );
2436         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2437         if( analysis.i_mbrd )
2438             x264_intra_rd( h, &analysis, COST_MAX );
2439
2440         i_cost = analysis.i_satd_i16x16;
2441         h->mb.i_type = I_16x16;
2442         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2443         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2444         if( analysis.i_satd_pcm < i_cost )
2445             h->mb.i_type = I_PCM;
2446
2447         else if( analysis.i_mbrd >= 2 )
2448             x264_intra_rd_refine( h, &analysis );
2449     }
2450     else if( h->sh.i_type == SLICE_TYPE_P )
2451     {
2452         int b_skip = 0;
2453
2454         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2455
2456         analysis.b_try_pskip = 0;
2457         if( analysis.b_force_intra )
2458         {
2459             if( !h->param.analyse.b_psy )
2460             {
2461                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2462                 goto intra_analysis;
2463             }
2464         }
2465         else
2466         {
2467             /* Fast P_SKIP detection */
2468             if( h->param.analyse.b_fast_pskip )
2469             {
2470                 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2471                     // FIXME don't need to check this if the reference frame is done
2472                     {}
2473                 else if( h->param.analyse.i_subpel_refine >= 3 )
2474                     analysis.b_try_pskip = 1;
2475                 else if( h->mb.i_mb_type_left == P_SKIP ||
2476                          h->mb.i_mb_type_top == P_SKIP ||
2477                          h->mb.i_mb_type_topleft == P_SKIP ||
2478                          h->mb.i_mb_type_topright == P_SKIP )
2479                     b_skip = x264_macroblock_probe_pskip( h );
2480             }
2481         }
2482
2483         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2484
2485         if( b_skip )
2486         {
2487             h->mb.i_type = P_SKIP;
2488             h->mb.i_partition = D_16x16;
2489             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2490         }
2491         else
2492         {
2493             const unsigned int flags = h->param.analyse.inter;
2494             int i_type;
2495             int i_partition;
2496             int i_thresh16x8;
2497             int i_satd_inter, i_satd_intra;
2498
2499             x264_mb_analyse_load_costs( h, &analysis );
2500
2501             x264_mb_analyse_inter_p16x16( h, &analysis );
2502
2503             if( h->mb.i_type == P_SKIP )
2504                 return;
2505
2506             if( flags & X264_ANALYSE_PSUB16x16 )
2507             {
2508                 if( h->param.analyse.b_mixed_references )
2509                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2510                 else
2511                     x264_mb_analyse_inter_p8x8( h, &analysis );
2512             }
2513
2514             /* Select best inter mode */
2515             i_type = P_L0;
2516             i_partition = D_16x16;
2517             i_cost = analysis.l0.me16x16.cost;
2518
2519             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2520                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2521             {
2522                 i_type = P_8x8;
2523                 i_partition = D_8x8;
2524                 i_cost = analysis.l0.i_cost8x8;
2525
2526                 /* Do sub 8x8 */
2527                 if( flags & X264_ANALYSE_PSUB8x8 )
2528                 {
2529                     for( i = 0; i < 4; i++ )
2530                     {
2531                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2532                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2533                         {
2534                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2535                             h->mb.i_sub_partition[i] = D_L0_4x4;
2536
2537                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2538                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2539                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2540
2541                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2542                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2543                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2544
2545                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2546                         }
2547                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2548                     }
2549                     analysis.l0.i_cost8x8 = i_cost;
2550                 }
2551             }
2552
2553             /* Now do 16x8/8x16 */
2554             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2555             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2556                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2557             {
2558                 x264_mb_analyse_inter_p16x8( h, &analysis );
2559                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2560
2561                 x264_mb_analyse_inter_p8x16( h, &analysis );
2562                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2563             }
2564
2565             h->mb.i_partition = i_partition;
2566
2567             /* refine qpel */
2568             //FIXME mb_type costs?
2569             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2570             {
2571                 /* refine later */
2572             }
2573             else if( i_partition == D_16x16 )
2574             {
2575                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2576                 i_cost = analysis.l0.me16x16.cost;
2577             }
2578             else if( i_partition == D_16x8 )
2579             {
2580                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2581                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2582                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2583             }
2584             else if( i_partition == D_8x16 )
2585             {
2586                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2587                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2588                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2589             }
2590             else if( i_partition == D_8x8 )
2591             {
2592                 int i8x8;
2593                 i_cost = 0;
2594                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2595                 {
2596                     switch( h->mb.i_sub_partition[i8x8] )
2597                     {
2598                         case D_L0_8x8:
2599                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2600                             i_cost += analysis.l0.me8x8[i8x8].cost;
2601                             break;
2602                         case D_L0_8x4:
2603                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2604                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2605                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2606                                       analysis.l0.me8x4[i8x8][1].cost;
2607                             break;
2608                         case D_L0_4x8:
2609                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2610                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2611                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2612                                       analysis.l0.me4x8[i8x8][1].cost;
2613                             break;
2614
2615                         case D_L0_4x4:
2616                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2617                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2618                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2619                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2620                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2621                                       analysis.l0.me4x4[i8x8][1].cost +
2622                                       analysis.l0.me4x4[i8x8][2].cost +
2623                                       analysis.l0.me4x4[i8x8][3].cost;
2624                             break;
2625                         default:
2626                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2627                             break;
2628                     }
2629                 }
2630             }
2631
2632             if( h->mb.b_chroma_me )
2633             {
2634                 x264_mb_analyse_intra_chroma( h, &analysis );
2635                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2636                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2637                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2638                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2639             }
2640             else
2641                 x264_mb_analyse_intra( h, &analysis, i_cost );
2642
2643             i_satd_inter = i_cost;
2644             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2645                                       analysis.i_satd_i8x8,
2646                                       analysis.i_satd_i4x4 );
2647
2648             if( analysis.i_mbrd )
2649             {
2650                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2651                 i_type = P_L0;
2652                 i_partition = D_16x16;
2653                 i_cost = analysis.l0.i_rd16x16;
2654                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2655                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2656                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2657                 h->mb.i_type = i_type;
2658                 h->mb.i_partition = i_partition;
2659                 if( i_cost < COST_MAX )
2660                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2661                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2662             }
2663
2664             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2665             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2666             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2667             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2668
2669             h->mb.i_type = i_type;
2670
2671             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2672             {
2673                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2674                  * it was an inter block. */
2675                 x264_analyse_update_cache( h, &analysis );
2676                 x264_macroblock_encode( h );
2677                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2678                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2679                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2680                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2681                 goto intra_analysis;
2682             }
2683
2684             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2685             {
2686                 if( IS_INTRA( h->mb.i_type ) )
2687                 {
2688                     x264_intra_rd_refine( h, &analysis );
2689                 }
2690                 else if( i_partition == D_16x16 )
2691                 {
2692                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2693                     analysis.l0.me16x16.cost = i_cost;
2694                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2695                 }
2696                 else if( i_partition == D_16x8 )
2697                 {
2698                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2699                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2700                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2701                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2702                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2703                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2704                 }
2705                 else if( i_partition == D_8x16 )
2706                 {
2707                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2708                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2709                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2710                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2711                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2712                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2713                 }
2714                 else if( i_partition == D_8x8 )
2715                 {
2716                     int i8x8;
2717                     x264_analyse_update_cache( h, &analysis );
2718                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2719                     {
2720                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2721                         {
2722                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2723                         }
2724                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2725                         {
2726                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2727                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2728                         }
2729                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2730                         {
2731                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2732                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2733                         }
2734                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2735                         {
2736                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2737                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2738                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2739                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2740                         }
2741                     }
2742                 }
2743             }
2744         }
2745     }
2746     else if( h->sh.i_type == SLICE_TYPE_B )
2747     {
2748         int i_bskip_cost = COST_MAX;
2749         int b_skip = 0;
2750
2751         if( analysis.i_mbrd )
2752             x264_mb_cache_fenc_satd( h );
2753
2754         h->mb.i_type = B_SKIP;
2755         if( h->mb.b_direct_auto_write )
2756         {
2757             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2758             for( i = 0; i < 2; i++ )
2759             {
2760                 int b_changed = 1;
2761                 h->sh.b_direct_spatial_mv_pred ^= 1;
2762                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2763                 if( analysis.b_direct_available )
2764                 {
2765                     if( b_changed )
2766                     {
2767                         x264_mb_mc( h );
2768                         b_skip = x264_macroblock_probe_bskip( h );
2769                     }
2770                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2771                 }
2772                 else
2773                     b_skip = 0;
2774             }
2775         }
2776         else
2777             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2778
2779         if( analysis.b_direct_available )
2780         {
2781             if( !h->mb.b_direct_auto_write )
2782                 x264_mb_mc( h );
2783             if( analysis.i_mbrd )
2784             {
2785                 i_bskip_cost = ssd_mb( h );
2786                 /* 6 = minimum cavlc cost of a non-skipped MB */
2787                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2788             }
2789             else if( !h->mb.b_direct_auto_write )
2790             {
2791                 /* Conditioning the probe on neighboring block types
2792                  * doesn't seem to help speed or quality. */
2793                 b_skip = x264_macroblock_probe_bskip( h );
2794             }
2795         }
2796
2797         if( !b_skip )
2798         {
2799             const unsigned int flags = h->param.analyse.inter;
2800             int i_type;
2801             int i_partition;
2802             int i_satd_inter;
2803             h->mb.b_skip_mc = 0;
2804
2805             x264_mb_analyse_load_costs( h, &analysis );
2806
2807             /* select best inter mode */
2808             /* direct must be first */
2809             if( analysis.b_direct_available )
2810                 x264_mb_analyse_inter_direct( h, &analysis );
2811
2812             x264_mb_analyse_inter_b16x16( h, &analysis );
2813
2814             i_type = B_L0_L0;
2815             i_partition = D_16x16;
2816             i_cost = analysis.l0.me16x16.cost;
2817             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2818             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2819             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2820
2821             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2822             {
2823                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2824                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2825                     i_bskip_cost < analysis.i_rd16x16bi &&
2826                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2827                     i_bskip_cost < analysis.l1.i_rd16x16 )
2828                 {
2829                     h->mb.i_type = B_SKIP;
2830                     x264_analyse_update_cache( h, &analysis );
2831                     return;
2832                 }
2833             }
2834
2835             if( flags & X264_ANALYSE_BSUB16x16 )
2836             {
2837                 x264_mb_analyse_inter_b8x8( h, &analysis );
2838                 if( analysis.i_cost8x8bi < i_cost )
2839                 {
2840                     i_type = B_8x8;
2841                     i_partition = D_8x8;
2842                     i_cost = analysis.i_cost8x8bi;
2843
2844                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2845                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2846                     {
2847                         x264_mb_analyse_inter_b16x8( h, &analysis );
2848                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2849                                      i_type, analysis.i_mb_type16x8,
2850                                      i_partition, D_16x8 );
2851                     }
2852                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2853                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2854                     {
2855                         x264_mb_analyse_inter_b8x16( h, &analysis );
2856                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2857                                      i_type, analysis.i_mb_type8x16,
2858                                      i_partition, D_8x16 );
2859                     }
2860                 }
2861             }
2862
2863             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2864             {
2865                 /* refine later */
2866             }
2867             /* refine qpel */
2868             else if( i_partition == D_16x16 )
2869             {
2870                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2871                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2872                 if( i_type == B_L0_L0 )
2873                 {
2874                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2875                     i_cost = analysis.l0.me16x16.cost
2876                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2877                 }
2878                 else if( i_type == B_L1_L1 )
2879                 {
2880                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2881                     i_cost = analysis.l1.me16x16.cost
2882                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2883                 }
2884                 else if( i_type == B_BI_BI )
2885                 {
2886                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2887                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2888                 }
2889             }
2890             else if( i_partition == D_16x8 )
2891             {
2892                 for( i=0; i<2; i++ )
2893                 {
2894                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2895                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2896                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2897                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2898                 }
2899             }
2900             else if( i_partition == D_8x16 )
2901             {
2902                 for( i=0; i<2; i++ )
2903                 {
2904                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2905                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2906                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2907                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2908                 }
2909             }
2910             else if( i_partition == D_8x8 )
2911             {
2912                 for( i=0; i<4; i++ )
2913                 {
2914                     x264_me_t *m;
2915                     int i_part_cost_old;
2916                     int i_type_cost;
2917                     int i_part_type = h->mb.i_sub_partition[i];
2918                     int b_bidir = (i_part_type == D_BI_8x8);
2919
2920                     if( i_part_type == D_DIRECT_8x8 )
2921                         continue;
2922                     if( x264_mb_partition_listX_table[0][i_part_type] )
2923                     {
2924                         m = &analysis.l0.me8x8[i];
2925                         i_part_cost_old = m->cost;
2926                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2927                         m->cost -= i_type_cost;
2928                         x264_me_refine_qpel( h, m );
2929                         if( !b_bidir )
2930                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2931                     }
2932                     if( x264_mb_partition_listX_table[1][i_part_type] )
2933                     {
2934                         m = &analysis.l1.me8x8[i];
2935                         i_part_cost_old = m->cost;
2936                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2937                         m->cost -= i_type_cost;
2938                         x264_me_refine_qpel( h, m );
2939                         if( !b_bidir )
2940                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2941                     }
2942                     /* TODO: update mvp? */
2943                 }
2944             }
2945
2946             i_satd_inter = i_cost;
2947
2948             if( analysis.i_mbrd )
2949             {
2950                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2951                 i_type = B_SKIP;
2952                 i_cost = i_bskip_cost;
2953                 i_partition = D_16x16;
2954                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2955                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2956                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2957                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2958                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2959                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2960                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2961
2962                 h->mb.i_type = i_type;
2963                 h->mb.i_partition = i_partition;
2964             }
2965
2966             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2967
2968             if( analysis.i_mbrd )
2969             {
2970                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2971                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2972             }
2973
2974             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2975             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2976             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2977             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2978
2979             h->mb.i_type = i_type;
2980             h->mb.i_partition = i_partition;
2981
2982             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2983                 x264_intra_rd_refine( h, &analysis );
2984             if( h->mb.i_subpel_refine >= 5 )
2985                 x264_refine_bidir( h, &analysis );
2986
2987             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2988             {
2989                 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2990                 x264_analyse_update_cache( h, &analysis );
2991
2992                 if( i_partition == D_16x16 )
2993                 {
2994                     if( i_type == B_L0_L0 )
2995                     {
2996                         analysis.l0.me16x16.cost = i_cost;
2997                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2998                     }
2999                     else if( i_type == B_L1_L1 )
3000                     {
3001                         analysis.l1.me16x16.cost = i_cost;
3002                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3003                     }
3004                     else if( i_type == B_BI_BI )
3005                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3006                 }
3007                 else if( i_partition == D_16x8 )
3008                 {
3009                     for( i = 0; i < 2; i++ )
3010                     {
3011                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3012                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3013                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3014                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3015                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3016                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3017                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3018                     }
3019                 }
3020                 else if( i_partition == D_8x16 )
3021                 {
3022                     for( i = 0; i < 2; i++ )
3023                     {
3024                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3025                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3026                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3027                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3028                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3029                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3030                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3031                     }
3032                 }
3033                 else if( i_partition == D_8x8 )
3034                 {
3035                     for( i = 0; i < 4; i++ )
3036                     {
3037                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3038                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3039                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3040                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3041                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3042                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3043                     }
3044                 }
3045             }
3046         }
3047     }
3048
3049     x264_analyse_update_cache( h, &analysis );
3050
3051     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3052      * without realizing it.  Check for this and account for it if necessary. */
3053     if( analysis.i_mbrd >= 2 )
3054     {
3055         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3056         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3057         int list = check_mv_lists[h->mb.i_type] - 1;
3058         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3059             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3060             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3061                 h->mb.i_partition = D_16x16;
3062     }
3063
3064     if( !analysis.i_mbrd )
3065         x264_mb_analyse_transform( h );
3066
3067     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3068         x264_mb_analyse_qp_rd( h, &analysis );
3069
3070     h->mb.b_trellis = h->param.analyse.i_trellis;
3071     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3072     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3073         x264_psy_trellis_init( h, 0 );
3074     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3075         h->mb.i_skip_intra = 0;
3076 }
3077
3078 /*-------------------- Update MB from the analysis ----------------------*/
3079 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3080 {
3081     int i;
3082
3083     switch( h->mb.i_type )
3084     {
3085         case I_4x4:
3086             for( i = 0; i < 16; i++ )
3087                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3088
3089             x264_mb_analyse_intra_chroma( h, a );
3090             break;
3091         case I_8x8:
3092             for( i = 0; i < 4; i++ )
3093                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3094
3095             x264_mb_analyse_intra_chroma( h, a );
3096             break;
3097         case I_16x16:
3098             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3099             x264_mb_analyse_intra_chroma( h, a );
3100             break;
3101
3102         case I_PCM:
3103             break;
3104
3105         case P_L0:
3106             switch( h->mb.i_partition )
3107             {
3108                 case D_16x16:
3109                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3110                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3111                     break;
3112
3113                 case D_16x8:
3114                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3115                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3116                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3117                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3118                     break;
3119
3120                 case D_8x16:
3121                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3122                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3123                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3124                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3125                     break;
3126
3127                 default:
3128                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3129                     break;
3130             }
3131             break;
3132
3133         case P_8x8:
3134             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3135             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3136             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3137             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3138             for( i = 0; i < 4; i++ )
3139                 x264_mb_cache_mv_p8x8( h, a, i );
3140             break;
3141
3142         case P_SKIP:
3143         {
3144             h->mb.i_partition = D_16x16;
3145             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3146             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3147             break;
3148         }
3149
3150         case B_SKIP:
3151         case B_DIRECT:
3152             h->mb.i_partition = h->mb.cache.direct_partition;
3153             x264_mb_load_mv_direct8x8( h, 0 );
3154             x264_mb_load_mv_direct8x8( h, 1 );
3155             x264_mb_load_mv_direct8x8( h, 2 );
3156             x264_mb_load_mv_direct8x8( h, 3 );
3157             break;
3158
3159         case B_8x8:
3160             /* optimize: cache might not need to be rewritten */
3161             for( i = 0; i < 4; i++ )
3162                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3163             break;
3164
3165         default: /* the rest of the B types */
3166             switch( h->mb.i_partition )
3167             {
3168             case D_16x16:
3169                 switch( h->mb.i_type )
3170                 {
3171                 case B_L0_L0:
3172                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3173                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3174
3175                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3176                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3177                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3178                     break;
3179                 case B_L1_L1:
3180                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3181                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3182                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3183
3184                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3185                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3186                     break;
3187                 case B_BI_BI:
3188                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3189                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3190
3191                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3192                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3193                     break;
3194                 }
3195                 break;
3196             case D_16x8:
3197                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3198                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3199                 break;
3200             case D_8x16:
3201                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3202                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3203                 break;
3204             default:
3205                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3206                 break;
3207             }
3208     }
3209
3210 #ifndef NDEBUG
3211     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3212     {
3213         int l;
3214         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3215         {
3216             int completed;
3217             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3218             if( ref < 0 )
3219                 continue;
3220             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3221             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3222             {
3223                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3224                 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3225                 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3226                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3227                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3228                 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3229                 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3230                 fprintf(stderr, "completed: %d \n", completed );
3231                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3232                 x264_mb_analyse_intra( h, a, COST_MAX );
3233                 h->mb.i_type = I_16x16;
3234                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3235                 x264_mb_analyse_intra_chroma( h, a );
3236             }
3237         }
3238     }
3239 #endif
3240 }
3241
3242 #include "slicetype.c"
3243