git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
     /* Inter analysis state for a single reference list: the motion-estimation
      * results (x264_me_t) and the costs gathered for each partition size.
      * x264_mb_analysis_t embeds one instance for list 0 (l0) and one for
      * list 1 (l1).  The cost fields are reset to COST_MAX by
      * x264_mb_analyse_init() before a macroblock is analysed. */
  37 typedef struct
  38 {
  39     /* 16x16 */
     /* NOTE(review): presumably the reference index selected for the 16x16
      * partition -- confirm against the inter search code. */
  40     int i_ref;
     /* RD cost of the 16x16 partition. */
  41     int       i_rd16x16;
  42     x264_me_t me16x16;
  43     x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
  44
  45     /* 8x8 */
  46     int       i_cost8x8;
  47     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
  48     ALIGNED_4( int16_t mvc[32][5][2] );
  49     x264_me_t me8x8[4];
  50
  51     /* Sub 4x4 */
  52     int       i_cost4x4[4]; /* cost per 8x8 partition */
     /* Indexed [8x8 partition][4x4 sub-block]. */
  53     x264_me_t me4x4[4][4];
  54
  55     /* Sub 8x4 */
  56     int       i_cost8x4[4]; /* cost per 8x8 partition */
     /* Indexed [8x8 partition][8x4 sub-block]. */
  57     x264_me_t me8x4[4][2];
  58
  59     /* Sub 4x8 */
  60     int       i_cost4x8[4]; /* cost per 8x8 partition */
     /* Indexed [8x8 partition][4x8 sub-block]. */
  61     x264_me_t me4x8[4][2];
  62
  63     /* 16x8 */
  64     int       i_cost16x8;
     /* One entry per 16x8 half of the macroblock. */
  65     x264_me_t me16x8[2];
  66
  67     /* 8x16 */
  68     int       i_cost8x16;
     /* One entry per 8x16 half of the macroblock. */
  69     x264_me_t me8x16[2];
  70
  71 } x264_mb_analysis_list_t;
  72
     /* Working state for the mode decision of one macroblock: the lambdas and
      * cost tables in use, the intra costs/modes found so far, and the inter
      * results for both reference lists.  Set up by x264_mb_analyse_init(). */
  73 typedef struct
  74 {
  75     /* conduct the analysis using this lambda and QP */
     /* i_lambda comes from x264_lambda_tab[qp], i_lambda2 from
      * x264_lambda2_tab[qp] (see x264_mb_analyse_init_qp). */
  76     int i_lambda;
  77     int i_lambda2;
  78     int i_qp;
     /* MV cost table for i_lambda (h->cost_mv[i_lambda]). */
  79     uint16_t *p_cost_mv;
     /* Reference-index cost rows for list 0 and list 1 (from x264_cost_ref). */
  80     uint16_t *p_cost_ref[2];
     /* RD level: 0 = none, 1 = RD mode decision, 2 = RD refinement, 3 = QPRD. */
  81     int i_mbrd;
  82
  83
  84     /* I: Intra part */
  85     /* Take some shortcuts in intra search if intra is deemed unlikely */
  86     int b_fast_intra;
  87     int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
  88     int b_try_pskip;
  89
  90     /* Luma part */
     /* Best cost, per-mode costs and chosen mode for each intra luma size.
      * The best costs start at COST_MAX. */
  91     int i_satd_i16x16;
  92     int i_satd_i16x16_dir[7];
  93     int i_predict16x16;
  94
  95     int i_satd_i8x8;
  96     int i_cbp_i8x8_luma;
     /* Indexed [prediction mode][8x8 block]. */
  97     int i_satd_i8x8_dir[12][4];
  98     int i_predict8x8[4];
  99
 100     int i_satd_i4x4;
 101     int i_predict4x4[16];
 102
     /* Cost of coding the MB as PCM; COST_MAX when the PCM decision is disabled. */
 103     int i_satd_pcm;
 104
 105     /* Chroma part */
     /* Stays at COST_MAX until x264_mb_analyse_intra_chroma() has run. */
 106     int i_satd_i8x8chroma;
 107     int i_satd_i8x8chroma_dir[7];
 108     int i_predict8x8chroma;
 109
 110     /* II: Inter part P/B frame */
 111     x264_mb_analysis_list_t l0;
 112     x264_mb_analysis_list_t l1;
 113
 114     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
 115     int i_cost16x16direct;
 116     int i_cost8x8bi;
 117     int i_cost8x8direct[4];
 118     int i_cost16x8bi;
 119     int i_cost8x16bi;
     /* RD costs of the bidirectional / direct candidates. */
 120     int i_rd16x16bi;
 121     int i_rd16x16direct;
 122     int i_rd16x8bi;
 123     int i_rd8x16bi;
 124     int i_rd8x8bi;
 125
 126     int i_mb_partition16x8[2]; /* mb_partition_e */
 127     int i_mb_partition8x16[2];
 128     int i_mb_type16x8; /* mb_class_e */
 129     int i_mb_type8x16;
 130
 131     int b_direct_available;
 132
 133 } x264_mb_analysis_t;
 134
 135 /* lambda = pow(2,qp/6-2) */
 136 const uint8_t x264_lambda_tab[52] = {
 137    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 138    1, 1, 1, 1,              /*  8-11 */
 139    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 140    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 141    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 142   16,18,20,23,25,29,32,36,  /* 36-43 */
 143   40,45,51,57,64,72,81,91   /* 44-51 */
 144 };
 145
 146 /* lambda2 = pow(lambda,2) * .9 * 256 */
 147 const int x264_lambda2_tab[52] = {
 148     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
 149     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
 150    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
 151   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 152  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
 153 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
 154 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 155 };
 156
 157 const uint8_t x264_exp2_lut[64] = {
 158       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
 159      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
 160     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
 161     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 162 };
 163
 164 const float x264_log2_lut[128] = {
 165     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
 166     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
 167     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
 168     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
 169     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
 170     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
 171     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
 172     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
 173     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
 174     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
 175     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
 176     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
 177     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
 178     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
 179     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
 180     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 181 };
 182
 183 /* Avoid an int/float conversion. */
 184 const float x264_log2_lz_lut[32] = {
 185     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 186 };
 187
 188 // should the intra and inter lambdas be different?
 189 // I'm just matching the behaviour of deadzone quant.
 190 static const int x264_trellis_lambda2_tab[2][52] = {
 191     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 192     {    46,      58,      73,      92,     117,     147,
 193         185,     233,     294,     370,     466,     587,
 194         740,     932,    1174,    1480,    1864,    2349,
 195        2959,    3728,    4697,    5918,    7457,    9395,
 196       11837,   14914,   18790,   23674,   29828,   37581,
 197       47349,   59656,   75163,   94699,  119313,  150326,
 198      189399,  238627,  300652,  378798,  477255,  601304,
 199      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 200     3030384, 3818045, 4810435, 6060769 },
 201     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 202     {    27,      34,      43,      54,      68,      86,
 203         108,     136,     172,     216,     273,     343,
 204         433,     545,     687,     865,    1090,    1374,
 205        1731,    2180,    2747,    3461,    4361,    5494,
 206        6922,    8721,   10988,   13844,   17442,   21976,
 207       27688,   34885,   43953,   55377,   69771,   87906,
 208      110755,  139543,  175813,  221511,  279087,  351627,
 209      443023,  558174,  703255,  886046, 1116348, 1406511,
 210     1772093, 2232697, 2813022, 3544186 }
 211 };
 212
 213 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 214        16,    20,    25,    32,    40,    50,
 215        64,    80,   101,   128,   161,   203,
 216       256,   322,   406,   512,   645,   812,
 217      1024,  1290,  1625,  2048,  2580,  3250,
 218      4096,  5160,  6501,  8192, 10321, 13003,
 219     16384, 20642, 26007, 32768, 41285, 52015,
 220     65535
 221 };
 222
 223 /* TODO: calculate CABAC costs */
 224 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
 225     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 226 };
 227 static const uint8_t i_mb_b16x8_cost_table[17] = {
 228     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 229 };
 230 static const uint8_t i_sub_mb_b_cost_table[13] = {
 231     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 232 };
 233 static const uint8_t i_sub_mb_p_cost_table[4] = {
 234     5, 3, 3, 1
 235 };
 236
     /* Forward declaration; the definition is further down in this file. */
 237 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 238
     /* Reference-index costs, indexed [lambda][clip(num_ref_idx_active-1,0,2)][ref].
      * Shared by all encoder instances: x264_analyse_init_costs() fills it while
      * holding cost_ref_mutex and x264_mb_analyse_load_costs() reads rows from it. */
 239 static uint16_t x264_cost_ref[92][3][33];
 240 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 241
 242 int x264_analyse_init_costs( x264_t *h, int qp )
 243 {
 244     int i, j;
 245     int lambda = x264_lambda_tab[qp];
 246     if( h->cost_mv[lambda] )
 247         return 0;
 248     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 249     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 250     h->cost_mv[lambda] += 2*4*2048;
 251     for( i = 0; i <= 2*4*2048; i++ )
 252     {
 253         h->cost_mv[lambda][-i] =
 254         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 255     }
 256     x264_pthread_mutex_lock( &cost_ref_mutex );
 257     for( i = 0; i < 3; i++ )
 258         for( j = 0; j < 33; j++ )
 259             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 260     x264_pthread_mutex_unlock( &cost_ref_mutex );
 261     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 262     {
 263         for( j=0; j<4; j++ )
 264         {
 265             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 266             h->cost_mv_fpel[lambda][j] += 2*2048;
 267             for( i = -2*2048; i < 2*2048; i++ )
 268                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 269         }
 270     }
 271     return 0;
 272 fail:
 273     return -1;
 274 }
 275
 276 void x264_analyse_free_costs( x264_t *h )
 277 {
 278     int i, j;
 279     for( i = 0; i < 92; i++ )
 280     {
 281         if( h->cost_mv[i] )
 282             x264_free( h->cost_mv[i] - 2*4*2048 );
 283         if( h->cost_mv_fpel[i][0] )
 284             for( j = 0; j < 4; j++ )
 285                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 286     }
 287 }
 288
 289 void x264_analyse_weight_frame( x264_t *h, int end )
 290 {
 291     int j;
 292     for( j=0; j<h->i_ref0; j++ )
 293     {
 294         if( h->sh.weight[j][0].weightfn )
 295         {
 296             x264_frame_t *frame = h->fref0[j];
 297             int width = frame->i_width[0] + 2*PADH;
 298             int i_padv = PADV << h->param.b_interlaced;
 299             int offset, height;
 300             uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 301             int k;
 302             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 303             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 304             h->fenc->i_lines_weighted += height;
 305             if( height )
 306             {
 307                 for( k = j; k < h->i_ref0; k++ )
 308                     if( h->sh.weight[k][0].weightfn )
 309                     {
 310                         uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 311                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 312                                                  src + offset, frame->i_stride[0],
 313                                                  width, height, &h->sh.weight[k][0] );
 314                     }
 315             }
 316             break;
 317         }
 318     }
 319 }
 320
 321 /* initialize an array of lambda*nbits for all possible mvs */
 322 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 323 {
 324     a->p_cost_mv = h->cost_mv[a->i_lambda];
 325     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 326     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 327 }
 328
 329 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 330 {
 331     /* conduct the analysis using this lamda and QP */
 332     a->i_qp = h->mb.i_qp = i_qp;
 333     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 334
 335     a->i_lambda = x264_lambda_tab[i_qp];
 336     a->i_lambda2 = x264_lambda2_tab[i_qp];
 337
 338     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 339     if( h->param.analyse.i_trellis )
 340     {
 341         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 342         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 343         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 344         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 345     }
 346     h->mb.i_psy_rd_lambda = a->i_lambda;
 347     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 348     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 349
 350 }
 351
     /* Prepare the analysis context `a` and the related h->mb fields for the
      * current macroblock at QP i_qp: RD level, lambdas (through
      * x264_mb_analyse_init_qp), motion-estimation settings and intra cost
      * defaults.  For non-I slices it also sets the MV range limits, the inter
      * cost defaults and the fast-intra / forced-intra flags. */
 352 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 353 {
     /* Effective subme level: one lower in B slices. */
 354     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
 355
 356     /* mbrd == 1 -> RD mode decision */
 357     /* mbrd == 2 -> RD refinement */
 358     /* mbrd == 3 -> QPRD */
 359     a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
 360
 361     x264_mb_analyse_init_qp( h, a, i_qp );
 362
 363     h->mb.i_me_method = h->param.analyse.i_me_method;
 364     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
     /* In B slices, subme 6 and 8 are run as 5 and 7 respectively. */
 365     if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
 366         h->mb.i_subpel_refine--;
     /* Chroma ME is only used in P slices, and only at subme >= 5. */
 367     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 368                         && h->mb.i_subpel_refine >= 5;
     /* DCT decimation: always on in B slices, never in I slices, otherwise
      * it follows the user parameter. */
 369     h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
 370                           (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
 371
 372     h->mb.b_transform_8x8 = 0;
 373     h->mb.b_noise_reduction = 0;
 374
 375     /* I: Intra part */
     /* COST_MAX marks a cost as not computed yet
      * (x264_mb_analyse_intra_chroma relies on this to run only once). */
 376     a->i_satd_i16x16 =
 377     a->i_satd_i8x8   =
 378     a->i_satd_i4x4   =
 379     a->i_satd_i8x8chroma = COST_MAX;
 380
 381     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
     /* lambda2 carries 8 fractional bits, hence the rounding shift by 8. */
 382     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
 383
 384     a->b_fast_intra = 0;
     /* i_skip_intra: 0 when lossless, 2 with any RD level, otherwise 1 only
      * when neither trellis nor noise reduction is enabled. */
 385     h->mb.i_skip_intra =
 386         h->mb.b_lossless ? 0 :
 387         a->i_mbrd ? 2 :
 388         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
 389
 390     /* II: Inter part P/B frame */
 391     if( h->sh.i_type != SLICE_TYPE_I )
 392     {
     /* NOTE: this `i` shadows the function-scope `i` declared above. */
 393         int i, j;
     /* MV range scaled by 4, the same units as the limits computed below. */
 394         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
 395         // limit motion search to a slightly smaller range than the theoretical limit,
 396         // since the search may go a few iterations past its given range
 397         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 398
 399         /* Calculate max allowed MV range */
 400 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
     /* Horizontal limits relative to this macroblock: MVs may reach up to
      * 24 pixels beyond the left and right frame edges. */
 401         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
 402         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
 403         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
 404         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
 405         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
 406         {
 407             int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
 408             int max_mv = max_x - 4*16*h->mb.i_mb_x;
 409             /* If we're left of the refresh bar, don't reference right of it. */
 410             if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
 411                 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
 412         }
     /* Full-pel limits: the sub-pel limits divided by 4, shrunk by the border. */
 413         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
 414         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
     /* The vertical limits are only recomputed at the first macroblock of
      * each row. */
 415         if( h->mb.i_mb_x == 0 )
 416         {
 417             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
 418             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
 419             int thread_mvy_range = i_fmv_range;
 420
     /* With frame threading the references may still be being encoded, so
      * the downward MV range is limited to the rows already available. */
 421             if( h->i_thread_frames > 1 )
 422             {
 423                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
 424                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
     /* Visit list 1 (B slices only) and then list 0. */
 425                 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
 426                 {
 427                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
 428                     int i_ref = i ? h->i_ref1 : h->i_ref0;
 429                     for( j=0; j<i_ref; j++ )
 430                     {
     /* NOTE(review): presumably blocks until the reference has completed at
      * least `thresh` lines -- confirm in x264_frame_cond_wait(). */
 431                         x264_frame_cond_wait( fref[j]->orig, thresh );
 432                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
 433                     }
 434                 }
 435
     /* Deterministic mode ignores how far the references actually got and
      * always uses the configured thread MV range. */
 436                 if( h->param.b_deterministic )
 437                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
 438                 if( h->mb.b_interlaced )
 439                     thread_mvy_range >>= 1;
 440
 441                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
 442             }
 443
 444             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
 445             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
     /* NOTE(review): unlike the other three limits this one is clipped with
      * an upper bound of i_fmv_range rather than CLIP_FMV's i_fmv_range-1.
      * The upper bound only matters for a positive mv_min[1] -- confirm
      * whether the difference is intentional. */
 446             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
 447             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
 448             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
 449             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
 450             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
 451         }
 452 #undef CLIP_FMV
 453
     /* Inter costs start at COST_MAX. */
 454         a->l0.me16x16.cost =
 455         a->l0.i_rd16x16    =
 456         a->l0.i_cost8x8    = COST_MAX;
 457
 458         for( i = 0; i < 4; i++ )
 459         {
 460             a->l0.i_cost4x4[i] =
 461             a->l0.i_cost8x4[i] =
 462             a->l0.i_cost4x8[i] = COST_MAX;
 463         }
 464
 465         a->l0.i_cost16x8   =
 466         a->l0.i_cost8x16   = COST_MAX;
     /* The list-1, bidirectional and direct costs only exist in B slices. */
 467         if( h->sh.i_type == SLICE_TYPE_B )
 468         {
 469             a->l1.me16x16.cost =
 470             a->l1.i_rd16x16    =
 471             a->l1.i_cost8x8    = COST_MAX;
 472
 473             for( i = 0; i < 4; i++ )
 474             {
 475                 a->l1.i_cost4x4[i] =
 476                 a->l1.i_cost8x4[i] =
 477                 a->l1.i_cost4x8[i] =
 478                 a->i_cost8x8direct[i] = COST_MAX;
 479             }
 480
 481             a->l1.i_cost16x8   =
 482             a->l1.i_cost8x16   =
 483             a->i_rd16x16bi     =
 484             a->i_rd16x16direct =
 485             a->i_rd8x8bi       =
 486             a->i_rd16x8bi      =
 487             a->i_rd8x16bi      =
 488             a->i_cost16x16bi   =
 489             a->i_cost16x16direct =
 490             a->i_cost8x8bi     =
 491             a->i_cost16x8bi    =
 492             a->i_cost8x16bi    = COST_MAX;
 493         }
 494
 495         /* Fast intra decision */
     /* Never enabled for the first five macroblocks of a slice.  After that
      * it is enabled only when no neighbour (nor, in P slices, the co-located
      * MB of the first list-0 reference) is intra and fewer than a third of
      * the macroblocks so far were counted as intra in h->stat.frame. */
 496         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 497         {
 498             if(   IS_INTRA( h->mb.i_mb_type_left )
 499                || IS_INTRA( h->mb.i_mb_type_top )
 500                || IS_INTRA( h->mb.i_mb_type_topleft )
 501                || IS_INTRA( h->mb.i_mb_type_topright )
 502                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 503                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
 504             { /* intra is likely */ }
 505             else
 506             {
 507                 a->b_fast_intra = 1;
 508             }
 509         }
 510         h->mb.b_skip_mc = 0;
     /* Periodic intra refresh: macroblocks inside the current refresh
      * columns of a P slice must be intra, so the fast-intra shortcut is
      * turned off for them.  b_force_intra is only written for non-I slices. */
 511         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
 512             h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
 513         {
 514             a->b_force_intra = 1;
 515             a->b_fast_intra = 0;
 516         }
 517         else
 518             a->b_force_intra = 0;
 519     }
 520 }
 521
 522 /* Prediction modes allowed for various combinations of neighbors. */
 523 /* Terminated by a -1. */
 524 /* In order, no neighbors, left, top, top/left, top/left/topleft */
 525 static const int8_t i16x16_mode_available[5][5] =
 526 {
 527     {I_PRED_16x16_DC_128, -1, -1, -1, -1},
 528     {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
 529     {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
 530     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
 531     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
 532 };
 533
 534 static const int8_t i8x8chroma_mode_available[5][5] =
 535 {
 536     {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
 537     {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
 538     {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
 539     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
 540     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
 541 };
 542
 543 static const int8_t i4x4_mode_available[5][10] =
 544 {
 545     {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
 546     {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
 547     {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
 548     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
 549     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
 550 };
 551
 552 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
 553 {
 554     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 555     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 556 }
 557
 558 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 559 {
 560     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 561     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 562 }
 563
 564 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
 565 {
 566     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 567     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 568 }
 569
 570 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 571 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 572 {
 573     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 574
 575     if( do_both_dct || h->mb.b_transform_8x8 )
 576         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 577     if( do_both_dct || !h->mb.b_transform_8x8 )
 578         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 579 }
 580
 581 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
 582 static inline void x264_mb_cache_fenc_satd( x264_t *h )
 583 {
 584     ALIGNED_16( static uint8_t zero[16] ) = {0};
 585     uint8_t *fenc;
 586     int x, y, satd_sum = 0, sa8d_sum = 0;
 587     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 588         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 589     if( !h->mb.i_psy_rd )
 590         return;
 591     for( y = 0; y < 4; y++ )
 592         for( x = 0; x < 4; x++ )
 593         {
 594             fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
 595             h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
 596                                       - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
 597             satd_sum += h->mb.pic.fenc_satd[y][x];
 598         }
 599     for( y = 0; y < 2; y++ )
 600         for( x = 0; x < 2; x++ )
 601         {
 602             fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
 603             h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
 604                                       - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
 605             sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
 606         }
 607     h->mb.pic.fenc_satd_sum = satd_sum;
 608     h->mb.pic.fenc_sa8d_sum = sa8d_sum;
 609 }
 610
 611 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 612 {
 613     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 614
 615     if( a->i_satd_i8x8chroma < COST_MAX )
 616         return;
 617
 618     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 619
 620     /* 8x8 prediction selection for chroma */
 621     if( predict_mode[3] >= 0 && b_merged_satd )
 622     {
 623         int satdu[4], satdv[4];
 624         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 625         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 626         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 627         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 628         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 629         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 630
 631         for( ; *predict_mode >= 0; predict_mode++ )
 632         {
 633             int i_mode = *predict_mode;
 634             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 635
 636             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 637             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 638         }
 639     }
 640     else
 641     {
 642         for( ; *predict_mode >= 0; predict_mode++ )
 643         {
 644             int i_satd;
 645             int i_mode = *predict_mode;
 646
 647             /* we do the prediction */
 648             if( h->mb.b_lossless )
 649                 x264_predict_lossless_8x8_chroma( h, i_mode );
 650             else
 651             {
 652                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 653                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 654             }
 655
 656             /* we calculate the cost */
 657             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 658                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 659                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 660
 661             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 662             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 663         }
 664     }
 665
 666     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 667 }
 668
 669 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 670 {
 671     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 672     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 673     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 674
 675     int i, idx;
 676     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 677
 678     /*---------------- Try all mode and calculate their score ---------------*/
 679
 680     /* 16x16 prediction selection */
 681     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 682
 683     if( b_merged_satd && predict_mode[3] >= 0 )
 684     {
 685         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 686         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 687         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 688             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 689         for( i=0; i<4; i++ )
 690         {
 691             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 692             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 693         }
 694     }
 695     else
 696     {
 697         for( ; *predict_mode >= 0; predict_mode++ )
 698         {
 699             int i_satd;
 700             int i_mode = *predict_mode;
 701
 702             if( h->mb.b_lossless )
 703                 x264_predict_lossless_16x16( h, i_mode );
 704             else
 705                 h->predict_16x16[i_mode]( p_dst );
 706
 707             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 708                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 709             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 710             a->i_satd_i16x16_dir[i_mode] = i_satd;
 711         }
 712     }
 713
 714     if( h->sh.i_type == SLICE_TYPE_B )
 715         /* cavlc mb type prefix */
 716         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 717     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 718         return;
 719
 720     /* 8x8 prediction selection */
 721     if( flags & X264_ANALYSE_I8x8 )
 722     {
 723         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 724         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 725         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 726         int i_cost = 0;
 727         h->mb.i_cbp_luma = 0;
 728         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 729
 730         // FIXME some bias like in i4x4?
 731         if( h->sh.i_type == SLICE_TYPE_B )
 732             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 733
 734         for( idx = 0;; idx++ )
 735         {
 736             int x = idx&1;
 737             int y = idx>>1;
 738             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 739             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 740             int i_best = COST_MAX;
 741             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 742
 743             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 744             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 745
 746             if( b_merged_satd && predict_mode[8] >= 0 )
 747             {
 748                 int satd[9];
 749                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 750                 satd[i_pred_mode] -= 3 * a->i_lambda;
 751                 for( i=2; i>=0; i-- )
 752                 {
 753                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 754                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 755                 }
 756                 predict_mode += 3;
 757             }
 758
 759             for( ; *predict_mode >= 0; predict_mode++ )
 760             {
 761                 int i_satd;
 762                 int i_mode = *predict_mode;
 763
 764                 if( h->mb.b_lossless )
 765                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 766                 else
 767                     h->predict_8x8[i_mode]( p_dst_by, edge );
 768
 769                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
 770                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 771                     i_satd -= a->i_lambda * 3;
 772
 773                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 774                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 775             }
 776             i_cost += i_best;
 777
 778             if( idx == 3 || i_cost > i_satd_thresh )
 779                 break;
 780
 781             /* we need to encode this block now (for next ones) */
 782             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 783             x264_mb_encode_i8x8( h, idx, a->i_qp );
 784
 785             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 786         }
 787
 788         if( idx == 3 )
 789         {
 790             a->i_satd_i8x8 = i_cost;
 791             if( h->mb.i_skip_intra )
 792             {
 793                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 794                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 795                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 796                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 797                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 798                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 799                 if( h->mb.i_skip_intra == 2 )
 800                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 801             }
 802         }
 803         else
 804         {
 805             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 806             a->i_satd_i8x8 = COST_MAX;
 807             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 808         }
 809         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 810             return;
 811     }
 812
 813     /* 4x4 prediction selection */
 814     if( flags & X264_ANALYSE_I4x4 )
 815     {
 816         int i_cost;
 817         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 818         h->mb.i_cbp_luma = 0;
 819         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 820         if( a->i_mbrd )
 821             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 822
 823         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 824         if( h->sh.i_type == SLICE_TYPE_B )
 825             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 826
 827         for( idx = 0;; idx++ )
 828         {
 829             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 830             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 831             int i_best = COST_MAX;
 832             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 833
 834             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 835
 836             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 837                 /* emulate missing topright samples */
 838                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 839
 840             if( b_merged_satd && predict_mode[5] >= 0 )
 841             {
 842                 int satd[9];
 843                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 844                 satd[i_pred_mode] -= 3 * a->i_lambda;
 845                 for( i=2; i>=0; i-- )
 846                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 847                 predict_mode += 3;
 848             }
 849
 850             for( ; *predict_mode >= 0; predict_mode++ )
 851             {
 852                 int i_satd;
 853                 int i_mode = *predict_mode;
 854
 855                 if( h->mb.b_lossless )
 856                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 857                 else
 858                     h->predict_4x4[i_mode]( p_dst_by );
 859
 860                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 861                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 862                     i_satd -= a->i_lambda * 3;
 863
 864                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 865             }
 866             i_cost += i_best + 4 * a->i_lambda;
 867
 868             if( i_cost > i_satd_thresh || idx == 15 )
 869                 break;
 870
 871             /* we need to encode this block now (for next ones) */
 872             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 873             x264_mb_encode_i4x4( h, idx, a->i_qp );
 874
 875             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 876         }
 877         if( idx == 15 )
 878         {
 879             a->i_satd_i4x4 = i_cost;
 880             if( h->mb.i_skip_intra )
 881             {
 882                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 883                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 884                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 885                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 886                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 887                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 888                 if( h->mb.i_skip_intra == 2 )
 889                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 890             }
 891         }
 892         else
 893             a->i_satd_i4x4 = COST_MAX;
 894     }
 895 }
 896
 897 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 898 {
 899     if( a->i_satd_i16x16 <= i_satd_thresh )
 900     {
 901         h->mb.i_type = I_16x16;
 902         x264_analyse_update_cache( h, a );
 903         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 904     }
 905     else
 906         a->i_satd_i16x16 = COST_MAX;
 907
 908     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 909     {
 910         h->mb.i_type = I_4x4;
 911         x264_analyse_update_cache( h, a );
 912         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 913     }
 914     else
 915         a->i_satd_i4x4 = COST_MAX;
 916
 917     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 918     {
 919         h->mb.i_type = I_8x8;
 920         x264_analyse_update_cache( h, a );
 921         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 922         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 923     }
 924     else
 925         a->i_satd_i8x8 = COST_MAX;
 926 }
 927
 928 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 929 {
 930     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 931
 932     int i, idx, x, y;
 933     int i_mode, i_thresh;
 934     uint64_t i_satd, i_best;
 935     h->mb.i_skip_intra = 0;
 936
 937     if( h->mb.i_type == I_16x16 )
 938     {
 939         int old_pred_mode = a->i_predict16x16;
 940         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 941         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
 942         i_best = a->i_satd_i16x16;
 943         for( ; *predict_mode >= 0; predict_mode++ )
 944         {
 945             int i_mode = *predict_mode;
 946             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 947                 continue;
 948             h->mb.i_intra16x16_pred_mode = i_mode;
 949             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 950             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 951         }
 952     }
 953
 954     /* RD selection for chroma prediction */
 955     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 956     if( predict_mode[1] >= 0 )
 957     {
 958         int8_t predict_mode_sorted[4];
 959         int i_max;
 960         i_thresh = a->i_satd_i8x8chroma * 5/4;
 961
 962         for( i_max = 0; *predict_mode >= 0; predict_mode++ )
 963         {
 964             i_mode = *predict_mode;
 965             if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
 966                 predict_mode_sorted[i_max++] = i_mode;
 967         }
 968
 969         if( i_max > 0 )
 970         {
 971             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
 972             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
 973             /* the previous thing encoded was x264_intra_rd(), so the pixels and
 974              * coefs for the current chroma mode are still around, so we only
 975              * have to recount the bits. */
 976             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
 977             for( i = 0; i < i_max; i++ )
 978             {
 979                 i_mode = predict_mode_sorted[i];
 980                 if( h->mb.b_lossless )
 981                     x264_predict_lossless_8x8_chroma( h, i_mode );
 982                 else
 983                 {
 984                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 985                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 986                 }
 987                 /* if we've already found a mode that needs no residual, then
 988                  * probably any mode with a residual will be worse.
 989                  * so avoid dct on the remaining modes to improve speed. */
 990                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
 991                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
 992             }
 993             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 994             h->mb.i_cbp_chroma = i_cbp_chroma_best;
 995         }
 996     }
 997
 998     if( h->mb.i_type == I_4x4 )
 999     {
1000         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1001         int i_nnz = 0;
1002         for( idx = 0; idx < 16; idx++ )
1003         {
1004             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1005             i_best = COST_MAX64;
1006
1007             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1008
1009             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1010                 /* emulate missing topright samples */
1011                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1012
1013             for( ; *predict_mode >= 0; predict_mode++ )
1014             {
1015                 i_mode = *predict_mode;
1016                 if( h->mb.b_lossless )
1017                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1018                 else
1019                     h->predict_4x4[i_mode]( p_dst_by );
1020                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1021
1022                 if( i_best > i_satd )
1023                 {
1024                     a->i_predict4x4[idx] = i_mode;
1025                     i_best = i_satd;
1026                     pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1027                     pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1028                     pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1029                     pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1030                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1031                 }
1032             }
1033
1034             M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1035             M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1036             M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1037             M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1038             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1039
1040             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1041         }
1042     }
1043     else if( h->mb.i_type == I_8x8 )
1044     {
1045         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1046         for( idx = 0; idx < 4; idx++ )
1047         {
1048             uint64_t pels_h = 0;
1049             uint8_t pels_v[7];
1050             uint16_t i_nnz[2] = {0}; //shut up gcc
1051             uint8_t *p_dst_by;
1052             int j;
1053             int cbp_luma_new = 0;
1054             i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1055
1056             i_best = COST_MAX64;
1057             x = idx&1;
1058             y = idx>>1;
1059
1060             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1061             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1062             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1063
1064             for( ; *predict_mode >= 0; predict_mode++ )
1065             {
1066                 i_mode = *predict_mode;
1067                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1068                     continue;
1069
1070                 if( h->mb.b_lossless )
1071                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1072                 else
1073                     h->predict_8x8[i_mode]( p_dst_by, edge );
1074                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1075                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1076
1077                 if( i_best > i_satd )
1078                 {
1079                     a->i_predict8x8[idx] = i_mode;
1080                     cbp_luma_new = h->mb.i_cbp_luma;
1081                     i_best = i_satd;
1082
1083                     pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1084                     if( !(idx&1) )
1085                         for( j=0; j<7; j++ )
1086                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1087                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1088                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1089                 }
1090             }
1091             a->i_cbp_i8x8_luma = cbp_luma_new;
1092             M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1093             if( !(idx&1) )
1094                 for( j=0; j<7; j++ )
1095                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1096             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1097             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1098
1099             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1100         }
1101     }
1102 }
1103
/* Helpers that fill in an x264_me_t before a motion search.
 * They expect the encoder context `h` to be in scope at the point of use;
 * LOAD_FENC and REF_COST additionally expect the analysis context `a`.
 * NOTE: LOAD_FENC, LOAD_HPELS and LOAD_WPELS expand to several statements, so
 * they must not be used as the unbraced body of an if/else/loop. */

/* Point m at the source block at pixel offset (xoff,yoff): copies the mv cost
 * table pointer and the two strides, and sets p_fenc[0] at full offset and
 * p_fenc[1..2] at half offset.  Must run before LOAD_HPELS / LOAD_WPELS, which
 * read (m)->i_stride. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

/* Point m at reference `ref` of list `list`: planes 0..3 of src at offset
 * (xoff,yoff) with stride i_stride[0], planes 4..5 at half that offset with
 * stride i_stride[1], plus the matching integral plane.  Also resets the
 * weight to weight_none and makes p_fref_w alias p_fref[0]; LOAD_WPELS may
 * override both afterwards. */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[(list)][(ref)][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \
    (m)->i_ref = (ref);

/* Point m->p_fref_w into plane src at offset (xoff,yoff) and select the slice
 * weights of reference `ref`.  The weights are indexed by the `ref` argument
 * rather than by a variable named i_ref captured from the caller's scope.
 * `list` is accepted for symmetry with LOAD_HPELS and is not used. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[(ref)];

/* Cost stored in a->p_cost_ref for reference `ref` of list `list`. */
#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
1129
1130 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1131 {
1132     x264_me_t m;
1133     int i_ref, i_mvc;
1134     ALIGNED_4( int16_t mvc[8][2] );
1135     int i_halfpel_thresh = INT_MAX;
1136     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1137
1138     /* 16x16 Search on all ref frame */
1139     m.i_pixel = PIXEL_16x16;
1140     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1141
1142     a->l0.me16x16.cost = INT_MAX;
1143     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1144     {
1145         const int i_ref_cost = REF_COST( 0, i_ref );
1146         i_halfpel_thresh -= i_ref_cost;
1147         m.i_ref_cost = i_ref_cost;
1148
1149         /* search with ref */
1150         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1151         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1152
1153         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1154         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1155
1156         if( h->mb.ref_blind_dupe == i_ref )
1157         {
1158             CP32( m.mv, a->l0.mvc[0][0] );
1159             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1160         }
1161         else
1162             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1163
1164         /* early termination
1165          * SSD threshold would probably be better than SATD */
1166         if( i_ref == 0
1167             && a->b_try_pskip
1168             && m.cost-m.cost_mv < 300*a->i_lambda
1169             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1170               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1171             && x264_macroblock_probe_pskip( h ) )
1172         {
1173             h->mb.i_type = P_SKIP;
1174             x264_analyse_update_cache( h, a );
1175             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1176             return;
1177         }
1178
1179         m.cost += i_ref_cost;
1180         i_halfpel_thresh += i_ref_cost;
1181
1182         if( m.cost < a->l0.me16x16.cost )
1183             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1184
1185         /* save mv for predicting neighbors */
1186         CP32( a->l0.mvc[i_ref][0], m.mv );
1187         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1188     }
1189
1190     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1191     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1192
1193     h->mb.i_type = P_L0;
1194     if( a->i_mbrd )
1195     {
1196         x264_mb_cache_fenc_satd( h );
1197         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1198         {
1199             h->mb.i_partition = D_16x16;
1200             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1201             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1202             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1203                 h->mb.i_type = P_SKIP;
1204         }
1205     }
1206 }
1207
1208 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1209 {
1210     x264_me_t m;
1211     int i_ref;
1212     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1213     int i_halfpel_thresh = INT_MAX;
1214     int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1215     int i;
1216     int i_maxref = h->mb.pic.i_fref[0]-1;
1217
1218     h->mb.i_partition = D_8x8;
1219
1220     #define CHECK_NEIGHBOUR(i)\
1221     {\
1222         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1223         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1224             i_maxref = ref;\
1225     }
1226
1227     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1228      * than those used by the neighbors */
1229     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1230         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1231     {
1232         i_maxref = 0;
1233         CHECK_NEIGHBOUR(  -8 - 1 );
1234         CHECK_NEIGHBOUR(  -8 + 0 );
1235         CHECK_NEIGHBOUR(  -8 + 2 );
1236         CHECK_NEIGHBOUR(  -8 + 4 );
1237         CHECK_NEIGHBOUR(   0 - 1 );
1238         CHECK_NEIGHBOUR( 2*8 - 1 );
1239     }
1240
1241     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1242         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1243
1244     for( i = 0; i < 4; i++ )
1245     {
1246         x264_me_t *l0m = &a->l0.me8x8[i];
1247         const int x8 = i%2;
1248         const int y8 = i/2;
1249
1250         m.i_pixel = PIXEL_8x8;
1251
1252         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1253         l0m->cost = INT_MAX;
1254         for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1255         {
1256             const int i_ref_cost = REF_COST( 0, i_ref );
1257             m.i_ref_cost = i_ref_cost;
1258
1259             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1260             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1261
1262             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1263             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1264             if( h->mb.ref_blind_dupe == i_ref )
1265             {
1266                 CP32( m.mv, a->l0.mvc[0][i+1] );
1267                 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1268             }
1269             else
1270                 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1271
1272             m.cost += i_ref_cost;
1273             i_halfpel_thresh += i_ref_cost;
1274             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1275
1276             if( m.cost < l0m->cost )
1277                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1278             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1279                 i_ref = h->mb.ref_blind_dupe;
1280             else
1281                 i_ref++;
1282         }
1283         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1284         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1285
1286         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1287            are effectively zero. */
1288         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1289             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1290     }
1291
1292     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1293                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1294     /* P_8x8 ref0 has no ref cost */
1295     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1296                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1297         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1298     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1299     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1300 }
1301
1302 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1303 {
1304     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1305      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1306      * don't bother analysing the dupes. */
1307     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1308     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1309     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1310     int i_mvc;
1311     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1312     int i;
1313
1314     /* XXX Needed for x264_mb_predict_mv */
1315     h->mb.i_partition = D_8x8;
1316
1317     i_mvc = 1;
1318     CP32( mvc[0], a->l0.me16x16.mv );
1319
1320     for( i = 0; i < 4; i++ )
1321     {
1322         x264_me_t *m = &a->l0.me8x8[i];
1323         const int x8 = i%2;
1324         const int y8 = i/2;
1325
1326         m->i_pixel = PIXEL_8x8;
1327         m->i_ref_cost = i_ref_cost;
1328
1329         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1330         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1331         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1332
1333         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1334         x264_me_search( h, m, mvc, i_mvc );
1335
1336         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1337
1338         CP32( mvc[i_mvc], m->mv );
1339         i_mvc++;
1340
1341         /* mb type cost */
1342         m->cost += i_ref_cost;
1343         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1344             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1345     }
1346
1347     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1348                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1349     /* theoretically this should include 4*ref_cost,
1350      * but 3 seems a better approximation of cabac. */
1351     if( h->param.b_cabac )
1352         a->l0.i_cost8x8 -= i_ref_cost;
1353     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1354     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1355 }
1356
1357 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1358 {
1359     x264_me_t m;
1360     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1361     ALIGNED_4( int16_t mvc[3][2] );
1362     int i, j;
1363
1364     /* XXX Needed for x264_mb_predict_mv */
1365     h->mb.i_partition = D_16x8;
1366
1367     for( i = 0; i < 2; i++ )
1368     {
1369         x264_me_t *l0m = &a->l0.me16x8[i];
1370         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1371         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1372         const int ref8[2] = { minref, maxref };
1373         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1374
1375         m.i_pixel = PIXEL_16x8;
1376
1377         LOAD_FENC( &m, p_fenc, 0, 8*i );
1378         l0m->cost = INT_MAX;
1379         for( j = 0; j < i_ref8s; j++ )
1380         {
1381             const int i_ref = ref8[j];
1382             const int i_ref_cost = REF_COST( 0, i_ref );
1383             m.i_ref_cost = i_ref_cost;
1384
1385             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1386             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1387             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1388             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1389
1390             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1391             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1392
1393             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1394             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1395             /* We can only take this shortcut if the first search was performed on ref0. */
1396             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1397             {
1398                 /* We can just leave the MV from the previous ref search. */
1399                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1400             }
1401             else
1402                 x264_me_search( h, &m, mvc, 3 );
1403
1404             m.cost += i_ref_cost;
1405
1406             if( m.cost < l0m->cost )
1407                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1408         }
1409         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1410         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1411     }
1412
1413     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1414 }
1415
1416 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1417 {
1418     x264_me_t m;
1419     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1420     ALIGNED_4( int16_t mvc[3][2] );
1421     int i, j;
1422
1423     /* XXX Needed for x264_mb_predict_mv */
1424     h->mb.i_partition = D_8x16;
1425
1426     for( i = 0; i < 2; i++ )
1427     {
1428         x264_me_t *l0m = &a->l0.me8x16[i];
1429         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1430         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1431         const int ref8[2] = { minref, maxref };
1432         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1433
1434         m.i_pixel = PIXEL_8x16;
1435
1436         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1437         l0m->cost = INT_MAX;
1438         for( j = 0; j < i_ref8s; j++ )
1439         {
1440             const int i_ref = ref8[j];
1441             const int i_ref_cost = REF_COST( 0, i_ref );
1442             m.i_ref_cost = i_ref_cost;
1443
1444             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1445             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1446             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1447
1448             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1449             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1450
1451             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1452             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1453             /* We can only take this shortcut if the first search was performed on ref0. */
1454             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1455             {
1456                 /* We can just leave the MV from the previous ref search. */
1457                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1458             }
1459             else
1460                 x264_me_search( h, &m, mvc, 3 );
1461
1462             m.cost += i_ref_cost;
1463
1464             if( m.cost < l0m->cost )
1465                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1466         }
1467         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1468         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1469     }
1470
1471     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1472 }
1473
1474 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1475 {
1476     ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1477     uint8_t *pix2 = pix1+8;
1478     const int i_stride = h->mb.pic.i_stride[1];
1479     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1480     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1481     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1482     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1483     x264_weight_t *weight = h->sh.weight[i_ref];
1484
1485 #define CHROMA4x4MC( width, height, me, x, y ) \
1486     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1487     if( weight[1].weightfn ) \
1488         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1489     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1490     if( weight[2].weightfn ) \
1491         weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1492
1493
1494     if( pixel == PIXEL_4x4 )
1495     {
1496         x264_me_t *m = a->l0.me4x4[i8x8];
1497         CHROMA4x4MC( 2,2, m[0], 0,0 );
1498         CHROMA4x4MC( 2,2, m[1], 2,0 );
1499         CHROMA4x4MC( 2,2, m[2], 0,2 );
1500         CHROMA4x4MC( 2,2, m[3], 2,2 );
1501     }
1502     else if( pixel == PIXEL_8x4 )
1503     {
1504         x264_me_t *m = a->l0.me8x4[i8x8];
1505         CHROMA4x4MC( 4,2, m[0], 0,0 );
1506         CHROMA4x4MC( 4,2, m[1], 0,2 );
1507     }
1508     else
1509     {
1510         x264_me_t *m = a->l0.me4x8[i8x8];
1511         CHROMA4x4MC( 2,4, m[0], 0,0 );
1512         CHROMA4x4MC( 2,4, m[1], 2,0 );
1513     }
1514
1515     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1516          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1517 }
1518
1519 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1520 {
1521     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1522     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1523     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1524     int i4x4;
1525
1526     /* XXX Needed for x264_mb_predict_mv */
1527     h->mb.i_partition = D_8x8;
1528
1529     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1530     {
1531         const int idx = 4*i8x8 + i4x4;
1532         const int x4 = block_idx_x[idx];
1533         const int y4 = block_idx_y[idx];
1534         const int i_mvc = (i4x4 == 0);
1535
1536         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1537
1538         m->i_pixel = PIXEL_4x4;
1539
1540         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1541         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1542         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1543
1544         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1545         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1546
1547         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1548     }
1549     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1550                             a->l0.me4x4[i8x8][1].cost +
1551                             a->l0.me4x4[i8x8][2].cost +
1552                             a->l0.me4x4[i8x8][3].cost +
1553                             REF_COST( 0, i_ref ) +
1554                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1555     if( h->mb.b_chroma_me )
1556         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1557 }
1558
1559 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1560 {
1561     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1562     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1563     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1564     int i8x4;
1565
1566     /* XXX Needed for x264_mb_predict_mv */
1567     h->mb.i_partition = D_8x8;
1568
1569     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1570     {
1571         const int idx = 4*i8x8 + 2*i8x4;
1572         const int x4 = block_idx_x[idx];
1573         const int y4 = block_idx_y[idx];
1574         const int i_mvc = (i8x4 == 0);
1575
1576         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1577
1578         m->i_pixel = PIXEL_8x4;
1579
1580         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1581         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1582         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1583
1584         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1585         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1586
1587         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1588     }
1589     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1590                             REF_COST( 0, i_ref ) +
1591                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1592     if( h->mb.b_chroma_me )
1593         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1594 }
1595
1596 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1597 {
1598     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1599     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1600     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1601     int i4x8;
1602
1603     /* XXX Needed for x264_mb_predict_mv */
1604     h->mb.i_partition = D_8x8;
1605
1606     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1607     {
1608         const int idx = 4*i8x8 + i4x8;
1609         const int x4 = block_idx_x[idx];
1610         const int y4 = block_idx_y[idx];
1611         const int i_mvc = (i4x8 == 0);
1612
1613         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1614
1615         m->i_pixel = PIXEL_4x8;
1616
1617         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1618         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1619         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1620
1621         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1622         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1623
1624         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1625     }
1626     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1627                             REF_COST( 0, i_ref ) +
1628                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1629     if( h->mb.b_chroma_me )
1630         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1631 }
1632
1633 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1634 {
1635     /* Assumes that fdec still contains the results of
1636      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1637
1638     uint8_t **p_fenc = h->mb.pic.p_fenc;
1639     uint8_t **p_fdec = h->mb.pic.p_fdec;
1640     int i;
1641
1642     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1643     for( i = 0; i < 4; i++ )
1644     {
1645         const int x = (i&1)*8;
1646         const int y = (i>>1)*8;
1647         a->i_cost16x16direct +=
1648         a->i_cost8x8direct[i] =
1649             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1650
1651         /* mb type cost */
1652         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1653     }
1654 }
1655
1656 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1657 {
1658     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1659     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1660     uint8_t *src0, *src1;
1661     int stride0 = 16, stride1 = 16;
1662
1663     x264_me_t m;
1664     int i_ref, i_mvc;
1665     ALIGNED_4( int16_t mvc[9][2] );
1666     int i_halfpel_thresh = INT_MAX;
1667     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1668
1669     /* 16x16 Search on all ref frame */
1670     m.i_pixel = PIXEL_16x16;
1671     m.weight = weight_none;
1672
1673     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1674
1675     /* ME for List 0 */
1676     a->l0.me16x16.cost = INT_MAX;
1677     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1678     {
1679         const int i_ref_cost = REF_COST( 0, i_ref );
1680         m.i_ref_cost = i_ref_cost;
1681         /* search with ref */
1682         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1683         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1684         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1685         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1686
1687         /* add ref cost */
1688         m.cost += i_ref_cost;
1689
1690         if( m.cost < a->l0.me16x16.cost )
1691         {
1692             a->l0.i_ref = i_ref;
1693             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1694         }
1695
1696         /* save mv for predicting neighbors */
1697         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1698     }
1699     a->l0.me16x16.i_ref = a->l0.i_ref;
1700
1701     /* ME for list 1 */
1702     i_halfpel_thresh = INT_MAX;
1703     p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1704     a->l1.me16x16.cost = INT_MAX;
1705     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1706     {
1707         const int i_ref_cost = REF_COST( 0, i_ref );
1708         m.i_ref_cost = i_ref_cost;
1709         /* search with ref */
1710         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1711         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1712         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1713         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1714
1715         /* add ref cost */
1716         m.cost += i_ref_cost;
1717
1718         if( m.cost < a->l1.me16x16.cost )
1719         {
1720             a->l1.i_ref = i_ref;
1721             h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1722         }
1723
1724         /* save mv for predicting neighbors */
1725         CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1726     }
1727     a->l1.me16x16.i_ref = a->l1.i_ref;
1728
1729     /* get cost of BI mode */
1730     int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
1731     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1732     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1733     src0 = h->mc.get_ref( pix0, &stride0,
1734                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1735                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1736     src1 = h->mc.get_ref( pix1, &stride1,
1737                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1738                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1739
1740     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1741
1742     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1743                      + ref_costs
1744                      + a->l0.bi16x16.cost_mv
1745                      + a->l1.bi16x16.cost_mv;
1746
1747
1748     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1749     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1750     {
1751         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1752                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1753         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1754                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1755         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
1756                                 h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
1757                                 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1758         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1759                    + ref_costs + l0_mv_cost + l1_mv_cost;
1760         if( cost00 < a->i_cost16x16bi )
1761         {
1762             M32( a->l0.bi16x16.mv ) = 0;
1763             M32( a->l1.bi16x16.mv ) = 0;
1764             a->l0.bi16x16.cost_mv = l0_mv_cost;
1765             a->l1.bi16x16.cost_mv = l1_mv_cost;
1766             a->i_cost16x16bi = cost00;
1767         }
1768     }
1769
1770     /* mb type cost */
1771     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1772     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1773     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1774 }
1775
1776 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1777 {
1778     const int x = 2*(i%2);
1779     const int y = 2*(i/2);
1780
1781     switch( h->mb.i_sub_partition[i] )
1782     {
1783         case D_L0_8x8:
1784             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1785             break;
1786         case D_L0_8x4:
1787             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1788             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1789             break;
1790         case D_L0_4x8:
1791             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1792             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1793             break;
1794         case D_L0_4x4:
1795             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1796             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1797             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1798             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1799             break;
1800         default:
1801             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1802             break;
1803     }
1804 }
1805
1806 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1807 {
1808     const int x = 2*(idx&1);
1809     const int y = 2*(idx>>1);
1810     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1811     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1812     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1813     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1814 }
1815
1816 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1817     if( x264_mb_partition_listX_table[0][part] ) \
1818     { \
1819         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1820         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1821     } \
1822     else \
1823     { \
1824         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1825         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
1826         if( b_mvd ) \
1827             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1828     } \
1829     if( x264_mb_partition_listX_table[1][part] ) \
1830     { \
1831         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1832         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1833     } \
1834     else \
1835     { \
1836         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1837         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
1838         if( b_mvd ) \
1839             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1840     }
1841
1842 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1843 {
1844     int x = (i%2)*2;
1845     int y = (i/2)*2;
1846     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1847     {
1848         x264_mb_load_mv_direct8x8( h, i );
1849         if( b_mvd )
1850         {
1851             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1852             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1853             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1854         }
1855     }
1856     else
1857     {
1858         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1859     }
1860 }
1861 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1862 {
1863     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1864 }
1865 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1866 {
1867     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1868 }
1869 #undef CACHE_MV_BI
1870
1871 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1872 {
1873     uint8_t **p_fref[2] =
1874         { h->mb.pic.p_fref[0][a->l0.i_ref],
1875           h->mb.pic.p_fref[1][a->l1.i_ref] };
1876     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1877     int i, l;
1878
1879     /* XXX Needed for x264_mb_predict_mv */
1880     h->mb.i_partition = D_8x8;
1881
1882     a->i_cost8x8bi = 0;
1883
1884     for( i = 0; i < 4; i++ )
1885     {
1886         const int x8 = i%2;
1887         const int y8 = i/2;
1888         int i_part_cost;
1889         int i_part_cost_bi = 0;
1890         int stride[2] = {8,8};
1891         uint8_t *src[2];
1892
1893         for( l = 0; l < 2; l++ )
1894         {
1895             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1896             const int i_ref_cost = REF_COST( l, lX->i_ref );
1897             x264_me_t *m = &lX->me8x8[i];
1898
1899             m->i_pixel = PIXEL_8x8;
1900             m->i_ref_cost = i_ref_cost;
1901
1902             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1903             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1904
1905             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1906             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1907             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1908             m->cost += i_ref_cost;
1909
1910             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1911
1912             /* BI mode */
1913             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1914                                     m->mv[0], m->mv[1], 8, 8, weight_none );
1915             i_part_cost_bi += m->cost_mv + i_ref_cost;
1916         }
1917         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1918         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1919                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1920         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1921         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1922
1923         i_part_cost = a->l0.me8x8[i].cost;
1924         h->mb.i_sub_partition[i] = D_L0_8x8;
1925         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1926         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1927         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1928         a->i_cost8x8bi += i_part_cost;
1929
1930         /* XXX Needed for x264_mb_predict_mv */
1931         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1932     }
1933
1934     /* mb type cost */
1935     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1936 }
1937
1938 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1939 {
1940     uint8_t **p_fref[2] =
1941         { h->mb.pic.p_fref[0][a->l0.i_ref],
1942           h->mb.pic.p_fref[1][a->l1.i_ref] };
1943     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1944     ALIGNED_4( int16_t mvc[2][2] );
1945     int i, l;
1946
1947     h->mb.i_partition = D_16x8;
1948     a->i_cost16x8bi = 0;
1949
1950     for( i = 0; i < 2; i++ )
1951     {
1952         int i_part_cost;
1953         int i_part_cost_bi = 0;
1954         int stride[2] = {16,16};
1955         uint8_t *src[2];
1956
1957         /* TODO: check only the list(s) that were used in b8x8? */
1958         for( l = 0; l < 2; l++ )
1959         {
1960             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1961             const int i_ref_cost = REF_COST( l, lX->i_ref );
1962             x264_me_t *m = &lX->me16x8[i];
1963
1964             m->i_pixel = PIXEL_16x8;
1965             m->i_ref_cost = i_ref_cost;
1966
1967             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1968             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1969
1970             CP32( mvc[0], lX->me8x8[2*i].mv );
1971             CP32( mvc[1], lX->me8x8[2*i+1].mv );
1972
1973             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1974             x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1975             x264_me_search( h, m, mvc, 2 );
1976             m->cost += i_ref_cost;
1977
1978             /* BI mode */
1979             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1980                                     m->mv[0], m->mv[1], 16, 8, weight_none );
1981             i_part_cost_bi += m->cost_mv + i_ref_cost;
1982         }
1983         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1984         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1985
1986         i_part_cost = a->l0.me16x8[i].cost;
1987         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1988         if( a->l1.me16x8[i].cost < i_part_cost )
1989         {
1990             i_part_cost = a->l1.me16x8[i].cost;
1991             a->i_mb_partition16x8[i] = D_L1_8x8;
1992         }
1993         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1994         {
1995             i_part_cost = i_part_cost_bi;
1996             a->i_mb_partition16x8[i] = D_BI_8x8;
1997         }
1998         a->i_cost16x8bi += i_part_cost;
1999
2000         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2001     }
2002
2003     /* mb type cost */
2004     a->i_mb_type16x8 = B_L0_L0
2005         + (a->i_mb_partition16x8[0]>>2) * 3
2006         + (a->i_mb_partition16x8[1]>>2);
2007     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2008 }
2009
2010 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2011 {
2012     uint8_t **p_fref[2] =
2013         { h->mb.pic.p_fref[0][a->l0.i_ref],
2014           h->mb.pic.p_fref[1][a->l1.i_ref] };
2015     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2016     ALIGNED_4( int16_t mvc[2][2] );
2017     int i, l;
2018
2019     h->mb.i_partition = D_8x16;
2020     a->i_cost8x16bi = 0;
2021
2022     for( i = 0; i < 2; i++ )
2023     {
2024         int i_part_cost;
2025         int i_part_cost_bi = 0;
2026         int stride[2] = {8,8};
2027         uint8_t *src[2];
2028
2029         for( l = 0; l < 2; l++ )
2030         {
2031             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2032             const int i_ref_cost = REF_COST( l, lX->i_ref );
2033             x264_me_t *m = &lX->me8x16[i];
2034
2035             m->i_pixel = PIXEL_8x16;
2036             m->i_ref_cost = i_ref_cost;
2037
2038             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
2039             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
2040
2041             CP32( mvc[0], lX->me8x8[i].mv );
2042             CP32( mvc[1], lX->me8x8[i+2].mv );
2043
2044             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2045             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2046             x264_me_search( h, m, mvc, 2 );
2047             m->cost += i_ref_cost;
2048
2049             /* BI mode */
2050             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
2051                                     m->mv[0], m->mv[1], 8, 16, weight_none );
2052             i_part_cost_bi += m->cost_mv + i_ref_cost;
2053         }
2054
2055         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2056         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2057
2058         i_part_cost = a->l0.me8x16[i].cost;
2059         a->i_mb_partition8x16[i] = D_L0_8x8;
2060         if( a->l1.me8x16[i].cost < i_part_cost )
2061         {
2062             i_part_cost = a->l1.me8x16[i].cost;
2063             a->i_mb_partition8x16[i] = D_L1_8x8;
2064         }
2065         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2066         {
2067             i_part_cost = i_part_cost_bi;
2068             a->i_mb_partition8x16[i] = D_BI_8x8;
2069         }
2070         a->i_cost8x16bi += i_part_cost;
2071
2072         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2073     }
2074
2075     /* mb type cost */
2076     a->i_mb_type8x16 = B_L0_L0
2077         + (a->i_mb_partition8x16[0]>>2) * 3
2078         + (a->i_mb_partition8x16[1]>>2);
2079     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2080 }
2081
2082 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2083 {
2084     int thresh = i_satd * 5/4;
2085
2086     h->mb.i_type = P_L0;
2087     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2088     {
2089         h->mb.i_partition = D_16x16;
2090         x264_analyse_update_cache( h, a );
2091         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2092     }
2093
2094     if( a->l0.i_cost16x8 <= thresh )
2095     {
2096         h->mb.i_partition = D_16x8;
2097         x264_analyse_update_cache( h, a );
2098         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2099     }
2100     else
2101         a->l0.i_cost16x8 = COST_MAX;
2102
2103     if( a->l0.i_cost8x16 <= thresh )
2104     {
2105         h->mb.i_partition = D_8x16;
2106         x264_analyse_update_cache( h, a );
2107         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2108     }
2109     else
2110         a->l0.i_cost8x16 = COST_MAX;
2111
2112     if( a->l0.i_cost8x8 <= thresh )
2113     {
2114         h->mb.i_type = P_8x8;
2115         h->mb.i_partition = D_8x8;
2116         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2117         {
2118             int i;
2119             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2120             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2121             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2122             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2123             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2124              * for future blocks are those left over from previous RDO calls. */
2125             for( i = 0; i < 4; i++ )
2126             {
2127                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2128                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2129                 int subtype, btype = D_L0_8x8;
2130                 uint64_t bcost = COST_MAX64;
2131                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2132                 {
2133                     uint64_t cost;
2134                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2135                         continue;
2136                     h->mb.i_sub_partition[i] = subtype;
2137                     x264_mb_cache_mv_p8x8( h, a, i );
2138                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2139                     COPY2_IF_LT( bcost, cost, btype, subtype );
2140                 }
2141                 if( h->mb.i_sub_partition[i] != btype )
2142                 {
2143                     h->mb.i_sub_partition[i] = btype;
2144                     x264_mb_cache_mv_p8x8( h, a, i );
2145                 }
2146             }
2147         }
2148         else
2149             x264_analyse_update_cache( h, a );
2150         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2151     }
2152     else
2153         a->l0.i_cost8x8 = COST_MAX;
2154 }
2155
2156 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2157 {
2158     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2159
2160     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2161     {
2162         h->mb.i_type = B_DIRECT;
2163         /* Assumes direct/skip MC is still in fdec */
2164         /* Requires b-rdo to be done before intra analysis */
2165         h->mb.b_skip_mc = 1;
2166         x264_analyse_update_cache( h, a );
2167         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2168         h->mb.b_skip_mc = 0;
2169     }
2170
2171     //FIXME not all the update_cache calls are needed
2172     h->mb.i_partition = D_16x16;
2173     /* L0 */
2174     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2175     {
2176         h->mb.i_type = B_L0_L0;
2177         x264_analyse_update_cache( h, a );
2178         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2179     }
2180
2181     /* L1 */
2182     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2183     {
2184         h->mb.i_type = B_L1_L1;
2185         x264_analyse_update_cache( h, a );
2186         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2187     }
2188
2189     /* BI */
2190     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2191     {
2192         h->mb.i_type = B_BI_BI;
2193         x264_analyse_update_cache( h, a );
2194         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2195     }
2196
2197     /* 8x8 */
2198     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2199     {
2200         h->mb.i_type = B_8x8;
2201         h->mb.i_partition = D_8x8;
2202         x264_analyse_update_cache( h, a );
2203         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2204         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2205     }
2206
2207     /* 16x8 */
2208     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2209     {
2210         h->mb.i_type = a->i_mb_type16x8;
2211         h->mb.i_partition = D_16x8;
2212         x264_analyse_update_cache( h, a );
2213         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2214     }
2215
2216     /* 8x16 */
2217     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2218     {
2219         h->mb.i_type = a->i_mb_type8x16;
2220         h->mb.i_partition = D_8x16;
2221         x264_analyse_update_cache( h, a );
2222         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2223     }
2224 }
2225
2226 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2227 {
2228     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2229     int i;
2230
2231     if( IS_INTRA(h->mb.i_type) )
2232         return;
2233
2234     switch( h->mb.i_partition )
2235     {
2236         case D_16x16:
2237             if( h->mb.i_type == B_BI_BI )
2238                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2239             break;
2240         case D_16x8:
2241             for( i=0; i<2; i++ )
2242                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2243                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2244             break;
2245         case D_8x16:
2246             for( i=0; i<2; i++ )
2247                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2248                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2249             break;
2250         case D_8x8:
2251             for( i=0; i<4; i++ )
2252                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2253                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2254             break;
2255     }
2256 }
2257
2258 static inline void x264_mb_analyse_transform( x264_t *h )
2259 {
2260     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2261     {
2262         int i_cost4, i_cost8;
2263         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2264         x264_mb_mc( h );
2265
2266         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2267                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2268         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2269                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2270
2271         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2272         h->mb.b_skip_mc = 1;
2273     }
2274 }
2275
2276 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2277 {
2278     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2279     {
2280         int i_rd8;
2281         x264_analyse_update_cache( h, a );
2282         h->mb.b_transform_8x8 ^= 1;
2283         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2284         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2285
2286         if( *i_rd >= i_rd8 )
2287         {
2288             if( *i_rd > 0 )
2289                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2290             *i_rd = i_rd8;
2291         }
2292         else
2293             h->mb.b_transform_8x8 ^= 1;
2294     }
2295 }
2296
2297 /* Rate-distortion optimal QP selection.
2298  * FIXME: More than half of the benefit of this function seems to be
2299  * in the way it improves the coding of chroma DC (by decimating or
2300  * finding a better way to code a single DC coefficient.)
2301  * There must be a more efficient way to get that portion of the benefit
2302  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2303  * trick. */
2304 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2305 {
2306     int bcost, cost, direction, failures, prevcost, origcost;
2307     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2308     int last_qp_tried = 0;
2309     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2310
2311     /* If CBP is already zero, don't raise the quantizer any higher. */
2312     for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2313     {
2314         /* Without psy-RD, require monotonicity when moving quant away from previous
2315          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2316          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2317          * allow 2 failures when moving quant towards previous quant.
2318          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2319         int threshold = (!!h->mb.i_psy_rd);
2320         /* Raise the threshold for failures if we're moving towards the last QP. */
2321         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2322             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2323             threshold++;
2324         h->mb.i_qp = orig_qp;
2325         failures = 0;
2326         prevcost = origcost;
2327         h->mb.i_qp += direction;
2328         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2329         {
2330             if( h->mb.i_last_qp == h->mb.i_qp )
2331                 last_qp_tried = 1;
2332             h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2333             cost = x264_rd_cost_mb( h, a->i_lambda2 );
2334             COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2335
2336             /* We can't assume that the costs are monotonic over QPs.
2337              * Tie case-as-failure seems to give better results. */
2338             if( cost < prevcost )
2339                 failures = 0;
2340             else
2341                 failures++;
2342             prevcost = cost;
2343
2344             if( failures > threshold )
2345                 break;
2346             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2347                 break;
2348             h->mb.i_qp += direction;
2349         }
2350     }
2351
2352     /* Always try the last block's QP. */
2353     if( !last_qp_tried )
2354     {
2355         h->mb.i_qp = h->mb.i_last_qp;
2356         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2357         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2358         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2359     }
2360
2361     h->mb.i_qp = bqp;
2362     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2363
2364     /* Check transform again; decision from before may no longer be optimal. */
2365     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2366         x264_mb_transform_8x8_allowed( h ) )
2367     {
2368         h->mb.b_transform_8x8 ^= 1;
2369         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2370         if( cost > bcost )
2371             h->mb.b_transform_8x8 ^= 1;
2372     }
2373 }
2374
2375 /*****************************************************************************
2376  * x264_macroblock_analyse:
2377  *****************************************************************************/
2378 void x264_macroblock_analyse( x264_t *h )
2379 {
2380     x264_mb_analysis_t analysis;
2381     int i_cost = COST_MAX;
2382     int i;
2383
2384     h->mb.i_qp = x264_ratecontrol_qp( h );
2385     if( h->param.rc.i_aq_mode )
2386     {
2387         x264_adaptive_quant( h );
2388         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2389          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2390         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2391             h->mb.i_qp = h->mb.i_last_qp;
2392     }
2393
2394     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2395
2396     /*--------------------------- Do the analysis ---------------------------*/
2397     if( h->sh.i_type == SLICE_TYPE_I )
2398     {
2399 intra_analysis:
2400         if( analysis.i_mbrd )
2401             x264_mb_cache_fenc_satd( h );
2402         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2403         if( analysis.i_mbrd )
2404             x264_intra_rd( h, &analysis, COST_MAX );
2405
2406         i_cost = analysis.i_satd_i16x16;
2407         h->mb.i_type = I_16x16;
2408         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2409         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2410         if( analysis.i_satd_pcm < i_cost )
2411             h->mb.i_type = I_PCM;
2412
2413         else if( analysis.i_mbrd >= 2 )
2414             x264_intra_rd_refine( h, &analysis );
2415     }
2416     else if( h->sh.i_type == SLICE_TYPE_P )
2417     {
2418         int b_skip = 0;
2419
2420         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2421
2422         analysis.b_try_pskip = 0;
2423         if( analysis.b_force_intra )
2424         {
2425             if( !h->param.analyse.b_psy )
2426             {
2427                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2428                 goto intra_analysis;
2429             }
2430         }
2431         else
2432         {
2433             /* Fast P_SKIP detection */
2434             if( h->param.analyse.b_fast_pskip )
2435             {
2436                 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2437                     // FIXME don't need to check this if the reference frame is done
2438                     {}
2439                 else if( h->param.analyse.i_subpel_refine >= 3 )
2440                     analysis.b_try_pskip = 1;
2441                 else if( h->mb.i_mb_type_left == P_SKIP ||
2442                          h->mb.i_mb_type_top == P_SKIP ||
2443                          h->mb.i_mb_type_topleft == P_SKIP ||
2444                          h->mb.i_mb_type_topright == P_SKIP )
2445                     b_skip = x264_macroblock_probe_pskip( h );
2446             }
2447         }
2448
2449         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2450
2451         if( b_skip )
2452         {
2453             h->mb.i_type = P_SKIP;
2454             h->mb.i_partition = D_16x16;
2455             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2456         }
2457         else
2458         {
2459             const unsigned int flags = h->param.analyse.inter;
2460             int i_type;
2461             int i_partition;
2462             int i_thresh16x8;
2463             int i_satd_inter, i_satd_intra;
2464
2465             x264_mb_analyse_load_costs( h, &analysis );
2466
2467             x264_mb_analyse_inter_p16x16( h, &analysis );
2468
2469             if( h->mb.i_type == P_SKIP )
2470                 return;
2471
2472             if( flags & X264_ANALYSE_PSUB16x16 )
2473             {
2474                 if( h->param.analyse.b_mixed_references )
2475                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2476                 else
2477                     x264_mb_analyse_inter_p8x8( h, &analysis );
2478             }
2479
2480             /* Select best inter mode */
2481             i_type = P_L0;
2482             i_partition = D_16x16;
2483             i_cost = analysis.l0.me16x16.cost;
2484
2485             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2486                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2487             {
2488                 i_type = P_8x8;
2489                 i_partition = D_8x8;
2490                 i_cost = analysis.l0.i_cost8x8;
2491
2492                 /* Do sub 8x8 */
2493                 if( flags & X264_ANALYSE_PSUB8x8 )
2494                 {
2495                     for( i = 0; i < 4; i++ )
2496                     {
2497                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2498                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2499                         {
2500                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2501                             h->mb.i_sub_partition[i] = D_L0_4x4;
2502
2503                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2504                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2505                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2506
2507                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2508                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2509                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2510
2511                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2512                         }
2513                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2514                     }
2515                     analysis.l0.i_cost8x8 = i_cost;
2516                 }
2517             }
2518
2519             /* Now do 16x8/8x16 */
2520             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2521             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2522                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2523             {
2524                 x264_mb_analyse_inter_p16x8( h, &analysis );
2525                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2526
2527                 x264_mb_analyse_inter_p8x16( h, &analysis );
2528                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2529             }
2530
2531             h->mb.i_partition = i_partition;
2532
2533             /* refine qpel */
2534             //FIXME mb_type costs?
2535             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2536             {
2537                 /* refine later */
2538             }
2539             else if( i_partition == D_16x16 )
2540             {
2541                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2542                 i_cost = analysis.l0.me16x16.cost;
2543             }
2544             else if( i_partition == D_16x8 )
2545             {
2546                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2547                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2548                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2549             }
2550             else if( i_partition == D_8x16 )
2551             {
2552                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2553                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2554                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2555             }
2556             else if( i_partition == D_8x8 )
2557             {
2558                 int i8x8;
2559                 i_cost = 0;
2560                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2561                 {
2562                     switch( h->mb.i_sub_partition[i8x8] )
2563                     {
2564                         case D_L0_8x8:
2565                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2566                             i_cost += analysis.l0.me8x8[i8x8].cost;
2567                             break;
2568                         case D_L0_8x4:
2569                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2570                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2571                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2572                                       analysis.l0.me8x4[i8x8][1].cost;
2573                             break;
2574                         case D_L0_4x8:
2575                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2576                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2577                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2578                                       analysis.l0.me4x8[i8x8][1].cost;
2579                             break;
2580
2581                         case D_L0_4x4:
2582                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2583                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2584                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2585                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2586                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2587                                       analysis.l0.me4x4[i8x8][1].cost +
2588                                       analysis.l0.me4x4[i8x8][2].cost +
2589                                       analysis.l0.me4x4[i8x8][3].cost;
2590                             break;
2591                         default:
2592                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2593                             break;
2594                     }
2595                 }
2596             }
2597
2598             if( h->mb.b_chroma_me )
2599             {
2600                 x264_mb_analyse_intra_chroma( h, &analysis );
2601                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2602                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2603                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2604                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2605             }
2606             else
2607                 x264_mb_analyse_intra( h, &analysis, i_cost );
2608
2609             i_satd_inter = i_cost;
2610             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2611                                       analysis.i_satd_i8x8,
2612                                       analysis.i_satd_i4x4 );
2613
2614             if( analysis.i_mbrd )
2615             {
2616                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2617                 i_type = P_L0;
2618                 i_partition = D_16x16;
2619                 i_cost = analysis.l0.i_rd16x16;
2620                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2621                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2622                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2623                 h->mb.i_type = i_type;
2624                 h->mb.i_partition = i_partition;
2625                 if( i_cost < COST_MAX )
2626                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2627                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2628             }
2629
2630             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2631             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2632             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2633             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2634
2635             h->mb.i_type = i_type;
2636
2637             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2638             {
2639                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2640                  * it was an inter block. */
2641                 x264_analyse_update_cache( h, &analysis );
2642                 x264_macroblock_encode( h );
2643                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2644                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2645                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2646                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2647                 goto intra_analysis;
2648             }
2649
2650             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2651             {
2652                 if( IS_INTRA( h->mb.i_type ) )
2653                 {
2654                     x264_intra_rd_refine( h, &analysis );
2655                 }
2656                 else if( i_partition == D_16x16 )
2657                 {
2658                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2659                     analysis.l0.me16x16.cost = i_cost;
2660                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2661                 }
2662                 else if( i_partition == D_16x8 )
2663                 {
2664                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2665                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2666                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2667                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2668                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2669                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2670                 }
2671                 else if( i_partition == D_8x16 )
2672                 {
2673                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2674                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2675                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2676                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2677                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2678                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2679                 }
2680                 else if( i_partition == D_8x8 )
2681                 {
2682                     int i8x8;
2683                     x264_analyse_update_cache( h, &analysis );
2684                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2685                     {
2686                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2687                         {
2688                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2689                         }
2690                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2691                         {
2692                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2693                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2694                         }
2695                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2696                         {
2697                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2698                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2699                         }
2700                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2701                         {
2702                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2703                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2704                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2705                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2706                         }
2707                     }
2708                 }
2709             }
2710         }
2711     }
2712     else if( h->sh.i_type == SLICE_TYPE_B )
2713     {
2714         int i_bskip_cost = COST_MAX;
2715         int b_skip = 0;
2716
2717         if( analysis.i_mbrd )
2718             x264_mb_cache_fenc_satd( h );
2719
2720         h->mb.i_type = B_SKIP;
2721         if( h->mb.b_direct_auto_write )
2722         {
2723             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2724             for( i = 0; i < 2; i++ )
2725             {
2726                 int b_changed = 1;
2727                 h->sh.b_direct_spatial_mv_pred ^= 1;
2728                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2729                 if( analysis.b_direct_available )
2730                 {
2731                     if( b_changed )
2732                     {
2733                         x264_mb_mc( h );
2734                         b_skip = x264_macroblock_probe_bskip( h );
2735                     }
2736                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2737                 }
2738                 else
2739                     b_skip = 0;
2740             }
2741         }
2742         else
2743             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2744
2745         if( analysis.b_direct_available )
2746         {
2747             if( !h->mb.b_direct_auto_write )
2748                 x264_mb_mc( h );
2749             if( analysis.i_mbrd )
2750             {
2751                 i_bskip_cost = ssd_mb( h );
2752                 /* 6 = minimum cavlc cost of a non-skipped MB */
2753                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2754             }
2755             else if( !h->mb.b_direct_auto_write )
2756             {
2757                 /* Conditioning the probe on neighboring block types
2758                  * doesn't seem to help speed or quality. */
2759                 b_skip = x264_macroblock_probe_bskip( h );
2760             }
2761         }
2762
2763         if( !b_skip )
2764         {
2765             const unsigned int flags = h->param.analyse.inter;
2766             int i_type;
2767             int i_partition;
2768             int i_satd_inter;
2769             h->mb.b_skip_mc = 0;
2770
2771             x264_mb_analyse_load_costs( h, &analysis );
2772
2773             /* select best inter mode */
2774             /* direct must be first */
2775             if( analysis.b_direct_available )
2776                 x264_mb_analyse_inter_direct( h, &analysis );
2777
2778             x264_mb_analyse_inter_b16x16( h, &analysis );
2779
2780             i_type = B_L0_L0;
2781             i_partition = D_16x16;
2782             i_cost = analysis.l0.me16x16.cost;
2783             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2784             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2785             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2786
2787             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2788             {
2789                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2790                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2791                     i_bskip_cost < analysis.i_rd16x16bi &&
2792                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2793                     i_bskip_cost < analysis.l1.i_rd16x16 )
2794                 {
2795                     h->mb.i_type = B_SKIP;
2796                     x264_analyse_update_cache( h, &analysis );
2797                     return;
2798                 }
2799             }
2800
2801             if( flags & X264_ANALYSE_BSUB16x16 )
2802             {
2803                 x264_mb_analyse_inter_b8x8( h, &analysis );
2804                 if( analysis.i_cost8x8bi < i_cost )
2805                 {
2806                     i_type = B_8x8;
2807                     i_partition = D_8x8;
2808                     i_cost = analysis.i_cost8x8bi;
2809
2810                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2811                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2812                     {
2813                         x264_mb_analyse_inter_b16x8( h, &analysis );
2814                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2815                                      i_type, analysis.i_mb_type16x8,
2816                                      i_partition, D_16x8 );
2817                     }
2818                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2819                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2820                     {
2821                         x264_mb_analyse_inter_b8x16( h, &analysis );
2822                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2823                                      i_type, analysis.i_mb_type8x16,
2824                                      i_partition, D_8x16 );
2825                     }
2826                 }
2827             }
2828
2829             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2830             {
2831                 /* refine later */
2832             }
2833             /* refine qpel */
2834             else if( i_partition == D_16x16 )
2835             {
2836                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2837                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2838                 if( i_type == B_L0_L0 )
2839                 {
2840                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2841                     i_cost = analysis.l0.me16x16.cost
2842                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2843                 }
2844                 else if( i_type == B_L1_L1 )
2845                 {
2846                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2847                     i_cost = analysis.l1.me16x16.cost
2848                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2849                 }
2850                 else if( i_type == B_BI_BI )
2851                 {
2852                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2853                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2854                 }
2855             }
2856             else if( i_partition == D_16x8 )
2857             {
2858                 for( i=0; i<2; i++ )
2859                 {
2860                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2861                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2862                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2863                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2864                 }
2865             }
2866             else if( i_partition == D_8x16 )
2867             {
2868                 for( i=0; i<2; i++ )
2869                 {
2870                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2871                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2872                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2873                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2874                 }
2875             }
2876             else if( i_partition == D_8x8 )
2877             {
2878                 for( i=0; i<4; i++ )
2879                 {
2880                     x264_me_t *m;
2881                     int i_part_cost_old;
2882                     int i_type_cost;
2883                     int i_part_type = h->mb.i_sub_partition[i];
2884                     int b_bidir = (i_part_type == D_BI_8x8);
2885
2886                     if( i_part_type == D_DIRECT_8x8 )
2887                         continue;
2888                     if( x264_mb_partition_listX_table[0][i_part_type] )
2889                     {
2890                         m = &analysis.l0.me8x8[i];
2891                         i_part_cost_old = m->cost;
2892                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2893                         m->cost -= i_type_cost;
2894                         x264_me_refine_qpel( h, m );
2895                         if( !b_bidir )
2896                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2897                     }
2898                     if( x264_mb_partition_listX_table[1][i_part_type] )
2899                     {
2900                         m = &analysis.l1.me8x8[i];
2901                         i_part_cost_old = m->cost;
2902                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2903                         m->cost -= i_type_cost;
2904                         x264_me_refine_qpel( h, m );
2905                         if( !b_bidir )
2906                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2907                     }
2908                     /* TODO: update mvp? */
2909                 }
2910             }
2911
2912             i_satd_inter = i_cost;
2913
2914             if( analysis.i_mbrd )
2915             {
2916                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2917                 i_type = B_SKIP;
2918                 i_cost = i_bskip_cost;
2919                 i_partition = D_16x16;
2920                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2921                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2922                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2923                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2924                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2925                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2926                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2927
2928                 h->mb.i_type = i_type;
2929                 h->mb.i_partition = i_partition;
2930             }
2931
2932             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2933
2934             if( analysis.i_mbrd )
2935             {
2936                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2937                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2938             }
2939
2940             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2941             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2942             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2943             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2944
2945             h->mb.i_type = i_type;
2946             h->mb.i_partition = i_partition;
2947
2948             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2949                 x264_intra_rd_refine( h, &analysis );
2950             if( h->mb.i_subpel_refine >= 5 )
2951                 x264_refine_bidir( h, &analysis );
2952
2953             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2954             {
2955                 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2956                 x264_analyse_update_cache( h, &analysis );
2957
2958                 if( i_partition == D_16x16 )
2959                 {
2960                     if( i_type == B_L0_L0 )
2961                     {
2962                         analysis.l0.me16x16.cost = i_cost;
2963                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2964                     }
2965                     else if( i_type == B_L1_L1 )
2966                     {
2967                         analysis.l1.me16x16.cost = i_cost;
2968                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2969                     }
2970                     else if( i_type == B_BI_BI )
2971                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
2972                 }
2973                 else if( i_partition == D_16x8 )
2974                 {
2975                     for( i = 0; i < 2; i++ )
2976                     {
2977                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2978                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2979                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2980                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2981                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2982                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2983                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2984                     }
2985                 }
2986                 else if( i_partition == D_8x16 )
2987                 {
2988                     for( i = 0; i < 2; i++ )
2989                     {
2990                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2991                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2992                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2993                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2994                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2995                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2996                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2997                     }
2998                 }
2999                 else if( i_partition == D_8x8 )
3000                 {
3001                     for( i = 0; i < 4; i++ )
3002                     {
3003                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3004                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3005                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3006                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3007                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3008                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3009                     }
3010                 }
3011             }
3012         }
3013     }
3014
3015     x264_analyse_update_cache( h, &analysis );
3016
3017     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3018      * without realizing it.  Check for this and account for it if necessary. */
3019     if( analysis.i_mbrd >= 2 )
3020     {
3021         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3022         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3023         int list = check_mv_lists[h->mb.i_type] - 1;
3024         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3025             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3026             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3027                 h->mb.i_partition = D_16x16;
3028     }
3029
3030     if( !analysis.i_mbrd )
3031         x264_mb_analyse_transform( h );
3032
3033     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3034         x264_mb_analyse_qp_rd( h, &analysis );
3035
3036     h->mb.b_trellis = h->param.analyse.i_trellis;
3037     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3038     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3039         x264_psy_trellis_init( h, 0 );
3040     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3041         h->mb.i_skip_intra = 0;
3042 }
3043
3044 /*-------------------- Update MB from the analysis ----------------------*/
3045 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3046 {
3047     int i;
3048
3049     switch( h->mb.i_type )
3050     {
3051         case I_4x4:
3052             for( i = 0; i < 16; i++ )
3053                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3054
3055             x264_mb_analyse_intra_chroma( h, a );
3056             break;
3057         case I_8x8:
3058             for( i = 0; i < 4; i++ )
3059                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3060
3061             x264_mb_analyse_intra_chroma( h, a );
3062             break;
3063         case I_16x16:
3064             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3065             x264_mb_analyse_intra_chroma( h, a );
3066             break;
3067
3068         case I_PCM:
3069             break;
3070
3071         case P_L0:
3072             switch( h->mb.i_partition )
3073             {
3074                 case D_16x16:
3075                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3076                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3077                     break;
3078
3079                 case D_16x8:
3080                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3081                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3082                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3083                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3084                     break;
3085
3086                 case D_8x16:
3087                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3088                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3089                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3090                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3091                     break;
3092
3093                 default:
3094                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3095                     break;
3096             }
3097             break;
3098
3099         case P_8x8:
3100             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3101             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3102             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3103             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3104             for( i = 0; i < 4; i++ )
3105                 x264_mb_cache_mv_p8x8( h, a, i );
3106             break;
3107
3108         case P_SKIP:
3109         {
3110             h->mb.i_partition = D_16x16;
3111             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3112             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3113             break;
3114         }
3115
3116         case B_SKIP:
3117         case B_DIRECT:
3118             x264_mb_load_mv_direct8x8( h, 0 );
3119             x264_mb_load_mv_direct8x8( h, 1 );
3120             x264_mb_load_mv_direct8x8( h, 2 );
3121             x264_mb_load_mv_direct8x8( h, 3 );
3122             break;
3123
3124         case B_8x8:
3125             /* optimize: cache might not need to be rewritten */
3126             for( i = 0; i < 4; i++ )
3127                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3128             break;
3129
3130         default: /* the rest of the B types */
3131             switch( h->mb.i_partition )
3132             {
3133             case D_16x16:
3134                 switch( h->mb.i_type )
3135                 {
3136                 case B_L0_L0:
3137                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3138                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3139
3140                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3141                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3142                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3143                     break;
3144                 case B_L1_L1:
3145                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3146                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3147                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3148
3149                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3150                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3151                     break;
3152                 case B_BI_BI:
3153                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3154                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3155
3156                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3157                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3158                     break;
3159                 }
3160                 break;
3161             case D_16x8:
3162                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3163                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3164                 break;
3165             case D_8x16:
3166                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3167                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3168                 break;
3169             default:
3170                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3171                 break;
3172             }
3173     }
3174
3175 #ifndef NDEBUG
3176     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3177     {
3178         int l;
3179         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3180         {
3181             int completed;
3182             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3183             if( ref < 0 )
3184                 continue;
3185             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3186             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3187             {
3188                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3189                 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3190                 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3191                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3192                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3193                 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3194                 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3195                 fprintf(stderr, "completed: %d \n", completed );
3196                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3197                 x264_mb_analyse_intra( h, a, COST_MAX );
3198                 h->mb.i_type = I_16x16;
3199                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3200                 x264_mb_analyse_intra_chroma( h, a );
3201             }
3202         }
3203     }
3204 #endif
3205 }
3206
3207 #include "slicetype.c"
3208