git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
  37 typedef struct
  38 {
  39     /* 16x16 */
  40     int i_ref;
  41     int       i_rd16x16;
  42     x264_me_t me16x16;
  43
  44     /* 8x8 */
  45     int       i_cost8x8;
  46     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
  47     ALIGNED_4( int16_t mvc[32][5][2] );
  48     x264_me_t me8x8[4];
  49
  50     /* Sub 4x4 */
  51     int       i_cost4x4[4]; /* cost per 8x8 partition */
  52     x264_me_t me4x4[4][4];
  53
  54     /* Sub 8x4 */
  55     int       i_cost8x4[4]; /* cost per 8x8 partition */
  56     x264_me_t me8x4[4][2];
  57
  58     /* Sub 4x8 */
  59     int       i_cost4x8[4]; /* cost per 8x8 partition */
  60     x264_me_t me4x8[4][2];
  61
  62     /* 16x8 */
  63     int       i_cost16x8;
  64     x264_me_t me16x8[2];
  65
  66     /* 8x16 */
  67     int       i_cost8x16;
  68     x264_me_t me8x16[2];
  69
  70 } x264_mb_analysis_list_t;
  71
  72 typedef struct
  73 {
  74     /* conduct the analysis using this lamda and QP */
  75     int i_lambda;
  76     int i_lambda2;
  77     int i_qp;
  78     uint16_t *p_cost_mv;
  79     uint16_t *p_cost_ref0;
  80     uint16_t *p_cost_ref1;
  81     int i_mbrd;
  82
  83
  84     /* I: Intra part */
  85     /* Take some shortcuts in intra search if intra is deemed unlikely */
  86     int b_fast_intra;
  87     int b_try_pskip;
  88
  89     /* Luma part */
  90     int i_satd_i16x16;
  91     int i_satd_i16x16_dir[7];
  92     int i_predict16x16;
  93
  94     int i_satd_i8x8;
  95     int i_cbp_i8x8_luma;
  96     int i_satd_i8x8_dir[12][4];
  97     int i_predict8x8[4];
  98
  99     int i_satd_i4x4;
 100     int i_predict4x4[16];
 101
 102     int i_satd_pcm;
 103
 104     /* Chroma part */
 105     int i_satd_i8x8chroma;
 106     int i_satd_i8x8chroma_dir[4];
 107     int i_predict8x8chroma;
 108
 109     /* II: Inter part P/B frame */
 110     x264_mb_analysis_list_t l0;
 111     x264_mb_analysis_list_t l1;
 112
 113     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
 114     int i_cost16x16direct;
 115     int i_cost8x8bi;
 116     int i_cost8x8direct[4];
 117     int i_cost16x8bi;
 118     int i_cost8x16bi;
 119     int i_rd16x16bi;
 120     int i_rd16x16direct;
 121     int i_rd16x8bi;
 122     int i_rd8x16bi;
 123     int i_rd8x8bi;
 124
 125     int i_mb_partition16x8[2]; /* mb_partition_e */
 126     int i_mb_partition8x16[2];
 127     int i_mb_type16x8; /* mb_class_e */
 128     int i_mb_type8x16;
 129
 130     int b_direct_available;
 131
 132 } x264_mb_analysis_t;
 133
 134 /* lambda = pow(2,qp/6-2) */
 135 const int x264_lambda_tab[52] = {
 136    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 137    1, 1, 1, 1,              /*  8-11 */
 138    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 139    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 140    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 141   16,18,20,23,25,29,32,36,  /* 36-43 */
 142   40,45,51,57,64,72,81,91   /* 44-51 */
 143 };
 144
 145 /* lambda2 = pow(lambda,2) * .9 * 256 */
 146 const int x264_lambda2_tab[52] = {
 147     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
 148     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
 149    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
 150   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 151  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
 152 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
 153 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 154 };
 155
 156 const uint8_t x264_exp2_lut[64] = {
 157       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
 158      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
 159     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
 160     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 161 };
 162
 163 const float x264_log2_lut[128] = {
 164     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
 165     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
 166     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
 167     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
 168     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
 169     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
 170     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
 171     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
 172     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
 173     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
 174     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
 175     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
 176     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
 177     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
 178     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
 179     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 180 };
 181
 182 /* Avoid an int/float conversion. */
 183 const float x264_log2_lz_lut[32] = {
 184     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 185 };
 186
 187 // should the intra and inter lambdas be different?
 188 // I'm just matching the behaviour of deadzone quant.
 189 static const int x264_trellis_lambda2_tab[2][52] = {
 190     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 191     {    46,      58,      73,      92,     117,     147,
 192         185,     233,     294,     370,     466,     587,
 193         740,     932,    1174,    1480,    1864,    2349,
 194        2959,    3728,    4697,    5918,    7457,    9395,
 195       11837,   14914,   18790,   23674,   29828,   37581,
 196       47349,   59656,   75163,   94699,  119313,  150326,
 197      189399,  238627,  300652,  378798,  477255,  601304,
 198      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 199     3030384, 3818045, 4810435, 6060769 },
 200     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 201     {    27,      34,      43,      54,      68,      86,
 202         108,     136,     172,     216,     273,     343,
 203         433,     545,     687,     865,    1090,    1374,
 204        1731,    2180,    2747,    3461,    4361,    5494,
 205        6922,    8721,   10988,   13844,   17442,   21976,
 206       27688,   34885,   43953,   55377,   69771,   87906,
 207      110755,  139543,  175813,  221511,  279087,  351627,
 208      443023,  558174,  703255,  886046, 1116348, 1406511,
 209     1772093, 2232697, 2813022, 3544186 }
 210 };
 211
 212 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 213        16,    20,    25,    32,    40,    50,
 214        64,    80,   101,   128,   161,   203,
 215       256,   322,   406,   512,   645,   812,
 216      1024,  1290,  1625,  2048,  2580,  3250,
 217      4096,  5160,  6501,  8192, 10321, 13003,
 218     16384, 20642, 26007, 32768, 41285, 52015,
 219     65535
 220 };
 221
 222 /* TODO: calculate CABAC costs */
 223 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
 224     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 225 };
 226 static const int i_mb_b16x8_cost_table[17] = {
 227     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 228 };
 229 static const int i_sub_mb_b_cost_table[13] = {
 230     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 231 };
 232 static const int i_sub_mb_p_cost_table[4] = {
 233     5, 3, 3, 1
 234 };
 235
 236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 237
 238 static uint16_t x264_cost_ref[92][3][33];
 239 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 240
 241 int x264_analyse_init_costs( x264_t *h, int qp )
 242 {
 243     int i, j;
 244     int lambda = x264_lambda_tab[qp];
 245     if( h->cost_mv[lambda] )
 246         return 0;
 247     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 248     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 249     h->cost_mv[lambda] += 2*4*2048;
 250     for( i = 0; i <= 2*4*2048; i++ )
 251     {
 252         h->cost_mv[lambda][-i] =
 253         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 254     }
 255     x264_pthread_mutex_lock( &cost_ref_mutex );
 256     for( i = 0; i < 3; i++ )
 257         for( j = 0; j < 33; j++ )
 258             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 259     x264_pthread_mutex_unlock( &cost_ref_mutex );
 260     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 261     {
 262         for( j=0; j<4; j++ )
 263         {
 264             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 265             h->cost_mv_fpel[lambda][j] += 2*2048;
 266             for( i = -2*2048; i < 2*2048; i++ )
 267                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 268         }
 269     }
 270     return 0;
 271 fail:
 272     return -1;
 273 }
 274
 275 void x264_analyse_free_costs( x264_t *h )
 276 {
 277     int i, j;
 278     for( i = 0; i < 92; i++ )
 279     {
 280         if( h->cost_mv[i] )
 281             x264_free( h->cost_mv[i] - 2*4*2048 );
 282         if( h->cost_mv_fpel[i][0] )
 283             for( j = 0; j < 4; j++ )
 284                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 285     }
 286 }
 287
 288 /* initialize an array of lambda*nbits for all possible mvs */
 289 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 290 {
 291     a->p_cost_mv = h->cost_mv[a->i_lambda];
 292     a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 293     a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 294 }
 295
 296 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 297 {
 298     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
 299
 300     /* mbrd == 1 -> RD mode decision */
 301     /* mbrd == 2 -> RD refinement */
 302     /* mbrd == 3 -> QPRD */
 303     a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
 304
 305     /* conduct the analysis using this lamda and QP */
 306     a->i_qp = h->mb.i_qp = i_qp;
 307     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 308
 309     a->i_lambda = x264_lambda_tab[i_qp];
 310     a->i_lambda2 = x264_lambda2_tab[i_qp];
 311
 312     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 313     if( h->param.analyse.i_trellis )
 314     {
 315         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 316         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 317         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 318         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 319     }
 320     h->mb.i_psy_rd_lambda = a->i_lambda;
 321     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 322     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 323
 324     h->mb.i_me_method = h->param.analyse.i_me_method;
 325     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 326     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 327                         && h->mb.i_subpel_refine >= 5;
 328
 329     h->mb.b_transform_8x8 = 0;
 330     h->mb.b_noise_reduction = 0;
 331
 332     /* I: Intra part */
 333     a->i_satd_i16x16 =
 334     a->i_satd_i8x8   =
 335     a->i_satd_i4x4   =
 336     a->i_satd_i8x8chroma = COST_MAX;
 337
 338     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
 339     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
 340
 341     a->b_fast_intra = 0;
 342     h->mb.i_skip_intra =
 343         h->mb.b_lossless ? 0 :
 344         a->i_mbrd ? 2 :
 345         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
 346
 347     /* II: Inter part P/B frame */
 348     if( h->sh.i_type != SLICE_TYPE_I )
 349     {
 350         int i, j;
 351         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
 352         // limit motion search to a slightly smaller range than the theoretical limit,
 353         // since the search may go a few iterations past its given range
 354         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 355
 356         /* Calculate max allowed MV range */
 357 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
 358         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
 359         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
 360         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
 361         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
 362         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
 363         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
 364         if( h->mb.i_mb_x == 0)
 365         {
 366             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
 367             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
 368             int thread_mvy_range = i_fmv_range;
 369
 370             if( h->param.i_threads > 1 )
 371             {
 372                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
 373                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
 374                 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
 375                 {
 376                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
 377                     int i_ref = i ? h->i_ref1 : h->i_ref0;
 378                     for( j=0; j<i_ref; j++ )
 379                     {
 380                         x264_frame_cond_wait( fref[j]->orig, thresh );
 381                         fref[j]->i_lines_completed = fref[j]->orig->i_lines_completed;
 382                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
 383                     }
 384                 }
 385
 386                 if( h->param.b_deterministic )
 387                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
 388                 if( h->mb.b_interlaced )
 389                     thread_mvy_range >>= 1;
 390
 391                 for( j=0; j<h->i_ref0; j++ )
 392                 {
 393                     if( h->sh.weight[j][0].weightfn )
 394                     {
 395                         x264_frame_t *frame = h->fref0[j];
 396                         int width = frame->i_width[0] + 2*PADH;
 397                         int i_padv = PADV << h->param.b_interlaced;
 398                         int offset, height;
 399                         uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 400                         int k;
 401                         height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 402                         offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 403                         h->fenc->i_lines_weighted += height;
 404                         if( height )
 405                         {
 406                             for( k = j; k < h->i_ref0; k++ )
 407                                 if( h->sh.weight[k][0].weightfn )
 408                                 {
 409                                     uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 410                                     x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 411                                                              src + offset, frame->i_stride[0],
 412                                                              width, height, &h->sh.weight[k][0] );
 413                                 }
 414                         }
 415                         break;
 416                     }
 417                 }
 418             }
 419
 420             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
 421             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
 422             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
 423             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
 424             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
 425             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
 426             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
 427         }
 428 #undef CLIP_FMV
 429
 430         a->l0.me16x16.cost =
 431         a->l0.i_rd16x16    =
 432         a->l0.i_cost8x8    = COST_MAX;
 433
 434         for( i = 0; i < 4; i++ )
 435         {
 436             a->l0.i_cost4x4[i] =
 437             a->l0.i_cost8x4[i] =
 438             a->l0.i_cost4x8[i] = COST_MAX;
 439         }
 440
 441         a->l0.i_cost16x8   =
 442         a->l0.i_cost8x16   = COST_MAX;
 443         if( h->sh.i_type == SLICE_TYPE_B )
 444         {
 445             a->l1.me16x16.cost =
 446             a->l1.i_rd16x16    =
 447             a->l1.i_cost8x8    = COST_MAX;
 448
 449             for( i = 0; i < 4; i++ )
 450             {
 451                 a->l1.i_cost4x4[i] =
 452                 a->l1.i_cost8x4[i] =
 453                 a->l1.i_cost4x8[i] =
 454                 a->i_cost8x8direct[i] = COST_MAX;
 455             }
 456
 457             a->l1.i_cost16x8   =
 458             a->l1.i_cost8x16   =
 459             a->i_rd16x16bi     =
 460             a->i_rd16x16direct =
 461             a->i_rd8x8bi       =
 462             a->i_rd16x8bi      =
 463             a->i_rd8x16bi      =
 464             a->i_cost16x16bi   =
 465             a->i_cost16x16direct =
 466             a->i_cost8x8bi     =
 467             a->i_cost16x8bi    =
 468             a->i_cost8x16bi    = COST_MAX;
 469         }
 470
 471         /* Fast intra decision */
 472         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 473         {
 474             if(   IS_INTRA( h->mb.i_mb_type_left )
 475                || IS_INTRA( h->mb.i_mb_type_top )
 476                || IS_INTRA( h->mb.i_mb_type_topleft )
 477                || IS_INTRA( h->mb.i_mb_type_topright )
 478                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 479                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
 480             { /* intra is likely */ }
 481             else
 482             {
 483                 a->b_fast_intra = 1;
 484             }
 485         }
 486         h->mb.b_skip_mc = 0;
 487     }
 488 }
 489
 490
 491
 492 /*
 493  * Handle intra mb
 494  */
 495 /* Max = 4 */
 496 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 497 {
 498     int b_top = i_neighbour & MB_TOP;
 499     int b_left = i_neighbour & MB_LEFT;
 500     if( b_top && b_left )
 501     {
 502         /* top and left available */
 503         *mode++ = I_PRED_16x16_V;
 504         *mode++ = I_PRED_16x16_H;
 505         *mode++ = I_PRED_16x16_DC;
 506         *pi_count = 3;
 507         if( i_neighbour & MB_TOPLEFT )
 508         {
 509             /* top left available*/
 510             *mode++ = I_PRED_16x16_P;
 511             *pi_count = 4;
 512         }
 513     }
 514     else if( b_left )
 515     {
 516         /* left available*/
 517         *mode++ = I_PRED_16x16_DC_LEFT;
 518         *mode++ = I_PRED_16x16_H;
 519         *pi_count = 2;
 520     }
 521     else if( b_top )
 522     {
 523         /* top available*/
 524         *mode++ = I_PRED_16x16_DC_TOP;
 525         *mode++ = I_PRED_16x16_V;
 526         *pi_count = 2;
 527     }
 528     else
 529     {
 530         /* none available */
 531         *mode = I_PRED_16x16_DC_128;
 532         *pi_count = 1;
 533     }
 534 }
 535
 536 /* Max = 4 */
 537 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 538 {
 539     int b_top = i_neighbour & MB_TOP;
 540     int b_left = i_neighbour & MB_LEFT;
 541     if( b_top && b_left )
 542     {
 543         /* top and left available */
 544         *mode++ = I_PRED_CHROMA_V;
 545         *mode++ = I_PRED_CHROMA_H;
 546         *mode++ = I_PRED_CHROMA_DC;
 547         *pi_count = 3;
 548         if( i_neighbour & MB_TOPLEFT )
 549         {
 550             /* top left available */
 551             *mode++ = I_PRED_CHROMA_P;
 552             *pi_count = 4;
 553         }
 554     }
 555     else if( b_left )
 556     {
 557         /* left available*/
 558         *mode++ = I_PRED_CHROMA_DC_LEFT;
 559         *mode++ = I_PRED_CHROMA_H;
 560         *pi_count = 2;
 561     }
 562     else if( b_top )
 563     {
 564         /* top available*/
 565         *mode++ = I_PRED_CHROMA_DC_TOP;
 566         *mode++ = I_PRED_CHROMA_V;
 567         *pi_count = 2;
 568     }
 569     else
 570     {
 571         /* none available */
 572         *mode = I_PRED_CHROMA_DC_128;
 573         *pi_count = 1;
 574     }
 575 }
 576
 577 /* MAX = 9 */
 578 static void predict_4x4_mode_available( unsigned int i_neighbour,
 579                                         int *mode, int *pi_count )
 580 {
 581     int b_top = i_neighbour & MB_TOP;
 582     int b_left = i_neighbour & MB_LEFT;
 583     if( b_top && b_left )
 584     {
 585         *pi_count = 6;
 586         *mode++ = I_PRED_4x4_DC;
 587         *mode++ = I_PRED_4x4_H;
 588         *mode++ = I_PRED_4x4_V;
 589         *mode++ = I_PRED_4x4_DDL;
 590         if( i_neighbour & MB_TOPLEFT )
 591         {
 592             *mode++ = I_PRED_4x4_DDR;
 593             *mode++ = I_PRED_4x4_VR;
 594             *mode++ = I_PRED_4x4_HD;
 595             *pi_count += 3;
 596         }
 597         *mode++ = I_PRED_4x4_VL;
 598         *mode++ = I_PRED_4x4_HU;
 599     }
 600     else if( b_left )
 601     {
 602         *mode++ = I_PRED_4x4_DC_LEFT;
 603         *mode++ = I_PRED_4x4_H;
 604         *mode++ = I_PRED_4x4_HU;
 605         *pi_count = 3;
 606     }
 607     else if( b_top )
 608     {
 609         *mode++ = I_PRED_4x4_DC_TOP;
 610         *mode++ = I_PRED_4x4_V;
 611         *mode++ = I_PRED_4x4_DDL;
 612         *mode++ = I_PRED_4x4_VL;
 613         *pi_count = 4;
 614     }
 615     else
 616     {
 617         *mode++ = I_PRED_4x4_DC_128;
 618         *pi_count = 1;
 619     }
 620 }
 621
 622 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 623 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 624 {
 625     ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
 626     ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
 627     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 628     int i;
 629
 630     if( do_both_dct || h->mb.b_transform_8x8 )
 631     {
 632         h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
 633         for( i = 0; i < 4; i++ )
 634             h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
 635     }
 636     if( do_both_dct || !h->mb.b_transform_8x8 )
 637     {
 638         h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
 639         for( i = 0; i < 16; i++ )
 640             h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
 641     }
 642 }
 643
 644 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
 645 static inline void x264_mb_cache_fenc_satd( x264_t *h )
 646 {
 647     ALIGNED_16( static uint8_t zero[16] ) = {0};
 648     uint8_t *fenc;
 649     int x, y, satd_sum = 0, sa8d_sum = 0;
 650     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 651         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 652     if( !h->mb.i_psy_rd )
 653         return;
 654     for( y = 0; y < 4; y++ )
 655         for( x = 0; x < 4; x++ )
 656         {
 657             fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
 658             h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
 659                                       - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
 660             satd_sum += h->mb.pic.fenc_satd[y][x];
 661         }
 662     for( y = 0; y < 2; y++ )
 663         for( x = 0; x < 2; x++ )
 664         {
 665             fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
 666             h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
 667                                       - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
 668             sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
 669         }
 670     h->mb.pic.fenc_satd_sum = satd_sum;
 671     h->mb.pic.fenc_sa8d_sum = sa8d_sum;
 672 }
 673
 674 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 675 {
 676     int i;
 677
 678     int i_max;
 679     int predict_mode[4];
 680     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 681
 682     uint8_t *p_dstc[2], *p_srcc[2];
 683
 684     if( a->i_satd_i8x8chroma < COST_MAX )
 685         return;
 686
 687     /* 8x8 prediction selection for chroma */
 688     p_dstc[0] = h->mb.pic.p_fdec[1];
 689     p_dstc[1] = h->mb.pic.p_fdec[2];
 690     p_srcc[0] = h->mb.pic.p_fenc[1];
 691     p_srcc[1] = h->mb.pic.p_fenc[2];
 692
 693     predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
 694     a->i_satd_i8x8chroma = COST_MAX;
 695     if( i_max == 4 && b_merged_satd )
 696     {
 697         int satdu[4], satdv[4];
 698         h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
 699         h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
 700         h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
 701         h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
 702         satdu[I_PRED_CHROMA_P] =
 703             h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
 704         satdv[I_PRED_CHROMA_P] =
 705             h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
 706
 707         for( i=0; i<i_max; i++ )
 708         {
 709             int i_mode = predict_mode[i];
 710             int i_satd = satdu[i_mode] + satdv[i_mode]
 711                        + a->i_lambda * bs_size_ue(i_mode);
 712
 713             a->i_satd_i8x8chroma_dir[i] = i_satd;
 714             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 715         }
 716     }
 717     else
 718     {
 719         for( i=0; i<i_max; i++ )
 720         {
 721             int i_satd;
 722             int i_mode = predict_mode[i];
 723
 724             /* we do the prediction */
 725             if( h->mb.b_lossless )
 726                 x264_predict_lossless_8x8_chroma( h, i_mode );
 727             else
 728             {
 729                 h->predict_8x8c[i_mode]( p_dstc[0] );
 730                 h->predict_8x8c[i_mode]( p_dstc[1] );
 731             }
 732
 733             /* we calculate the cost */
 734             i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
 735                                                p_srcc[0], FENC_STRIDE ) +
 736                      h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
 737                                                p_srcc[1], FENC_STRIDE ) +
 738                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 739
 740             a->i_satd_i8x8chroma_dir[i] = i_satd;
 741             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 742         }
 743     }
 744
 745     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 746 }
 747
 /* Luma intra analysis for the current macroblock.
  *
  * Scores the available I_16x16 prediction modes and, when enabled in the
  * slice-type dependent analysis flags, the I_8x8 and I_4x4 modes.
  *   h            - encoder context; prediction is written into p_fdec[0] and
  *                  the 8x8/4x4 paths encode each block as they go.
  *   a            - analysis state; receives i_satd_i16x16, i_satd_i8x8,
  *                  i_satd_i4x4, the per-direction costs and the chosen modes
  *                  (i_predict16x16, i_predict8x8[], i_predict4x4[]).
  *   i_satd_inter - best inter cost so far, used only for early termination.
  * i_satd_i8x8 / i_satd_i4x4 are set to COST_MAX when their block scan is cut
  * short before the last block. */
 748 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 749 {
 750     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 751     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 752     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 753
 754     int i, idx;
 755     int i_max;
 756     int predict_mode[9];
         /* the merged "x3" kernels are only used when present and not in lossless mode */
 757     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 758
 759     /*---------------- Try all mode and calculate their score ---------------*/
 760
 761     /* 16x16 prediction selection */
 762     predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
 763
 764     if( b_merged_satd && i_max == 4 )
 765     {
             /* All four modes available: one x3 call writes into i_satd_i16x16_dir
              * (presumably the three non-plane modes -- verify against the kernel),
              * then the plane mode is predicted and scored on its own. */
 766         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 767         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 768         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 769             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 770         for( i=0; i<4; i++ )
 771         {
                 /* add the mode signalling cost and keep the cheapest mode */
 772             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 773             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 774         }
 775     }
 776     else
 777     {
             /* generic path: predict and score each available mode individually */
 778         for( i = 0; i < i_max; i++ )
 779         {
 780             int i_satd;
 781             int i_mode = predict_mode[i];
 782
 783             if( h->mb.b_lossless )
 784                 x264_predict_lossless_16x16( h, i_mode );
 785             else
 786                 h->predict_16x16[i_mode]( p_dst );
 787
 788             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 789                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 790             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 791             a->i_satd_i16x16_dir[i_mode] = i_satd;
 792         }
 793     }
 794
 795     if( h->sh.i_type == SLICE_TYPE_B )
 796         /* cavlc mb type prefix */
 797         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
         /* fast intra: skip the finer partitions when I_16x16 already costs more
          * than twice the inter cost */
 798     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 799         return;
 800
 801     /* 8x8 prediction selection */
 802     if( flags & X264_ANALYSE_I8x8 )
 803     {
 804         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
             /* if mbcmp is SATD use the 8x8 SA8D kernel, otherwise the configured 8x8 mbcmp */
 805         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
             /* with RD enabled (i_mbrd) the threshold is COST_MAX, i.e. no early exit */
 806         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 807         int i_cost = 0;
 808         h->mb.i_cbp_luma = 0;
 809         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 810
 811         // FIXME some bias like in i4x4?
 812         if( h->sh.i_type == SLICE_TYPE_B )
 813             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 814
             /* idx walks the four 8x8 blocks (x = idx&1, y = idx>>1); the loop
              * only terminates through the break below */
 815         for( idx = 0;; idx++ )
 816         {
 817             int x = idx&1;
 818             int y = idx>>1;
 819             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 820             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 821             int i_best = COST_MAX;
 822             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 823
 824             predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
 825             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 826
 827             if( b_merged_satd && i_max == 9 )
 828             {
 829                 int satd[9];
 830                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                     /* the predicted mode is 3*lambda cheaper than the 4*lambda added below */
 831                 satd[i_pred_mode] -= 3 * a->i_lambda;
 832                 for( i=2; i>=0; i-- )
 833                 {
 834                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 835                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 836                 }
                     /* modes 0..2 are scored; the generic loop resumes at predict_mode[3]
                      * (assumes predict_mode[0..2] are modes 0..2 when i_max == 9 -- verify) */
 837                 i = 3;
 838             }
 839             else
 840                 i = 0;
 841
 842             for( ; i<i_max; i++ )
 843             {
 844                 int i_satd;
 845                 int i_mode = predict_mode[i];
 846
 847                 if( h->mb.b_lossless )
 848                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 849                 else
 850                     h->predict_8x8[i_mode]( p_dst_by, edge );
 851
 852                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
 853                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 854                     i_satd -= a->i_lambda * 3;
 855
 856                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 857                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 858             }
 859             i_cost += i_best;
 860
                 /* stop at the last block, or as soon as the running cost exceeds the threshold */
 861             if( idx == 3 || i_cost > i_satd_thresh )
 862                 break;
 863
 864             /* we need to encode this block now (for next ones) */
 865             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 866             x264_mb_encode_i8x8( h, idx, a->i_qp );
 867
 868             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 869         }
 870
 871         if( idx == 3 )
 872         {
                 /* the scan reached the last block: i_cost covers the whole macroblock */
 873             a->i_satd_i8x8 = i_cost;
 874             if( h->mb.i_skip_intra )
 875             {
                     /* save the decoded pixels, nnz counts and luma cbp of the I_8x8 trial
                      * (and the DCT coefficients when i_skip_intra == 2); presumably so a
                      * later stage can reuse them instead of re-encoding -- verify */
 876                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 877                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 878                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 879                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 880                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 881                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 882                 if( h->mb.i_skip_intra == 2 )
 883                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 884             }
 885         }
 886         else
 887         {
                 /* scan cut short after idx+1 blocks: scale the partial cost to a
                  * whole-macroblock estimate (x4, x2, ~x4/3 in 8.8 fixed point) */
 888             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 889             a->i_satd_i8x8 = COST_MAX;
 890             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 891         }
             /* skip I_4x4 when both the I_8x8 estimate and I_16x16 exceed 5/4 of the
              * inter cost (6/4 when RD is enabled) */
 892         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 893             return;
 894     }
 895
 896     /* 4x4 prediction selection */
 897     if( flags & X264_ANALYSE_I4x4 )
 898     {
 899         int i_cost;
 900         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 901         h->mb.i_cbp_luma = 0;
 902         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
             /* with RD enabled the threshold is scaled by (10 - b_fast_intra)/8 */
 903         if( a->i_mbrd )
 904             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 905
 906         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 907         if( h->sh.i_type == SLICE_TYPE_B )
 908             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 909
             /* idx walks the sixteen 4x4 blocks; exits only through the break below */
 910         for( idx = 0;; idx++ )
 911         {
 912             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 913             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 914             int i_best = COST_MAX;
 915             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 916
 917             predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
 918
 919             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 920                 /* emulate missing topright samples */
 921                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 922
                 /* the merged kernel is used only when at least six modes are available */
 923             if( b_merged_satd && i_max >= 6 )
 924             {
 925                 int satd[9];
 926                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 927                 satd[i_pred_mode] -= 3 * a->i_lambda;
 928                 for( i=2; i>=0; i-- )
 929                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 930                 i = 3;
 931             }
 932             else
 933                 i = 0;
 934
 935             for( ; i<i_max; i++ )
 936             {
 937                 int i_satd;
 938                 int i_mode = predict_mode[i];
 939                 if( h->mb.b_lossless )
 940                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 941                 else
 942                     h->predict_4x4[i_mode]( p_dst_by );
 943
 944                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 945                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 946                     i_satd -= a->i_lambda * 3;
 947
 948                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 949             }
                 /* unlike the 8x8 path, the 4*lambda mode cost is added once per block here */
 950             i_cost += i_best + 4 * a->i_lambda;
 951
 952             if( i_cost > i_satd_thresh || idx == 15 )
 953                 break;
 954
 955             /* we need to encode this block now (for next ones) */
 956             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 957             x264_mb_encode_i4x4( h, idx, a->i_qp );
 958
 959             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 960         }
 961         if( idx == 15 )
 962         {
 963             a->i_satd_i4x4 = i_cost;
 964             if( h->mb.i_skip_intra )
 965             {
                     /* same bookkeeping as the I_8x8 path, into the i4x4_* buffers */
 966                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 967                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 968                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 969                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 970                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 971                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 972                 if( h->mb.i_skip_intra == 2 )
 973                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 974             }
 975         }
 976         else
 977             a->i_satd_i4x4 = COST_MAX;
 978     }
 979 }
 980
 981 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 982 {
 983     if( a->i_satd_i16x16 <= i_satd_thresh )
 984     {
 985         h->mb.i_type = I_16x16;
 986         x264_analyse_update_cache( h, a );
 987         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 988     }
 989     else
 990         a->i_satd_i16x16 = COST_MAX;
 991
 992     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 993     {
 994         h->mb.i_type = I_4x4;
 995         x264_analyse_update_cache( h, a );
 996         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 997     }
 998     else
 999         a->i_satd_i4x4 = COST_MAX;
1000
1001     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
1002     {
1003         h->mb.i_type = I_8x8;
1004         x264_analyse_update_cache( h, a );
1005         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1006         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1007     }
1008     else
1009         a->i_satd_i8x8 = COST_MAX;
1010 }
1011
/* RD refinement of the intra prediction modes for the macroblock type already
 * stored in h->mb.i_type.
 *
 * Re-evaluates candidate modes with the RD cost functions instead of SATD:
 * the 16x16 luma mode (I_16x16 only), the chroma mode (all types), and the
 * per-block modes of I_4x4 or I_8x8.  Winners are written to
 * a->i_predict16x16 / i_predict8x8chroma / i_predict4x4[] / i_predict8x8[]
 * and to the macroblock cache.  Also clears h->mb.i_skip_intra. */
1012 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1013 {
1014     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
1015
1016     int i, j, idx, x, y;
1017     int i_max, i_mode, i_thresh;
1018     uint64_t i_satd, i_best;
1019     int predict_mode[9];
1020     h->mb.i_skip_intra = 0;
1021
1022     if( h->mb.i_type == I_16x16 )
1023     {
1024         int old_pred_mode = a->i_predict16x16;
             /* only re-test modes whose stored cost is within 9/8 of the current mode's */
1025         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
1026         i_best = a->i_satd_i16x16;
1027         predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
1028         for( i = 0; i < i_max; i++ )
1029         {
                 /* note: shadows the function-scope i_mode */
1030             int i_mode = predict_mode[i];
1031             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1032                 continue;
1033             h->mb.i_intra16x16_pred_mode = i_mode;
1034             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1035             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1036         }
1037     }
1038
1039     /* RD selection for chroma prediction */
1040     predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
1041     if( i_max > 1 )
1042     {
1043         i_thresh = a->i_satd_i8x8chroma * 5/4;
1044
             /* compact predict_mode[] in place: keep modes whose stored cost is under
              * 5/4 of the best and that differ from the currently selected mode.
              * i_satd_i8x8chroma_dir is indexed by list position i here, not by mode
              * -- NOTE(review): confirm this matches how the chroma analysis fills it */
1045         for( i = j = 0; i < i_max; i++ )
1046             if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
1047                 predict_mode[i] != a->i_predict8x8chroma )
1048             {
1049                 predict_mode[j++] = predict_mode[i];
1050             }
1051         i_max = j;
1052
1053         if( i_max > 0 )
1054         {
1055             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1056             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1057             /* the previous thing encoded was x264_intra_rd(), so the pixels and
1058              * coefs for the current chroma mode are still around, so we only
1059              * have to recount the bits. */
1060             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1061             for( i = 0; i < i_max; i++ )
1062             {
1063                 i_mode = predict_mode[i];
1064                 if( h->mb.b_lossless )
1065                     x264_predict_lossless_8x8_chroma( h, i_mode );
1066                 else
1067                 {
1068                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1069                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1070                 }
1071                 /* if we've already found a mode that needs no residual, then
1072                  * probably any mode with a residual will be worse.
1073                  * so avoid dct on the remaining modes to improve speed. */
1074                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                     /* on improvement also remember the chroma cbp produced by this mode */
1075                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1076             }
1077             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1078             h->mb.i_cbp_chroma = i_cbp_chroma_best;
1079         }
1080     }
1081
1082     if( h->mb.i_type == I_4x4 )
1083     {
             /* pels[] / i_nnz hold the four pixel rows and the nnz count left behind
              * by the best mode so far; they are written back after the mode loop
              * because later trials reuse the same destination block */
1084         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1085         int i_nnz = 0;
1086         for( idx = 0; idx < 16; idx++ )
1087         {
1088             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1089             i_best = COST_MAX64;
1090
1091             predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1092
1093             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1094                 /* emulate missing topright samples */
1095                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1096
                 /* every available mode is tried; no SATD pre-filter in the 4x4 case */
1097             for( i = 0; i < i_max; i++ )
1098             {
1099                 i_mode = predict_mode[i];
1100                 if( h->mb.b_lossless )
1101                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1102                 else
1103                     h->predict_4x4[i_mode]( p_dst_by );
1104                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1105
1106                 if( i_best > i_satd )
1107                 {
1108                     a->i_predict4x4[idx] = i_mode;
1109                     i_best = i_satd;
1110                     pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1111                     pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1112                     pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1113                     pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1114                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1115                 }
1116             }
1117
                 /* restore the state produced by the winning mode */
1118             M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1119             M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1120             M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1121             M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1122             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1123
1124             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1125         }
1126     }
1127     else if( h->mb.i_type == I_8x8 )
1128     {
1129         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1130         for( idx = 0; idx < 4; idx++ )
1131         {
                 /* Only the bottom row (pels_h) and, for the left-hand blocks, the first
                  * seven pixels of the right column (pels_v) of the best trial are kept;
                  * presumably these are the only pixels later blocks predict from. */
1132             uint64_t pels_h = 0;
1133             uint8_t pels_v[7];
1134             uint16_t i_nnz[2] = {0}; //shut up gcc
1135             uint8_t *p_dst_by;
                 /* note: shadows the function-scope j */
1136             int j;
1137             int cbp_luma_new = 0;
                 /* only modes whose stored cost is within 11/8 of the current mode's are tried */
1138             i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1139
1140             i_best = COST_MAX64;
1141             x = idx&1;
1142             y = idx>>1;
1143
1144             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1145             predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1146             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1147
1148             for( i = 0; i < i_max; i++ )
1149             {
1150                 i_mode = predict_mode[i];
1151                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1152                     continue;
1153                 if( h->mb.b_lossless )
1154                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1155                 else
1156                     h->predict_8x8[i_mode]( p_dst_by, edge );
                     /* every trial starts from the same luma cbp */
1157                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1158                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1159
1160                 if( i_best > i_satd )
1161                 {
1162                     a->i_predict8x8[idx] = i_mode;
1163                     cbp_luma_new = h->mb.i_cbp_luma;
1164                     i_best = i_satd;
1165
1166                     pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1167                     if( !(idx&1) )
1168                         for( j=0; j<7; j++ )
1169                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1170                     i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1171                     i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1172                 }
1173             }
                 /* commit the cbp, edge pixels and nnz counts of the winning mode */
1174             a->i_cbp_i8x8_luma = cbp_luma_new;
1175             M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1176             if( !(idx&1) )
1177                 for( j=0; j<7; j++ )
1178                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1179             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1180             M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1181
1182             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1183         }
1184     }
1185 }
1186
/* Motion-estimation setup helpers.
 *
 * Each macro expands to a plain sequence of statements (no do{}while(0)
 * wrapper), so it must be used as a full statement inside a braced block and
 * never as the sole body of an unbraced if/else/for.  All of them read the
 * enclosing function's `h`; LOAD_FENC and REF_COST also read `a`. */

/* Point m at the source planes at luma offset (xoff,yoff); the chroma
 * offsets are the luma offsets halved.  Also copies the mv cost table and
 * the reference strides into m. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

/* Point m at the six reference planes of src (planes 0..3 use the luma
 * stride, planes 4..5 the chroma stride and halved offsets) and at the
 * integral image of (list, ref).  Resets the weight to weight_none and
 * p_fref_w to plane 0; requires (m)->i_stride[] to be set already. */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[(list)][(ref)][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \
    (m)->i_ref = (ref);

/* Override p_fref_w with the plane `src` and select the slice weights of
 * reference `ref`.  The weights are taken from the `ref` argument rather than
 * from whatever variable named i_ref happens to be in the caller's scope.
 * `list` is accepted for symmetry with LOAD_HPELS and is not used. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[(ref)];

/* Cost of signalling reference `ref` in list `list` (list must be a literal
 * 0 or 1, it is pasted into the field name). */
#define REF_COST(list, ref) \
    (a->p_cost_ref##list[(ref)])
1212
1213 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1214 {
1215     x264_me_t m;
1216     int i_ref, i_mvc;
1217     ALIGNED_4( int16_t mvc[8][2] );
1218     int i_halfpel_thresh = INT_MAX;
1219     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1220
1221     /* 16x16 Search on all ref frame */
1222     m.i_pixel = PIXEL_16x16;
1223     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1224
1225     a->l0.me16x16.cost = INT_MAX;
1226     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1227     {
1228         const int i_ref_cost = REF_COST( 0, i_ref );
1229         i_halfpel_thresh -= i_ref_cost;
1230         m.i_ref_cost = i_ref_cost;
1231
1232         /* search with ref */
1233         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1234         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1235
1236         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1237         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1238         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1239
1240         /* early termination
1241          * SSD threshold would probably be better than SATD */
1242         if( i_ref == 0
1243             && a->b_try_pskip
1244             && m.cost-m.cost_mv < 300*a->i_lambda
1245             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1246               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1247             && x264_macroblock_probe_pskip( h ) )
1248         {
1249             h->mb.i_type = P_SKIP;
1250             x264_analyse_update_cache( h, a );
1251             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1252             return;
1253         }
1254
1255         m.cost += i_ref_cost;
1256         i_halfpel_thresh += i_ref_cost;
1257
1258         if( m.cost < a->l0.me16x16.cost )
1259             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1260
1261         /* save mv for predicting neighbors */
1262         CP32( a->l0.mvc[i_ref][0], m.mv );
1263         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1264     }
1265
1266     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1267     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1268
1269     h->mb.i_type = P_L0;
1270     if( a->i_mbrd )
1271     {
1272         x264_mb_cache_fenc_satd( h );
1273         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) )
1274         {
1275             h->mb.i_partition = D_16x16;
1276             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1277             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1278             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1279                 h->mb.i_type = P_SKIP;
1280         }
1281     }
1282 }
1283
1284 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1285 {
1286     x264_me_t m;
1287     int i_ref;
1288     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1289     int i_halfpel_thresh = INT_MAX;
1290     int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1291     int i;
1292     int i_maxref = h->mb.pic.i_fref[0]-1;
1293
1294     h->mb.i_partition = D_8x8;
1295
1296     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1297      * than those used by the neighbors */
1298     if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1299         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1300     {
1301         i_maxref = 0;
1302         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1303         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1304         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1305         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1306         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1307         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
1308     }
1309
1310     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1311         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1312
1313     for( i = 0; i < 4; i++ )
1314     {
1315         x264_me_t *l0m = &a->l0.me8x8[i];
1316         const int x8 = i%2;
1317         const int y8 = i/2;
1318
1319         m.i_pixel = PIXEL_8x8;
1320
1321         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1322         l0m->cost = INT_MAX;
1323         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1324         {
1325             const int i_ref_cost = REF_COST( 0, i_ref );
1326             i_halfpel_thresh -= i_ref_cost;
1327             m.i_ref_cost = i_ref_cost;
1328
1329             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1330             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1331
1332             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1333             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1334             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1335
1336             m.cost += i_ref_cost;
1337             i_halfpel_thresh += i_ref_cost;
1338             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1339
1340             if( m.cost < l0m->cost )
1341                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1342         }
1343         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1344         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1345
1346         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1347            are effectively zero. */
1348         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1349             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1350     }
1351
1352     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1353                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1354     /* P_8x8 ref0 has no ref cost */
1355     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1356                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1357         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1358     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1359     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1360 }
1361
1362 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1363 {
1364     const int i_ref = a->l0.me16x16.i_ref;
1365     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1366     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1367     int i_mvc;
1368     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1369     int i;
1370
1371     /* XXX Needed for x264_mb_predict_mv */
1372     h->mb.i_partition = D_8x8;
1373
1374     i_mvc = 1;
1375     CP32( mvc[0], a->l0.me16x16.mv );
1376
1377     for( i = 0; i < 4; i++ )
1378     {
1379         x264_me_t *m = &a->l0.me8x8[i];
1380         const int x8 = i%2;
1381         const int y8 = i/2;
1382
1383         m->i_pixel = PIXEL_8x8;
1384         m->i_ref_cost = i_ref_cost;
1385
1386         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1387         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1388         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1389
1390         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1391         x264_me_search( h, m, mvc, i_mvc );
1392
1393         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1394
1395         CP32( mvc[i_mvc], m->mv );
1396         i_mvc++;
1397
1398         /* mb type cost */
1399         m->cost += i_ref_cost;
1400         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1401             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1402     }
1403
1404     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1405                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1406     /* theoretically this should include 4*ref_cost,
1407      * but 3 seems a better approximation of cabac. */
1408     if( h->param.b_cabac )
1409         a->l0.i_cost8x8 -= i_ref_cost;
1410     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1411     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1412 }
1413
1414 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1415 {
1416     x264_me_t m;
1417     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1418     ALIGNED_4( int16_t mvc[3][2] );
1419     int i, j;
1420
1421     /* XXX Needed for x264_mb_predict_mv */
1422     h->mb.i_partition = D_16x8;
1423
1424     for( i = 0; i < 2; i++ )
1425     {
1426         x264_me_t *l0m = &a->l0.me16x8[i];
1427         const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1428         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1429
1430         m.i_pixel = PIXEL_16x8;
1431
1432         LOAD_FENC( &m, p_fenc, 0, 8*i );
1433         l0m->cost = INT_MAX;
1434         for( j = 0; j < i_ref8s; j++ )
1435         {
1436             const int i_ref = ref8[j];
1437             const int i_ref_cost = REF_COST( 0, i_ref );
1438             m.i_ref_cost = i_ref_cost;
1439
1440             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1441             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1442             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1443             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1444
1445             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1446             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1447
1448             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1449             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1450             x264_me_search( h, &m, mvc, 3 );
1451
1452             m.cost += i_ref_cost;
1453
1454             if( m.cost < l0m->cost )
1455                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1456         }
1457         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1458         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1459     }
1460
1461     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1462 }
1463
1464 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1465 {
1466     x264_me_t m;
1467     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1468     ALIGNED_4( int16_t mvc[3][2] );
1469     int i, j;
1470
1471     /* XXX Needed for x264_mb_predict_mv */
1472     h->mb.i_partition = D_8x16;
1473
1474     for( i = 0; i < 2; i++ )
1475     {
1476         x264_me_t *l0m = &a->l0.me8x16[i];
1477         const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1478         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1479
1480         m.i_pixel = PIXEL_8x16;
1481
1482         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1483         l0m->cost = INT_MAX;
1484         for( j = 0; j < i_ref8s; j++ )
1485         {
1486             const int i_ref = ref8[j];
1487             const int i_ref_cost = REF_COST( 0, i_ref );
1488             m.i_ref_cost = i_ref_cost;
1489
1490             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1491             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1492             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1493
1494             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1495             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1496
1497             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1498             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1499             x264_me_search( h, &m, mvc, 3 );
1500
1501             m.cost += i_ref_cost;
1502
1503             if( m.cost < l0m->cost )
1504                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1505         }
1506         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1507         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1508     }
1509
1510     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1511 }
1512
1513 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1514 {
1515     ALIGNED_8( uint8_t pix1[16*8] );
1516     uint8_t *pix2 = pix1+8;
1517     const int i_stride = h->mb.pic.i_stride[1];
1518     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1519     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1520     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1521     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1522     x264_weight_t *weight = h->sh.weight[i_ref];
1523
1524 #define CHROMA4x4MC( width, height, me, x, y ) \
1525     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1526     if( weight[1].weightfn ) \
1527         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1528     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1529     if( weight[2].weightfn ) \
1530         weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1531
1532
1533     if( pixel == PIXEL_4x4 )
1534     {
1535         x264_me_t *m = a->l0.me4x4[i8x8];
1536         CHROMA4x4MC( 2,2, m[0], 0,0 );
1537         CHROMA4x4MC( 2,2, m[1], 2,0 );
1538         CHROMA4x4MC( 2,2, m[2], 0,2 );
1539         CHROMA4x4MC( 2,2, m[3], 2,2 );
1540     }
1541     else if( pixel == PIXEL_8x4 )
1542     {
1543         x264_me_t *m = a->l0.me8x4[i8x8];
1544         CHROMA4x4MC( 4,2, m[0], 0,0 );
1545         CHROMA4x4MC( 4,2, m[1], 0,2 );
1546     }
1547     else
1548     {
1549         x264_me_t *m = a->l0.me4x8[i8x8];
1550         CHROMA4x4MC( 2,4, m[0], 0,0 );
1551         CHROMA4x4MC( 2,4, m[1], 2,0 );
1552     }
1553
1554     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1555          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1556 }
1557
1558 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1559 {
1560     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1561     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1562     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1563     int i4x4;
1564
1565     /* XXX Needed for x264_mb_predict_mv */
1566     h->mb.i_partition = D_8x8;
1567
1568     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1569     {
1570         const int idx = 4*i8x8 + i4x4;
1571         const int x4 = block_idx_x[idx];
1572         const int y4 = block_idx_y[idx];
1573         const int i_mvc = (i4x4 == 0);
1574
1575         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1576
1577         m->i_pixel = PIXEL_4x4;
1578
1579         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1580         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1581         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1582
1583         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1584         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1585
1586         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1587     }
1588     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1589                             a->l0.me4x4[i8x8][1].cost +
1590                             a->l0.me4x4[i8x8][2].cost +
1591                             a->l0.me4x4[i8x8][3].cost +
1592                             REF_COST( 0, i_ref ) +
1593                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1594     if( h->mb.b_chroma_me )
1595         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1596 }
1597
1598 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1599 {
1600     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1601     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1602     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1603     int i8x4;
1604
1605     /* XXX Needed for x264_mb_predict_mv */
1606     h->mb.i_partition = D_8x8;
1607
1608     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1609     {
1610         const int idx = 4*i8x8 + 2*i8x4;
1611         const int x4 = block_idx_x[idx];
1612         const int y4 = block_idx_y[idx];
1613         const int i_mvc = (i8x4 == 0);
1614
1615         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1616
1617         m->i_pixel = PIXEL_8x4;
1618
1619         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1620         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1621         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1622
1623         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1624         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1625
1626         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1627     }
1628     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1629                             REF_COST( 0, i_ref ) +
1630                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1631     if( h->mb.b_chroma_me )
1632         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1633 }
1634
1635 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1636 {
1637     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1638     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1639     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1640     int i4x8;
1641
1642     /* XXX Needed for x264_mb_predict_mv */
1643     h->mb.i_partition = D_8x8;
1644
1645     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1646     {
1647         const int idx = 4*i8x8 + i4x8;
1648         const int x4 = block_idx_x[idx];
1649         const int y4 = block_idx_y[idx];
1650         const int i_mvc = (i4x8 == 0);
1651
1652         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1653
1654         m->i_pixel = PIXEL_4x8;
1655
1656         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1657         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1658         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1659
1660         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1661         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1662
1663         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1664     }
1665     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1666                             REF_COST( 0, i_ref ) +
1667                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1668     if( h->mb.b_chroma_me )
1669         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1670 }
1671
1672 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1673 {
1674     /* Assumes that fdec still contains the results of
1675      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1676
1677     uint8_t **p_fenc = h->mb.pic.p_fenc;
1678     uint8_t **p_fdec = h->mb.pic.p_fdec;
1679     int i;
1680
1681     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1682     for( i = 0; i < 4; i++ )
1683     {
1684         const int x = (i&1)*8;
1685         const int y = (i>>1)*8;
1686         a->i_cost16x16direct +=
1687         a->i_cost8x8direct[i] =
1688             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1689
1690         /* mb type cost */
1691         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1692     }
1693 }
1694
1695 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1696 {
1697     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1698     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1699     uint8_t *src0, *src1;
1700     int stride0 = 16, stride1 = 16;
1701
1702     x264_me_t m;
1703     int i_ref, i_mvc;
1704     ALIGNED_4( int16_t mvc[9][2] );
1705     int i_halfpel_thresh = INT_MAX;
1706     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1707
1708     /* 16x16 Search on all ref frame */
1709     m.i_pixel = PIXEL_16x16;
1710     m.weight = weight_none;
1711
1712     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1713
1714     /* ME for List 0 */
1715     a->l0.me16x16.cost = INT_MAX;
1716     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1717     {
1718         /* search with ref */
1719         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1720         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1721         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1722         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1723
1724         /* add ref cost */
1725         m.cost += REF_COST( 0, i_ref );
1726
1727         if( m.cost < a->l0.me16x16.cost )
1728         {
1729             a->l0.i_ref = i_ref;
1730             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1731         }
1732
1733         /* save mv for predicting neighbors */
1734         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1735     }
1736     a->l0.me16x16.i_ref = a->l0.i_ref;
1737
1738     /* subtract ref cost, so we don't have to add it for the other MB types */
1739     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
1740
1741     /* ME for list 1 */
1742     i_halfpel_thresh = INT_MAX;
1743     p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1744     a->l1.me16x16.cost = INT_MAX;
1745     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1746     {
1747         /* search with ref */
1748         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1749         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1750         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1751         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1752
1753         /* add ref cost */
1754         m.cost += REF_COST( 1, i_ref );
1755
1756         if( m.cost < a->l1.me16x16.cost )
1757         {
1758             a->l1.i_ref = i_ref;
1759             h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1760         }
1761
1762         /* save mv for predicting neighbors */
1763         CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1764     }
1765     a->l1.me16x16.i_ref = a->l1.i_ref;
1766
1767     /* subtract ref cost, so we don't have to add it for the other MB types */
1768     a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1769
1770     /* Set global ref, needed for other modes? */
1771     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1772     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1773
1774     /* get cost of BI mode */
1775     src0 = h->mc.get_ref( pix0, &stride0,
1776                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1777                           a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
1778     src1 = h->mc.get_ref( pix1, &stride1,
1779                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1780                           a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
1781
1782     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1783
1784     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1785                      + REF_COST( 0, a->l0.i_ref )
1786                      + REF_COST( 1, a->l1.i_ref )
1787                      + a->l0.me16x16.cost_mv
1788                      + a->l1.me16x16.cost_mv;
1789
1790     /* mb type cost */
1791     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1792     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1793     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1794 }
1795
1796 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1797 {
1798     const int x = 2*(i%2);
1799     const int y = 2*(i/2);
1800
1801     switch( h->mb.i_sub_partition[i] )
1802     {
1803         case D_L0_8x8:
1804             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1805             break;
1806         case D_L0_8x4:
1807             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1808             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1809             break;
1810         case D_L0_4x8:
1811             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1812             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1813             break;
1814         case D_L0_4x4:
1815             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1816             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1817             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1818             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1819             break;
1820         default:
1821             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1822             break;
1823     }
1824 }
1825
1826 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1827     if( x264_mb_partition_listX_table[0][part] ) \
1828     { \
1829         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1830         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1831     } \
1832     else \
1833     { \
1834         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1835         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
1836         if( b_mvd ) \
1837             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1838     } \
1839     if( x264_mb_partition_listX_table[1][part] ) \
1840     { \
1841         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1842         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1843     } \
1844     else \
1845     { \
1846         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1847         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
1848         if( b_mvd ) \
1849             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1850     }
1851
1852 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1853 {
1854     int x = (i%2)*2;
1855     int y = (i/2)*2;
1856     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1857     {
1858         x264_mb_load_mv_direct8x8( h, i );
1859         if( b_mvd )
1860         {
1861             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1862             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1863             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1864         }
1865     }
1866     else
1867     {
1868         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1869     }
1870 }
1871 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1872 {
1873     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1874 }
1875 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1876 {
1877     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1878 }
1879 #undef CACHE_MV_BI
1880
1881 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1882 {
1883     uint8_t **p_fref[2] =
1884         { h->mb.pic.p_fref[0][a->l0.i_ref],
1885           h->mb.pic.p_fref[1][a->l1.i_ref] };
1886     ALIGNED_8( uint8_t pix[2][8*8] );
1887     int i, l;
1888
1889     /* XXX Needed for x264_mb_predict_mv */
1890     h->mb.i_partition = D_8x8;
1891
1892     a->i_cost8x8bi = 0;
1893
1894     for( i = 0; i < 4; i++ )
1895     {
1896         const int x8 = i%2;
1897         const int y8 = i/2;
1898         int i_part_cost;
1899         int i_part_cost_bi = 0;
1900         int stride[2] = {8,8};
1901         uint8_t *src[2];
1902
1903         for( l = 0; l < 2; l++ )
1904         {
1905             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1906             x264_me_t *m = &lX->me8x8[i];
1907
1908             m->i_pixel = PIXEL_8x8;
1909
1910             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1911             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1912
1913             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1914             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1915
1916             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1917
1918             /* BI mode */
1919             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1920                                     m->mv[0], m->mv[1], 8, 8, weight_none );
1921             i_part_cost_bi += m->cost_mv;
1922             /* FIXME: ref cost */
1923         }
1924         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1925         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1926                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1927         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1928         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1929
1930         i_part_cost = a->l0.me8x8[i].cost;
1931         h->mb.i_sub_partition[i] = D_L0_8x8;
1932         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1933         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1934         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1935         a->i_cost8x8bi += i_part_cost;
1936
1937         /* XXX Needed for x264_mb_predict_mv */
1938         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1939     }
1940
1941     /* mb type cost */
1942     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1943 }
1944
1945 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1946 {
1947     uint8_t **p_fref[2] =
1948         { h->mb.pic.p_fref[0][a->l0.i_ref],
1949           h->mb.pic.p_fref[1][a->l1.i_ref] };
1950     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1951     ALIGNED_4( int16_t mvc[2][2] );
1952     int i, l;
1953
1954     h->mb.i_partition = D_16x8;
1955     a->i_cost16x8bi = 0;
1956
1957     for( i = 0; i < 2; i++ )
1958     {
1959         int i_part_cost;
1960         int i_part_cost_bi = 0;
1961         int stride[2] = {16,16};
1962         uint8_t *src[2];
1963
1964         /* TODO: check only the list(s) that were used in b8x8? */
1965         for( l = 0; l < 2; l++ )
1966         {
1967             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1968             x264_me_t *m = &lX->me16x8[i];
1969
1970             m->i_pixel = PIXEL_16x8;
1971
1972             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1973             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1974
1975             CP32( mvc[0], lX->me8x8[2*i].mv );
1976             CP32( mvc[1], lX->me8x8[2*i+1].mv );
1977
1978             x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
1979             x264_me_search( h, m, mvc, 2 );
1980
1981             /* BI mode */
1982             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1983                                     m->mv[0], m->mv[1], 16, 8, weight_none );
1984             /* FIXME: ref cost */
1985             i_part_cost_bi += m->cost_mv;
1986         }
1987         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1988         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1989
1990         i_part_cost = a->l0.me16x8[i].cost;
1991         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1992         if( a->l1.me16x8[i].cost < i_part_cost )
1993         {
1994             i_part_cost = a->l1.me16x8[i].cost;
1995             a->i_mb_partition16x8[i] = D_L1_8x8;
1996         }
1997         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1998         {
1999             i_part_cost = i_part_cost_bi;
2000             a->i_mb_partition16x8[i] = D_BI_8x8;
2001         }
2002         a->i_cost16x8bi += i_part_cost;
2003
2004         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2005     }
2006
2007     /* mb type cost */
2008     a->i_mb_type16x8 = B_L0_L0
2009         + (a->i_mb_partition16x8[0]>>2) * 3
2010         + (a->i_mb_partition16x8[1]>>2);
2011     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2012 }
2013
2014 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2015 {
2016     uint8_t **p_fref[2] =
2017         { h->mb.pic.p_fref[0][a->l0.i_ref],
2018           h->mb.pic.p_fref[1][a->l1.i_ref] };
2019     ALIGNED_8( uint8_t pix[2][8*16] );
2020     ALIGNED_4( int16_t mvc[2][2] );
2021     int i, l;
2022
2023     h->mb.i_partition = D_8x16;
2024     a->i_cost8x16bi = 0;
2025
2026     for( i = 0; i < 2; i++ )
2027     {
2028         int i_part_cost;
2029         int i_part_cost_bi = 0;
2030         int stride[2] = {8,8};
2031         uint8_t *src[2];
2032
2033         for( l = 0; l < 2; l++ )
2034         {
2035             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2036             x264_me_t *m = &lX->me8x16[i];
2037
2038             m->i_pixel = PIXEL_8x16;
2039
2040             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
2041             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
2042
2043             CP32( mvc[0], lX->me8x8[i].mv );
2044             CP32( mvc[1], lX->me8x8[i+2].mv );
2045
2046             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2047             x264_me_search( h, m, mvc, 2 );
2048
2049             /* BI mode */
2050             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
2051                                     m->mv[0], m->mv[1], 8, 16, weight_none );
2052             /* FIXME: ref cost */
2053             i_part_cost_bi += m->cost_mv;
2054         }
2055
2056         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2057         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2058
2059         i_part_cost = a->l0.me8x16[i].cost;
2060         a->i_mb_partition8x16[i] = D_L0_8x8;
2061         if( a->l1.me8x16[i].cost < i_part_cost )
2062         {
2063             i_part_cost = a->l1.me8x16[i].cost;
2064             a->i_mb_partition8x16[i] = D_L1_8x8;
2065         }
2066         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2067         {
2068             i_part_cost = i_part_cost_bi;
2069             a->i_mb_partition8x16[i] = D_BI_8x8;
2070         }
2071         a->i_cost8x16bi += i_part_cost;
2072
2073         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2074     }
2075
2076     /* mb type cost */
2077     a->i_mb_type8x16 = B_L0_L0
2078         + (a->i_mb_partition8x16[0]>>2) * 3
2079         + (a->i_mb_partition8x16[1]>>2);
2080     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2081 }
2082
2083 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2084 {
2085     int thresh = i_satd * 5/4;
2086
2087     h->mb.i_type = P_L0;
2088     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2089     {
2090         h->mb.i_partition = D_16x16;
2091         x264_analyse_update_cache( h, a );
2092         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2093     }
2094
2095     if( a->l0.i_cost16x8 <= thresh )
2096     {
2097         h->mb.i_partition = D_16x8;
2098         x264_analyse_update_cache( h, a );
2099         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2100     }
2101     else
2102         a->l0.i_cost16x8 = COST_MAX;
2103
2104     if( a->l0.i_cost8x16 <= thresh )
2105     {
2106         h->mb.i_partition = D_8x16;
2107         x264_analyse_update_cache( h, a );
2108         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2109     }
2110     else
2111         a->l0.i_cost8x16 = COST_MAX;
2112
2113     if( a->l0.i_cost8x8 <= thresh )
2114     {
2115         h->mb.i_type = P_8x8;
2116         h->mb.i_partition = D_8x8;
2117         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2118         {
2119             int i;
2120             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2121             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2122             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2123             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2124             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2125              * for future blocks are those left over from previous RDO calls. */
2126             for( i = 0; i < 4; i++ )
2127             {
2128                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2129                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2130                 int subtype, btype = D_L0_8x8;
2131                 uint64_t bcost = COST_MAX64;
2132                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2133                 {
2134                     uint64_t cost;
2135                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2136                         continue;
2137                     h->mb.i_sub_partition[i] = subtype;
2138                     x264_mb_cache_mv_p8x8( h, a, i );
2139                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2140                     COPY2_IF_LT( bcost, cost, btype, subtype );
2141                 }
2142                 h->mb.i_sub_partition[i] = btype;
2143                 x264_mb_cache_mv_p8x8( h, a, i );
2144             }
2145         }
2146         else
2147             x264_analyse_update_cache( h, a );
2148         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2149     }
2150     else
2151         a->l0.i_cost8x8 = COST_MAX;
2152 }
2153
2154 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2155 {
2156     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2157
2158     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2159     {
2160         h->mb.i_type = B_DIRECT;
2161         /* Assumes direct/skip MC is still in fdec */
2162         /* Requires b-rdo to be done before intra analysis */
2163         h->mb.b_skip_mc = 1;
2164         x264_analyse_update_cache( h, a );
2165         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2166         h->mb.b_skip_mc = 0;
2167     }
2168
2169     //FIXME not all the update_cache calls are needed
2170     h->mb.i_partition = D_16x16;
2171     /* L0 */
2172     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2173     {
2174         h->mb.i_type = B_L0_L0;
2175         x264_analyse_update_cache( h, a );
2176         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2177     }
2178
2179     /* L1 */
2180     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2181     {
2182         h->mb.i_type = B_L1_L1;
2183         x264_analyse_update_cache( h, a );
2184         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2185     }
2186
2187     /* BI */
2188     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2189     {
2190         h->mb.i_type = B_BI_BI;
2191         x264_analyse_update_cache( h, a );
2192         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2193     }
2194
2195     /* 8x8 */
2196     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2197     {
2198         h->mb.i_type = B_8x8;
2199         h->mb.i_partition = D_8x8;
2200         x264_analyse_update_cache( h, a );
2201         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2202         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2203     }
2204
2205     /* 16x8 */
2206     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2207     {
2208         h->mb.i_type = a->i_mb_type16x8;
2209         h->mb.i_partition = D_16x8;
2210         x264_analyse_update_cache( h, a );
2211         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2212     }
2213
2214     /* 8x16 */
2215     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2216     {
2217         h->mb.i_type = a->i_mb_type8x16;
2218         h->mb.i_partition = D_8x16;
2219         x264_analyse_update_cache( h, a );
2220         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2221     }
2222 }
2223
2224 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2225 {
2226     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2227     int i;
2228
2229     if( IS_INTRA(h->mb.i_type) )
2230         return;
2231
2232     switch( h->mb.i_partition )
2233     {
2234         case D_16x16:
2235             if( h->mb.i_type == B_BI_BI )
2236                 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2237             break;
2238         case D_16x8:
2239             for( i=0; i<2; i++ )
2240                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2241                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2242             break;
2243         case D_8x16:
2244             for( i=0; i<2; i++ )
2245                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2246                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2247             break;
2248         case D_8x8:
2249             for( i=0; i<4; i++ )
2250                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2251                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2252             break;
2253     }
2254 }
2255
2256 static inline void x264_mb_analyse_transform( x264_t *h )
2257 {
2258     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2259     {
2260         int i_cost4, i_cost8;
2261         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2262         x264_mb_mc( h );
2263
2264         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2265                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2266         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2267                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2268
2269         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2270         h->mb.b_skip_mc = 1;
2271     }
2272 }
2273
2274 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2275 {
2276     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2277     {
2278         int i_rd8;
2279         x264_analyse_update_cache( h, a );
2280         h->mb.b_transform_8x8 ^= 1;
2281         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2282         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2283
2284         if( *i_rd >= i_rd8 )
2285         {
2286             if( *i_rd > 0 )
2287                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2288             *i_rd = i_rd8;
2289         }
2290         else
2291             h->mb.b_transform_8x8 ^= 1;
2292     }
2293 }
2294
2295 /* Rate-distortion optimal QP selection.
2296  * FIXME: More than half of the benefit of this function seems to be
2297  * in the way it improves the coding of chroma DC (by decimating or
2298  * finding a better way to code a single DC coefficient.)
2299  * There must be a more efficient way to get that portion of the benefit
2300  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2301  * trick. */
2302 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2303 {
2304     int bcost, cost, direction, failures, prevcost, origcost;
2305     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2306     int last_qp_tried = 0;
2307     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2308
2309     /* If CBP is already zero, don't raise the quantizer any higher. */
2310     for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2311     {
2312         /* Without psy-RD, require monotonicity when moving quant away from previous
2313          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2314          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2315          * allow 2 failures when moving quant towards previous quant.
2316          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2317         int threshold = (!!h->mb.i_psy_rd);
2318         /* Raise the threshold for failures if we're moving towards the last QP. */
2319         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2320             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2321             threshold++;
2322         h->mb.i_qp = orig_qp;
2323         failures = 0;
2324         prevcost = origcost;
2325         h->mb.i_qp += direction;
2326         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2327         {
2328             if( h->mb.i_last_qp == h->mb.i_qp )
2329                 last_qp_tried = 1;
2330             h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2331             cost = x264_rd_cost_mb( h, a->i_lambda2 );
2332             COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2333
2334             /* We can't assume that the costs are monotonic over QPs.
2335              * Tie case-as-failure seems to give better results. */
2336             if( cost < prevcost )
2337                 failures = 0;
2338             else
2339                 failures++;
2340             prevcost = cost;
2341
2342             if( failures > threshold )
2343                 break;
2344             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2345                 break;
2346             h->mb.i_qp += direction;
2347         }
2348     }
2349
2350     /* Always try the last block's QP. */
2351     if( !last_qp_tried )
2352     {
2353         h->mb.i_qp = h->mb.i_last_qp;
2354         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2355         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2356         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2357     }
2358
2359     h->mb.i_qp = bqp;
2360     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2361
2362     /* Check transform again; decision from before may no longer be optimal. */
2363     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2364         x264_mb_transform_8x8_allowed( h ) )
2365     {
2366         h->mb.b_transform_8x8 ^= 1;
2367         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2368         if( cost > bcost )
2369             h->mb.b_transform_8x8 ^= 1;
2370     }
2371 }
2372
2373 /*****************************************************************************
2374  * x264_macroblock_analyse:
2375  *****************************************************************************/
2376 void x264_macroblock_analyse( x264_t *h )
2377 {
2378     x264_mb_analysis_t analysis;
2379     int i_cost = COST_MAX;
2380     int i;
2381
2382     h->mb.i_qp = x264_ratecontrol_qp( h );
2383     if( h->param.rc.i_aq_mode )
2384     {
2385         x264_adaptive_quant( h );
2386         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2387          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2388         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2389             h->mb.i_qp = h->mb.i_last_qp;
2390     }
2391
2392     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2393
2394     /*--------------------------- Do the analysis ---------------------------*/
2395     if( h->sh.i_type == SLICE_TYPE_I )
2396     {
2397         if( analysis.i_mbrd )
2398             x264_mb_cache_fenc_satd( h );
2399         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2400         if( analysis.i_mbrd )
2401             x264_intra_rd( h, &analysis, COST_MAX );
2402
2403         i_cost = analysis.i_satd_i16x16;
2404         h->mb.i_type = I_16x16;
2405         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2406         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2407         if( analysis.i_satd_pcm < i_cost )
2408             h->mb.i_type = I_PCM;
2409
2410         else if( analysis.i_mbrd >= 2 )
2411             x264_intra_rd_refine( h, &analysis );
2412     }
2413     else if( h->sh.i_type == SLICE_TYPE_P )
2414     {
2415         int b_skip = 0;
2416
2417         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2418
2419         /* Fast P_SKIP detection */
2420         analysis.b_try_pskip = 0;
2421         if( h->param.analyse.b_fast_pskip )
2422         {
2423             if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2424                 // FIXME don't need to check this if the reference frame is done
2425                 {}
2426             else if( h->param.analyse.i_subpel_refine >= 3 )
2427                 analysis.b_try_pskip = 1;
2428             else if( h->mb.i_mb_type_left == P_SKIP ||
2429                      h->mb.i_mb_type_top == P_SKIP ||
2430                      h->mb.i_mb_type_topleft == P_SKIP ||
2431                      h->mb.i_mb_type_topright == P_SKIP )
2432                 b_skip = x264_macroblock_probe_pskip( h );
2433         }
2434
2435         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2436
2437         if( b_skip )
2438         {
2439             h->mb.i_type = P_SKIP;
2440             h->mb.i_partition = D_16x16;
2441             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
2442         }
2443         else
2444         {
2445             const unsigned int flags = h->param.analyse.inter;
2446             int i_type;
2447             int i_partition;
2448             int i_thresh16x8;
2449             int i_satd_inter, i_satd_intra;
2450
2451             x264_mb_analyse_load_costs( h, &analysis );
2452
2453             x264_mb_analyse_inter_p16x16( h, &analysis );
2454
2455             if( h->mb.i_type == P_SKIP )
2456                 return;
2457
2458             if( flags & X264_ANALYSE_PSUB16x16 )
2459             {
2460                 if( h->param.analyse.b_mixed_references )
2461                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2462                 else
2463                     x264_mb_analyse_inter_p8x8( h, &analysis );
2464             }
2465
2466             /* Select best inter mode */
2467             i_type = P_L0;
2468             i_partition = D_16x16;
2469             i_cost = analysis.l0.me16x16.cost;
2470
2471             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2472                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2473             {
2474                 i_type = P_8x8;
2475                 i_partition = D_8x8;
2476                 i_cost = analysis.l0.i_cost8x8;
2477
2478                 /* Do sub 8x8 */
2479                 if( flags & X264_ANALYSE_PSUB8x8 )
2480                 {
2481                     for( i = 0; i < 4; i++ )
2482                     {
2483                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2484                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2485                         {
2486                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2487                             h->mb.i_sub_partition[i] = D_L0_4x4;
2488
2489                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2490                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2491                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2492
2493                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2494                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2495                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2496
2497                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2498                         }
2499                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2500                     }
2501                     analysis.l0.i_cost8x8 = i_cost;
2502                 }
2503             }
2504
2505             /* Now do 16x8/8x16 */
2506             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2507             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2508                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2509             {
2510                 x264_mb_analyse_inter_p16x8( h, &analysis );
2511                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2512
2513                 x264_mb_analyse_inter_p8x16( h, &analysis );
2514                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2515             }
2516
2517             h->mb.i_partition = i_partition;
2518
2519             /* refine qpel */
2520             //FIXME mb_type costs?
2521             if( analysis.i_mbrd )
2522             {
2523                 /* refine later */
2524             }
2525             else if( i_partition == D_16x16 )
2526             {
2527                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2528                 i_cost = analysis.l0.me16x16.cost;
2529             }
2530             else if( i_partition == D_16x8 )
2531             {
2532                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2533                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2534                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2535             }
2536             else if( i_partition == D_8x16 )
2537             {
2538                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2539                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2540                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2541             }
2542             else if( i_partition == D_8x8 )
2543             {
2544                 int i8x8;
2545                 i_cost = 0;
2546                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2547                 {
2548                     switch( h->mb.i_sub_partition[i8x8] )
2549                     {
2550                         case D_L0_8x8:
2551                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2552                             i_cost += analysis.l0.me8x8[i8x8].cost;
2553                             break;
2554                         case D_L0_8x4:
2555                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2556                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2557                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2558                                       analysis.l0.me8x4[i8x8][1].cost;
2559                             break;
2560                         case D_L0_4x8:
2561                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2562                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2563                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2564                                       analysis.l0.me4x8[i8x8][1].cost;
2565                             break;
2566
2567                         case D_L0_4x4:
2568                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2569                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2570                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2571                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2572                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2573                                       analysis.l0.me4x4[i8x8][1].cost +
2574                                       analysis.l0.me4x4[i8x8][2].cost +
2575                                       analysis.l0.me4x4[i8x8][3].cost;
2576                             break;
2577                         default:
2578                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2579                             break;
2580                     }
2581                 }
2582             }
2583
2584             if( h->mb.b_chroma_me )
2585             {
2586                 x264_mb_analyse_intra_chroma( h, &analysis );
2587                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2588                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2589                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2590                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2591             }
2592             else
2593                 x264_mb_analyse_intra( h, &analysis, i_cost );
2594
2595             i_satd_inter = i_cost;
2596             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2597                                       analysis.i_satd_i8x8,
2598                                       analysis.i_satd_i4x4 );
2599
2600             if( analysis.i_mbrd )
2601             {
2602                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2603                 i_type = P_L0;
2604                 i_partition = D_16x16;
2605                 i_cost = analysis.l0.i_rd16x16;
2606                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2607                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2608                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2609                 h->mb.i_type = i_type;
2610                 h->mb.i_partition = i_partition;
2611                 if( i_cost < COST_MAX )
2612                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2613                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2614             }
2615
2616             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2617             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2618             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2619             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2620
2621             h->mb.i_type = i_type;
2622
2623             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2624             {
2625                 if( IS_INTRA( h->mb.i_type ) )
2626                 {
2627                     x264_intra_rd_refine( h, &analysis );
2628                 }
2629                 else if( i_partition == D_16x16 )
2630                 {
2631                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2632                     analysis.l0.me16x16.cost = i_cost;
2633                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2634                 }
2635                 else if( i_partition == D_16x8 )
2636                 {
2637                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2638                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2639                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2640                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2641                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2642                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2643                 }
2644                 else if( i_partition == D_8x16 )
2645                 {
2646                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2647                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2648                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2649                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2650                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2651                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2652                 }
2653                 else if( i_partition == D_8x8 )
2654                 {
2655                     int i8x8;
2656                     x264_analyse_update_cache( h, &analysis );
2657                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2658                     {
2659                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2660                         {
2661                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2662                         }
2663                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2664                         {
2665                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2666                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2667                         }
2668                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2669                         {
2670                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2671                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2672                         }
2673                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2674                         {
2675                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2676                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2677                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2678                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2679                         }
2680                     }
2681                 }
2682             }
2683         }
2684     }
2685     else if( h->sh.i_type == SLICE_TYPE_B )
2686     {
2687         int i_bskip_cost = COST_MAX;
2688         int b_skip = 0;
2689
2690         if( analysis.i_mbrd )
2691             x264_mb_cache_fenc_satd( h );
2692
2693         h->mb.i_type = B_SKIP;
2694         if( h->mb.b_direct_auto_write )
2695         {
2696             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2697             for( i = 0; i < 2; i++ )
2698             {
2699                 int b_changed = 1;
2700                 h->sh.b_direct_spatial_mv_pred ^= 1;
2701                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2702                 if( analysis.b_direct_available )
2703                 {
2704                     if( b_changed )
2705                     {
2706                         x264_mb_mc( h );
2707                         b_skip = x264_macroblock_probe_bskip( h );
2708                     }
2709                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2710                 }
2711                 else
2712                     b_skip = 0;
2713             }
2714         }
2715         else
2716             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2717
2718         if( analysis.b_direct_available )
2719         {
2720             if( !h->mb.b_direct_auto_write )
2721                 x264_mb_mc( h );
2722             if( analysis.i_mbrd )
2723             {
2724                 i_bskip_cost = ssd_mb( h );
2725                 /* 6 = minimum cavlc cost of a non-skipped MB */
2726                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2727             }
2728             else if( !h->mb.b_direct_auto_write )
2729             {
2730                 /* Conditioning the probe on neighboring block types
2731                  * doesn't seem to help speed or quality. */
2732                 b_skip = x264_macroblock_probe_bskip( h );
2733             }
2734         }
2735
2736         if( !b_skip )
2737         {
2738             const unsigned int flags = h->param.analyse.inter;
2739             int i_type;
2740             int i_partition;
2741             int i_satd_inter;
2742             h->mb.b_skip_mc = 0;
2743
2744             x264_mb_analyse_load_costs( h, &analysis );
2745
2746             /* select best inter mode */
2747             /* direct must be first */
2748             if( analysis.b_direct_available )
2749                 x264_mb_analyse_inter_direct( h, &analysis );
2750
2751             x264_mb_analyse_inter_b16x16( h, &analysis );
2752
2753             i_type = B_L0_L0;
2754             i_partition = D_16x16;
2755             i_cost = analysis.l0.me16x16.cost;
2756             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2757             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2758             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2759
2760             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2761             {
2762                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2763                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2764                     i_bskip_cost < analysis.i_rd16x16bi &&
2765                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2766                     i_bskip_cost < analysis.l1.i_rd16x16 )
2767                 {
2768                     h->mb.i_type = B_SKIP;
2769                     x264_analyse_update_cache( h, &analysis );
2770                     return;
2771                 }
2772             }
2773
2774             if( flags & X264_ANALYSE_BSUB16x16 )
2775             {
2776                 x264_mb_analyse_inter_b8x8( h, &analysis );
2777                 if( analysis.i_cost8x8bi < i_cost )
2778                 {
2779                     i_type = B_8x8;
2780                     i_partition = D_8x8;
2781                     i_cost = analysis.i_cost8x8bi;
2782
2783                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2784                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2785                     {
2786                         x264_mb_analyse_inter_b16x8( h, &analysis );
2787                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2788                                      i_type, analysis.i_mb_type16x8,
2789                                      i_partition, D_16x8 );
2790                     }
2791                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2792                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2793                     {
2794                         x264_mb_analyse_inter_b8x16( h, &analysis );
2795                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2796                                      i_type, analysis.i_mb_type8x16,
2797                                      i_partition, D_8x16 );
2798                     }
2799                 }
2800             }
2801
2802             if( analysis.i_mbrd )
2803             {
2804                 /* refine later */
2805             }
2806             /* refine qpel */
2807             else if( i_partition == D_16x16 )
2808             {
2809                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2810                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2811                 if( i_type == B_L0_L0 )
2812                 {
2813                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2814                     i_cost = analysis.l0.me16x16.cost
2815                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2816                 }
2817                 else if( i_type == B_L1_L1 )
2818                 {
2819                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2820                     i_cost = analysis.l1.me16x16.cost
2821                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2822                 }
2823                 else if( i_type == B_BI_BI )
2824                 {
2825                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2826                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2827                 }
2828             }
2829             else if( i_partition == D_16x8 )
2830             {
2831                 for( i=0; i<2; i++ )
2832                 {
2833                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2834                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2835                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2836                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2837                 }
2838             }
2839             else if( i_partition == D_8x16 )
2840             {
2841                 for( i=0; i<2; i++ )
2842                 {
2843                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2844                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2845                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2846                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2847                 }
2848             }
2849             else if( i_partition == D_8x8 )
2850             {
2851                 for( i=0; i<4; i++ )
2852                 {
2853                     x264_me_t *m;
2854                     int i_part_cost_old;
2855                     int i_type_cost;
2856                     int i_part_type = h->mb.i_sub_partition[i];
2857                     int b_bidir = (i_part_type == D_BI_8x8);
2858
2859                     if( i_part_type == D_DIRECT_8x8 )
2860                         continue;
2861                     if( x264_mb_partition_listX_table[0][i_part_type] )
2862                     {
2863                         m = &analysis.l0.me8x8[i];
2864                         i_part_cost_old = m->cost;
2865                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2866                         m->cost -= i_type_cost;
2867                         x264_me_refine_qpel( h, m );
2868                         if( !b_bidir )
2869                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2870                     }
2871                     if( x264_mb_partition_listX_table[1][i_part_type] )
2872                     {
2873                         m = &analysis.l1.me8x8[i];
2874                         i_part_cost_old = m->cost;
2875                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2876                         m->cost -= i_type_cost;
2877                         x264_me_refine_qpel( h, m );
2878                         if( !b_bidir )
2879                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2880                     }
2881                     /* TODO: update mvp? */
2882                 }
2883             }
2884
2885             i_satd_inter = i_cost;
2886
2887             if( analysis.i_mbrd )
2888             {
2889                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2890                 i_type = B_SKIP;
2891                 i_cost = i_bskip_cost;
2892                 i_partition = D_16x16;
2893                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2894                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2895                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2896                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2897                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2898                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2899                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2900
2901                 h->mb.i_type = i_type;
2902                 h->mb.i_partition = i_partition;
2903             }
2904
2905             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2906
2907             if( analysis.i_mbrd )
2908             {
2909                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2910                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2911             }
2912
2913             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2914             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2915             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2916             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2917
2918             h->mb.i_type = i_type;
2919             h->mb.i_partition = i_partition;
2920
2921             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2922                 x264_intra_rd_refine( h, &analysis );
2923             if( h->mb.i_subpel_refine >= 5 )
2924                 x264_refine_bidir( h, &analysis );
2925
2926             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2927             {
2928                 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2929                 x264_analyse_update_cache( h, &analysis );
2930
2931                 if( i_partition == D_16x16 )
2932                 {
2933                     if( i_type == B_L0_L0 )
2934                     {
2935                         analysis.l0.me16x16.cost = i_cost;
2936                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2937                     }
2938                     else if( i_type == B_L1_L1 )
2939                     {
2940                         analysis.l1.me16x16.cost = i_cost;
2941                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2942                     }
2943                     else if( i_type == B_BI_BI )
2944                         x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2945                 }
2946                 else if( i_partition == D_16x8 )
2947                 {
2948                     for( i = 0; i < 2; i++ )
2949                     {
2950                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2951                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2952                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2953                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2954                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2955                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2956                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2957                     }
2958                 }
2959                 else if( i_partition == D_8x16 )
2960                 {
2961                     for( i = 0; i < 2; i++ )
2962                     {
2963                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2964                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2965                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2966                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2967                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2968                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2969                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2970                     }
2971                 }
2972                 else if( i_partition == D_8x8 )
2973                 {
2974                     for( i = 0; i < 4; i++ )
2975                     {
2976                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2977                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2978                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2979                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2980                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2981                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
2982                     }
2983                 }
2984             }
2985         }
2986     }
2987
2988     x264_analyse_update_cache( h, &analysis );
2989
2990     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
2991      * without realizing it.  Check for this and account for it if necessary. */
2992     if( analysis.i_mbrd >= 2 )
2993     {
2994         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
2995         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
2996         int list = check_mv_lists[h->mb.i_type] - 1;
2997         if( list >= 0 && h->mb.i_partition != D_16x16 &&
2998             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
2999             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3000                 h->mb.i_partition = D_16x16;
3001     }
3002
3003     if( !analysis.i_mbrd )
3004         x264_mb_analyse_transform( h );
3005
3006     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3007         x264_mb_analyse_qp_rd( h, &analysis );
3008
3009     h->mb.b_trellis = h->param.analyse.i_trellis;
3010     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3011     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3012         x264_psy_trellis_init( h, 0 );
3013     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3014         h->mb.i_skip_intra = 0;
3015 }
3016
3017 /*-------------------- Update MB from the analysis ----------------------*/
3018 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3019 {
3020     int i;
3021
3022     switch( h->mb.i_type )
3023     {
3024         case I_4x4:
3025             for( i = 0; i < 16; i++ )
3026                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3027
3028             x264_mb_analyse_intra_chroma( h, a );
3029             break;
3030         case I_8x8:
3031             for( i = 0; i < 4; i++ )
3032                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3033
3034             x264_mb_analyse_intra_chroma( h, a );
3035             break;
3036         case I_16x16:
3037             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3038             x264_mb_analyse_intra_chroma( h, a );
3039             break;
3040
3041         case I_PCM:
3042             break;
3043
3044         case P_L0:
3045             switch( h->mb.i_partition )
3046             {
3047                 case D_16x16:
3048                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3049                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3050                     break;
3051
3052                 case D_16x8:
3053                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3054                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3055                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3056                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3057                     break;
3058
3059                 case D_8x16:
3060                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3061                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3062                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3063                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3064                     break;
3065
3066                 default:
3067                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3068                     break;
3069             }
3070             break;
3071
3072         case P_8x8:
3073             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3074             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3075             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3076             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3077             for( i = 0; i < 4; i++ )
3078                 x264_mb_cache_mv_p8x8( h, a, i );
3079             break;
3080
3081         case P_SKIP:
3082         {
3083             h->mb.i_partition = D_16x16;
3084             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3085             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3086             break;
3087         }
3088
3089         case B_SKIP:
3090         case B_DIRECT:
3091             x264_mb_load_mv_direct8x8( h, 0 );
3092             x264_mb_load_mv_direct8x8( h, 1 );
3093             x264_mb_load_mv_direct8x8( h, 2 );
3094             x264_mb_load_mv_direct8x8( h, 3 );
3095             break;
3096
3097         case B_8x8:
3098             /* optimize: cache might not need to be rewritten */
3099             for( i = 0; i < 4; i++ )
3100                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3101             break;
3102
3103         default: /* the rest of the B types */
3104             switch( h->mb.i_partition )
3105             {
3106             case D_16x16:
3107                 switch( h->mb.i_type )
3108                 {
3109                 case B_L0_L0:
3110                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3111                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3112
3113                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3114                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3115                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3116                     break;
3117                 case B_L1_L1:
3118                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3119                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3120                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3121
3122                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3123                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3124                     break;
3125                 case B_BI_BI:
3126                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3127                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3128
3129                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3130                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3131                     break;
3132                 }
3133                 break;
3134             case D_16x8:
3135                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3136                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3137                 break;
3138             case D_8x16:
3139                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3140                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3141                 break;
3142             default:
3143                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3144                 break;
3145             }
3146     }
3147
3148 #ifndef NDEBUG
3149     if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3150     {
3151         int l;
3152         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3153         {
3154             int completed;
3155             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3156             if( ref < 0 )
3157                 continue;
3158             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
3159             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3160             {
3161                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3162                 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3163                 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3164                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3165                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3166                 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3167                 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3168                 fprintf(stderr, "completed: %d \n", completed );
3169                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3170                 x264_mb_analyse_intra( h, a, COST_MAX );
3171                 h->mb.i_type = I_16x16;
3172                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3173                 x264_mb_analyse_intra_chroma( h, a );
3174             }
3175         }
3176     }
3177 #endif
3178 }
3179
3180 #include "slicetype.c"
3181