git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <unistd.h>
  28
  29 #include "common/common.h"
  30 #include "common/cpu.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
  37 typedef struct
  38 {
         /* Inter search results for one reference list.  x264_mb_analysis_t embeds
          * two of these (l0 and l1).  Each partition shape gets a cost field plus
          * the x264_me_t motion-estimation state for each of its partitions. */

  39     /* 16x16 */
  40     int i_ref;
  41     int       i_rd16x16;
  42     x264_me_t me16x16;
  43
  44     /* 8x8 */
  45     int       i_cost8x8;
  46     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
  47     ALIGNED_4( int16_t mvc[32][5][2] );
  48     x264_me_t me8x8[4];
  49
  50     /* Sub 4x4 */
  51     int       i_cost4x4[4]; /* cost per 8x8 partition */
  52     x264_me_t me4x4[4][4]; /* [8x8 partition][4x4 block] */
  53
  54     /* Sub 8x4 */
  55     int       i_cost8x4[4]; /* cost per 8x8 partition */
  56     x264_me_t me8x4[4][2]; /* [8x8 partition][8x4 block] */
  57
  58     /* Sub 4x8 */
  59     int       i_cost4x8[4]; /* cost per 8x8 partition */
  60     x264_me_t me4x8[4][2]; /* [8x8 partition][4x8 block] */
  61
  62     /* 16x8 */
  63     int       i_cost16x8;
  64     x264_me_t me16x8[2];
  65
  66     /* 8x16 */
  67     int       i_cost8x16;
  68     x264_me_t me8x16[2];
  69
  70 } x264_mb_analysis_list_t;
  71
  72 typedef struct
  73 {
         /* Per-macroblock analysis context: the lambdas/QP the decision is made
          * with, the intra candidate costs, and the inter candidate costs for
          * both reference lists. */

  74     /* conduct the analysis using this lambda and QP */
  75     int i_lambda;
  76     int i_lambda2;
  77     int i_qp;
  78     uint16_t *p_cost_mv;
  79     uint16_t *p_cost_ref[2]; /* [0] = list 0, [1] = list 1 (see x264_mb_analyse_load_costs) */
  80     int i_mbrd; /* 0 = off, 1 = RD mode decision, 2 = RD refinement, 3 = QPRD */
  81
  82
  83     /* I: Intra part */
  84     /* Take some shortcuts in intra search if intra is deemed unlikely */
  85     int b_fast_intra;
  86     int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
  87     int b_try_pskip;
  88
  89     /* Luma part */
  90     int i_satd_i16x16;
  91     int i_satd_i16x16_dir[7];
  92     int i_predict16x16;
  93
  94     int i_satd_i8x8;
  95     int i_cbp_i8x8_luma;
  96     int i_satd_i8x8_dir[12][4]; /* [prediction mode][8x8 block index] */
  97     int i_predict8x8[4];
  98
  99     int i_satd_i4x4;
  100     int i_predict4x4[16];
  101
  102     int i_satd_pcm;
  103
  104     /* Chroma part */
  105     int i_satd_i8x8chroma;
  106     int i_satd_i8x8chroma_dir[7];
  107     int i_predict8x8chroma;
  108
  109     /* II: Inter part P/B frame */
  110     x264_mb_analysis_list_t l0;
  111     x264_mb_analysis_list_t l1;
  112
  113     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
  114     int i_cost16x16direct;
  115     int i_cost8x8bi;
  116     int i_cost8x8direct[4];
  117     int i_cost16x8bi;
  118     int i_cost8x16bi;
  119     int i_rd16x16bi;
  120     int i_rd16x16direct;
  121     int i_rd16x8bi;
  122     int i_rd8x16bi;
  123     int i_rd8x8bi;
  124
  125     int i_mb_partition16x8[2]; /* mb_partition_e */
  126     int i_mb_partition8x16[2];
  127     int i_mb_type16x8; /* mb_class_e */
  128     int i_mb_type8x16;
  129
  130     int b_direct_available;
  131
  132 } x264_mb_analysis_t;
 133
 134 /* lambda = pow(2,qp/6-2) */
 135 const int x264_lambda_tab[52] = {
 136    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 137    1, 1, 1, 1,              /*  8-11 */
 138    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 139    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 140    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 141   16,18,20,23,25,29,32,36,  /* 36-43 */
 142   40,45,51,57,64,72,81,91   /* 44-51 */
 143 };
 144
 145 /* lambda2 = pow(lambda,2) * .9 * 256 */
 146 const int x264_lambda2_tab[52] = {
 147     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
 148     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
 149    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
 150   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 151  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
 152 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
 153 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 154 };
 155
 156 const uint8_t x264_exp2_lut[64] = {
 157       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
 158      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
 159     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
 160     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 161 };
 162
 163 const float x264_log2_lut[128] = {
 164     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
 165     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
 166     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
 167     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
 168     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
 169     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
 170     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
 171     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
 172     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
 173     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
 174     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
 175     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
 176     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
 177     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
 178     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
 179     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 180 };
 181
 182 /* Avoid an int/float conversion. */
 183 const float x264_log2_lz_lut[32] = {
 184     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 185 };
 186
 187 // should the intra and inter lambdas be different?
 188 // I'm just matching the behaviour of deadzone quant.
 189 static const int x264_trellis_lambda2_tab[2][52] = {
 190     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 191     {    46,      58,      73,      92,     117,     147,
 192         185,     233,     294,     370,     466,     587,
 193         740,     932,    1174,    1480,    1864,    2349,
 194        2959,    3728,    4697,    5918,    7457,    9395,
 195       11837,   14914,   18790,   23674,   29828,   37581,
 196       47349,   59656,   75163,   94699,  119313,  150326,
 197      189399,  238627,  300652,  378798,  477255,  601304,
 198      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 199     3030384, 3818045, 4810435, 6060769 },
 200     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 201     {    27,      34,      43,      54,      68,      86,
 202         108,     136,     172,     216,     273,     343,
 203         433,     545,     687,     865,    1090,    1374,
 204        1731,    2180,    2747,    3461,    4361,    5494,
 205        6922,    8721,   10988,   13844,   17442,   21976,
 206       27688,   34885,   43953,   55377,   69771,   87906,
 207      110755,  139543,  175813,  221511,  279087,  351627,
 208      443023,  558174,  703255,  886046, 1116348, 1406511,
 209     1772093, 2232697, 2813022, 3544186 }
 210 };
 211
 212 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 213        16,    20,    25,    32,    40,    50,
 214        64,    80,   101,   128,   161,   203,
 215       256,   322,   406,   512,   645,   812,
 216      1024,  1290,  1625,  2048,  2580,  3250,
 217      4096,  5160,  6501,  8192, 10321, 13003,
 218     16384, 20642, 26007, 32768, 41285, 52015,
 219     65535
 220 };
 221
 222 /* TODO: calculate CABAC costs */
 223 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
 224     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 225 };
 226 static const int i_mb_b16x8_cost_table[17] = {
 227     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 228 };
 229 static const int i_sub_mb_b_cost_table[13] = {
 230     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 231 };
 232 static const int i_sub_mb_p_cost_table[4] = {
 233     5, 3, 3, 1
 234 };
 235
  236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); /* forward declaration */
  237
     /* Reference-index cost table, filled by x264_analyse_init_costs() and read by
      * x264_mb_analyse_load_costs().  Indexed [lambda][i][j], where i is
      * clip3(num_ref_idx_active-1, 0, 2) and (i, j) are the arguments handed to
      * bs_size_te(); row i == 0 is all zero.
      * NOTE(review): j is presumably the reference index -- confirm. */
  238 static uint16_t x264_cost_ref[92][3][33];
     /* Held while x264_analyse_init_costs() writes x264_cost_ref. */
  239 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 240
 241 int x264_analyse_init_costs( x264_t *h, int qp )
 242 {
 243     int i, j;
 244     int lambda = x264_lambda_tab[qp];
 245     if( h->cost_mv[lambda] )
 246         return 0;
 247     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 248     CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
 249     h->cost_mv[lambda] += 2*4*2048;
 250     for( i = 0; i <= 2*4*2048; i++ )
 251     {
 252         h->cost_mv[lambda][-i] =
 253         h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 254     }
 255     x264_pthread_mutex_lock( &cost_ref_mutex );
 256     for( i = 0; i < 3; i++ )
 257         for( j = 0; j < 33; j++ )
 258             x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
 259     x264_pthread_mutex_unlock( &cost_ref_mutex );
 260     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
 261     {
 262         for( j=0; j<4; j++ )
 263         {
 264             CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
 265             h->cost_mv_fpel[lambda][j] += 2*2048;
 266             for( i = -2*2048; i < 2*2048; i++ )
 267                 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
 268         }
 269     }
 270     return 0;
 271 fail:
 272     return -1;
 273 }
 274
 275 void x264_analyse_free_costs( x264_t *h )
 276 {
 277     int i, j;
 278     for( i = 0; i < 92; i++ )
 279     {
 280         if( h->cost_mv[i] )
 281             x264_free( h->cost_mv[i] - 2*4*2048 );
 282         if( h->cost_mv_fpel[i][0] )
 283             for( j = 0; j < 4; j++ )
 284                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 285     }
 286 }
 287
 288 void x264_analyse_weight_frame( x264_t *h, int end )
 289 {
 290     int j;
 291     for( j=0; j<h->i_ref0; j++ )
 292     {
 293         if( h->sh.weight[j][0].weightfn )
 294         {
 295             x264_frame_t *frame = h->fref0[j];
 296             int width = frame->i_width[0] + 2*PADH;
 297             int i_padv = PADV << h->param.b_interlaced;
 298             int offset, height;
 299             uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
 300             int k;
 301             height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 302             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 303             h->fenc->i_lines_weighted += height;
 304             if( height )
 305             {
 306                 for( k = j; k < h->i_ref0; k++ )
 307                     if( h->sh.weight[k][0].weightfn )
 308                     {
 309                         uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 310                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 311                                                  src + offset, frame->i_stride[0],
 312                                                  width, height, &h->sh.weight[k][0] );
 313                     }
 314             }
 315             break;
 316         }
 317     }
 318 }
 319
 320 /* initialize an array of lambda*nbits for all possible mvs */
 321 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 322 {
 323     a->p_cost_mv = h->cost_mv[a->i_lambda];
 324     a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 325     a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 326 }
 327
 328 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 329 {
 330     /* conduct the analysis using this lamda and QP */
 331     a->i_qp = h->mb.i_qp = i_qp;
 332     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 333
 334     a->i_lambda = x264_lambda_tab[i_qp];
 335     a->i_lambda2 = x264_lambda2_tab[i_qp];
 336
 337     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 338     if( h->param.analyse.i_trellis )
 339     {
 340         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 341         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 342         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 343         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 344     }
 345     h->mb.i_psy_rd_lambda = a->i_lambda;
 346     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 347     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 348
 349 }
 350
  351 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  352 {
         /* Reset the analysis context `a` and the related h->mb fields for the
          * current macroblock at QP i_qp.  Intra costs are always reset; for
          * non-I slices this also computes the motion vector search limits,
          * resets the inter costs to COST_MAX and decides b_fast_intra and
          * b_force_intra. */

         /* effective subpel level: one lower in B slices */
  353     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
  354
  355     /* mbrd == 1 -> RD mode decision */
  356     /* mbrd == 2 -> RD refinement */
  357     /* mbrd == 3 -> QPRD */
  358     a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
  359
         /* must run after i_mbrd is set: init_qp reads it to derive b_trellis */
  360     x264_mb_analyse_init_qp( h, a, i_qp );
  361
  362     h->mb.i_me_method = h->param.analyse.i_me_method;
  363     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
  364     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
  365                         && h->mb.i_subpel_refine >= 5;
  366
  367     h->mb.b_transform_8x8 = 0;
  368     h->mb.b_noise_reduction = 0;
  369
  370     /* I: Intra part */
  371     a->i_satd_i16x16 =
  372     a->i_satd_i8x8   =
  373     a->i_satd_i4x4   =
  374     a->i_satd_i8x8chroma = COST_MAX;
  375
  376     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
  377     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
  378
  379     a->b_fast_intra = 0;
         /* 0 when lossless, 2 with RD mode decision, otherwise 1 only if neither
          * trellis nor noise reduction is enabled */
  380     h->mb.i_skip_intra =
  381         h->mb.b_lossless ? 0 :
  382         a->i_mbrd ? 2 :
  383         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
  384
  385     /* II: Inter part P/B frame */
  386     if( h->sh.i_type != SLICE_TYPE_I )
  387     {
             /* note: this `i` shadows the function-scope `i` declared above */
  388         int i, j;
  389         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
  390         // limit motion search to a slightly smaller range than the theoretical limit,
  391         // since the search may go a few iterations past its given range
  392         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
  393
  394         /* Calculate max allowed MV range */
  395 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
             /* horizontal limits in quarter-pel units: may point up to 24 pixels
              * outside the picture on either side */
  396         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
  397         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
  398         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
  399         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
  400         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
  401         {
  402             int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
  403             int max_mv = max_x - 4*16*h->mb.i_mb_x;
  404             /* If we're left of the refresh bar, don't reference right of it. */
  405             if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
  406                 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
  407         }
  408         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
  409         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
             /* vertical limits are only recomputed for the first macroblock of a row */
  410         if( h->mb.i_mb_x == 0 )
  411         {
  412             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
  413             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
  414             int thread_mvy_range = i_fmv_range;
  415
  416             if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
  417             {
  418                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
  419                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                     /* wait on every reference (list 1 first in B slices, then list 0)
                      * and cap the vertical range by the lines each has completed */
  420                 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
  421                 {
  422                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
  423                     int i_ref = i ? h->i_ref1 : h->i_ref0;
  424                     for( j=0; j<i_ref; j++ )
  425                     {
  426                         x264_frame_cond_wait( fref[j]->orig, thresh );
  427                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
  428                     }
  429                 }
  430
                     /* deterministic mode ignores the measured progress and always
                      * uses the configured thread range */
  431                 if( h->param.b_deterministic )
  432                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
  433                 if( h->mb.b_interlaced )
  434                     thread_mvy_range >>= 1;
  435
  436                 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
  437             }
  438
  439             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
  440             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
                 /* NOTE(review): the lower bound is clipped to [-range, range] rather
                  * than through CLIP_FMV ([-range, range-1]) like the other limits */
  441             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
  442             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
  443             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
  444             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
  445             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
  446         }
  447 #undef CLIP_FMV
  448
             /* mark every list-0 inter candidate as not yet evaluated */
  449         a->l0.me16x16.cost =
  450         a->l0.i_rd16x16    =
  451         a->l0.i_cost8x8    = COST_MAX;
  452
  453         for( i = 0; i < 4; i++ )
  454         {
  455             a->l0.i_cost4x4[i] =
  456             a->l0.i_cost8x4[i] =
  457             a->l0.i_cost4x8[i] = COST_MAX;
  458         }
  459
  460         a->l0.i_cost16x8   =
  461         a->l0.i_cost8x16   = COST_MAX;
             /* list-1, bidirectional and direct candidates only exist in B slices */
  462         if( h->sh.i_type == SLICE_TYPE_B )
  463         {
  464             a->l1.me16x16.cost =
  465             a->l1.i_rd16x16    =
  466             a->l1.i_cost8x8    = COST_MAX;
  467
  468             for( i = 0; i < 4; i++ )
  469             {
  470                 a->l1.i_cost4x4[i] =
  471                 a->l1.i_cost8x4[i] =
  472                 a->l1.i_cost4x8[i] =
  473                 a->i_cost8x8direct[i] = COST_MAX;
  474             }
  475
  476             a->l1.i_cost16x8   =
  477             a->l1.i_cost8x16   =
  478             a->i_rd16x16bi     =
  479             a->i_rd16x16direct =
  480             a->i_rd8x8bi       =
  481             a->i_rd16x8bi      =
  482             a->i_rd8x16bi      =
  483             a->i_cost16x16bi   =
  484             a->i_cost16x16direct =
  485             a->i_cost8x8bi     =
  486             a->i_cost16x8bi    =
  487             a->i_cost8x16bi    = COST_MAX;
  488         }
  489
  490         /* Fast intra decision */
             /* never enabled for the first five macroblocks of the slice */
  491         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
  492         {
                 /* intra is considered likely if any neighbour is intra, if (P slices)
                  * the co-located MB of the first list-0 reference is intra, or if
                  * more than a third of the MBs counted so far are intra */
  493             if(   IS_INTRA( h->mb.i_mb_type_left )
  494                || IS_INTRA( h->mb.i_mb_type_top )
  495                || IS_INTRA( h->mb.i_mb_type_topleft )
  496                || IS_INTRA( h->mb.i_mb_type_topright )
  497                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
  498                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
  499             { /* intra is likely */ }
  500             else
  501             {
  502                 a->b_fast_intra = 1;
  503             }
  504         }
  505         h->mb.b_skip_mc = 0;
             /* periodic intra refresh: macroblocks inside the refresh column range of
              * a P slice are forced intra, which also disables the fast-intra shortcut */
  506         if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
  507             h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
  508         {
  509             a->b_force_intra = 1;
  510             a->b_fast_intra = 0;
  511         }
  512         else
  513             a->b_force_intra = 0;
  514     }
  515 }
 516
  517 /* Prediction modes allowed for various combinations of neighbors. */
  518 /* Terminated by a -1. */
  519 /* In order, no neighbors, left, top, top/left, top/left/topleft */
     /* The predict_*_mode_available() helpers pick the row: row 4 whenever
      * MB_TOPLEFT is set in the neighbour mask, otherwise the masked
      * MB_TOP|MB_LEFT value is used directly as the row index.
      * The -1 padding is significant.  Callers probe a fixed slot (slot 3 of a
      * 16x16 or chroma row, slot 8 of a 4x4 row) to test whether the complete
      * mode set is available, so every unused slot must stay negative. */
  520 static const int8_t i16x16_mode_available[5][5] =
  521 {
  522     {I_PRED_16x16_DC_128, -1, -1, -1, -1},
  523     {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
  524     {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
  525     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
  526     {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
  527 };
  528
     /* Same layout as above, for 8x8 chroma prediction. */
  529 static const int8_t i8x8chroma_mode_available[5][5] =
  530 {
  531     {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
  532     {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
  533     {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
  534     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
  535     {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
  536 };
  537
     /* Same layout, for 4x4 prediction modes; the intra 8x8 search in
      * x264_mb_analyse_intra() also reads its mode lists from this table.
      * The full row lists DC, H and V first. */
  538 static const int8_t i4x4_mode_available[5][10] =
  539 {
  540     {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  541     {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
  542     {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
  543     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
  544     {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
  545 };
 546
 547 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
 548 {
 549     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 550     return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 551 }
 552
 553 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 554 {
 555     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 556     return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
 557 }
 558
 559 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
 560 {
 561     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 562     return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 563 }
 564
 565 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 566 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 567 {
 568     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 569
 570     if( do_both_dct || h->mb.b_transform_8x8 )
 571         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 572     if( do_both_dct || !h->mb.b_transform_8x8 )
 573         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 574 }
 575
 576 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
 577 static inline void x264_mb_cache_fenc_satd( x264_t *h )
 578 {
 579     ALIGNED_16( static uint8_t zero[16] ) = {0};
 580     uint8_t *fenc;
 581     int x, y, satd_sum = 0, sa8d_sum = 0;
 582     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 583         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 584     if( !h->mb.i_psy_rd )
 585         return;
 586     for( y = 0; y < 4; y++ )
 587         for( x = 0; x < 4; x++ )
 588         {
 589             fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
 590             h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
 591                                       - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
 592             satd_sum += h->mb.pic.fenc_satd[y][x];
 593         }
 594     for( y = 0; y < 2; y++ )
 595         for( x = 0; x < 2; x++ )
 596         {
 597             fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
 598             h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
 599                                       - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
 600             sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
 601         }
 602     h->mb.pic.fenc_satd_sum = satd_sum;
 603     h->mb.pic.fenc_sa8d_sum = sa8d_sum;
 604 }
 605
 606 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 607 {
 608     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 609
 610     if( a->i_satd_i8x8chroma < COST_MAX )
 611         return;
 612
 613     const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 614
 615     /* 8x8 prediction selection for chroma */
 616     if( predict_mode[3] >= 0 && b_merged_satd )
 617     {
 618         int satdu[4], satdv[4];
 619         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 620         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 621         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 622         h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 623         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 624         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 625
 626         for( ; *predict_mode >= 0; predict_mode++ )
 627         {
 628             int i_mode = *predict_mode;
 629             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 630
 631             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 632             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 633         }
 634     }
 635     else
 636     {
 637         for( ; *predict_mode >= 0; predict_mode++ )
 638         {
 639             int i_satd;
 640             int i_mode = *predict_mode;
 641
 642             /* we do the prediction */
 643             if( h->mb.b_lossless )
 644                 x264_predict_lossless_8x8_chroma( h, i_mode );
 645             else
 646             {
 647                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 648                 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 649             }
 650
 651             /* we calculate the cost */
 652             i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 653                      h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 654                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 655
 656             a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
 657             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 658         }
 659     }
 660
 661     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 662 }
 663
 664 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 665 {
 666     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 667     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 668     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 669
 670     int i, idx;
 671     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 672
 673     /*---------------- Try all mode and calculate their score ---------------*/
 674
 675     /* 16x16 prediction selection */
 676     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 677
 678     if( b_merged_satd && predict_mode[3] >= 0 )
 679     {
 680         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 681         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 682         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 683             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 684         for( i=0; i<4; i++ )
 685         {
 686             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 687             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 688         }
 689     }
 690     else
 691     {
 692         for( ; *predict_mode >= 0; predict_mode++ )
 693         {
 694             int i_satd;
 695             int i_mode = *predict_mode;
 696
 697             if( h->mb.b_lossless )
 698                 x264_predict_lossless_16x16( h, i_mode );
 699             else
 700                 h->predict_16x16[i_mode]( p_dst );
 701
 702             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 703                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 704             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 705             a->i_satd_i16x16_dir[i_mode] = i_satd;
 706         }
 707     }
 708
 709     if( h->sh.i_type == SLICE_TYPE_B )
 710         /* cavlc mb type prefix */
 711         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 712     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 713         return;
 714
 715     /* 8x8 prediction selection */
 716     if( flags & X264_ANALYSE_I8x8 )
 717     {
 718         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 719         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 720         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 721         int i_cost = 0;
 722         h->mb.i_cbp_luma = 0;
 723         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 724
 725         // FIXME some bias like in i4x4?
 726         if( h->sh.i_type == SLICE_TYPE_B )
 727             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 728
 729         for( idx = 0;; idx++ )
 730         {
 731             int x = idx&1;
 732             int y = idx>>1;
 733             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 734             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 735             int i_best = COST_MAX;
 736             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 737
 738             predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
 739             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 740
 741             if( b_merged_satd && predict_mode[8] >= 0 )
 742             {
 743                 int satd[9];
 744                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 745                 satd[i_pred_mode] -= 3 * a->i_lambda;
 746                 for( i=2; i>=0; i-- )
 747                 {
 748                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 749                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 750                 }
 751                 predict_mode += 3;
 752             }
 753
 754             for( ; *predict_mode >= 0; predict_mode++ )
 755             {
 756                 int i_satd;
 757                 int i_mode = *predict_mode;
 758
 759                 if( h->mb.b_lossless )
 760                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 761                 else
 762                     h->predict_8x8[i_mode]( p_dst_by, edge );
 763
 764                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
 765                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 766                     i_satd -= a->i_lambda * 3;
 767
 768                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 769                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 770             }
 771             i_cost += i_best;
 772
 773             if( idx == 3 || i_cost > i_satd_thresh )
 774                 break;
 775
 776             /* we need to encode this block now (for next ones) */
 777             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 778             x264_mb_encode_i8x8( h, idx, a->i_qp );
 779
 780             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 781         }
 782
 783         if( idx == 3 )
 784         {
 785             a->i_satd_i8x8 = i_cost;
 786             if( h->mb.i_skip_intra )
 787             {
 788                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 789                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 790                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 791                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 792                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 793                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 794                 if( h->mb.i_skip_intra == 2 )
 795                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 796             }
 797         }
 798         else
 799         {
 800             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 801             a->i_satd_i8x8 = COST_MAX;
 802             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 803         }
 804         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 805             return;
 806     }
 807
 808     /* 4x4 prediction selection */
 809     if( flags & X264_ANALYSE_I4x4 )
 810     {
 811         int i_cost;
 812         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 813         h->mb.i_cbp_luma = 0;
 814         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 815         if( a->i_mbrd )
 816             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 817
 818         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 819         if( h->sh.i_type == SLICE_TYPE_B )
 820             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 821
 822         for( idx = 0;; idx++ )
 823         {
 824             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 825             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 826             int i_best = COST_MAX;
 827             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 828
 829             const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 830
 831             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 832                 /* emulate missing topright samples */
 833                 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 834
 835             if( b_merged_satd && predict_mode[5] >= 0 )
 836             {
 837                 int satd[9];
 838                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 839                 satd[i_pred_mode] -= 3 * a->i_lambda;
 840                 for( i=2; i>=0; i-- )
 841                     COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
 842                 predict_mode += 3;
 843             }
 844
 845             for( ; *predict_mode >= 0; predict_mode++ )
 846             {
 847                 int i_satd;
 848                 int i_mode = *predict_mode;
 849
 850                 if( h->mb.b_lossless )
 851                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 852                 else
 853                     h->predict_4x4[i_mode]( p_dst_by );
 854
 855                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 856                 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 857                     i_satd -= a->i_lambda * 3;
 858
 859                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 860             }
 861             i_cost += i_best + 4 * a->i_lambda;
 862
 863             if( i_cost > i_satd_thresh || idx == 15 )
 864                 break;
 865
 866             /* we need to encode this block now (for next ones) */
 867             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 868             x264_mb_encode_i4x4( h, idx, a->i_qp );
 869
 870             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 871         }
 872         if( idx == 15 )
 873         {
 874             a->i_satd_i4x4 = i_cost;
 875             if( h->mb.i_skip_intra )
 876             {
 877                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 878                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 879                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 880                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 881                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 882                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 883                 if( h->mb.i_skip_intra == 2 )
 884                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 885             }
 886         }
 887         else
 888             a->i_satd_i4x4 = COST_MAX;
 889     }
 890 }
 891
 892 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 893 {
 894     if( a->i_satd_i16x16 <= i_satd_thresh )
 895     {
 896         h->mb.i_type = I_16x16;
 897         x264_analyse_update_cache( h, a );
 898         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 899     }
 900     else
 901         a->i_satd_i16x16 = COST_MAX;
 902
 903     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 904     {
 905         h->mb.i_type = I_4x4;
 906         x264_analyse_update_cache( h, a );
 907         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 908     }
 909     else
 910         a->i_satd_i4x4 = COST_MAX;
 911
 912     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 913     {
 914         h->mb.i_type = I_8x8;
 915         x264_analyse_update_cache( h, a );
 916         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 917         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 918     }
 919     else
 920         a->i_satd_i8x8 = COST_MAX;
 921 }
 922
/* Refine the intra prediction modes of the current macroblock using RD cost.
 *
 * Depending on h->mb.i_type this re-evaluates:
 *   - I_16x16: the other available 16x16 modes whose stored cost
 *     (a->i_satd_i16x16_dir) is within 9/8 of the current mode's;
 *   - I_4x4:   every available mode of each of the 16 4x4 blocks;
 *   - I_8x8:   the available modes of each 8x8 block whose stored cost
 *     (a->i_satd_i8x8_dir) is within 11/8 of the current mode's.
 * Independently of the luma type, the chroma prediction mode is re-evaluated
 * when more than one chroma mode is available.
 *
 * Winners are written to a->i_predict16x16 / a->i_predict4x4[] /
 * a->i_predict8x8[] / a->i_predict8x8chroma and to the macroblock cache.
 * h->mb.i_skip_intra is cleared on entry. */
static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];

    int i, idx, x, y;
    int i_mode, i_thresh;
    uint64_t i_satd, i_best;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        /* only modes whose stored cost is within 9/8 of the current one are tried */
        i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
        i_best = a->i_satd_i16x16;
        /* the mode list is terminated by a negative entry */
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            /* NOTE: shadows the function-scope i_mode */
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
    /* nothing to choose from if fewer than two chroma modes are available */
    if( predict_mode[1] >= 0 )
    {
        int8_t predict_mode_sorted[4];
        int i_max;
        i_thresh = a->i_satd_i8x8chroma * 5/4;

        /* collect the candidates: every mode other than the current one whose
         * stored cost is below 5/4 of the current chroma cost */
        for( i_max = 0; *predict_mode >= 0; predict_mode++ )
        {
            i_mode = *predict_mode;
            if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                predict_mode_sorted[i_max++] = i_mode;
        }

        if( i_max > 0 )
        {
            int i_cbp_chroma_best = h->mb.i_cbp_chroma;
            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
            /* the previous thing encoded was x264_intra_rd(), so the pixels and
             * coefs for the current chroma mode are still around, so we only
             * have to recount the bits. */
            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode_sorted[i];
                /* predict both chroma planes with the candidate mode */
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8_chroma( h, i_mode );
                else
                {
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
                }
                /* if we've already found a mode that needs no residual, then
                 * probably any mode with a residual will be worse.
                 * so avoid dct on the remaining modes to improve speed. */
                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
            }
            /* commit the winning chroma mode and the cbp that went with it */
            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
            h->mb.i_cbp_chroma = i_cbp_chroma_best;
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
        int i_nnz = 0;
        for( idx = 0; idx < 16; idx++ )
        {
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                i_mode = *predict_mode;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                else
                    h->predict_4x4[i_mode]( p_dst_by );
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    /* new best mode: snapshot the four rows of the block in
                     * p_dst and its nnz cache entry as they stand after the
                     * cost call, so they can be put back once all modes of
                     * this block have been tried */
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                }
            }

            /* restore the state recorded for the winning mode */
            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        for( idx = 0; idx < 4; idx++ )
        {
            uint64_t pels_h = 0;
            uint8_t pels_v[7];
            uint16_t i_nnz[2] = {0}; //shut up gcc
            uint8_t *p_dst_by;
            int j;
            int cbp_luma_new = 0;
            /* only modes whose stored cost is within 11/8 of the current one are tried */
            i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;

            i_best = COST_MAX64;
            /* 8x8 block position inside the macroblock, in units of 8 pixels */
            x = idx&1;
            y = idx>>1;

            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
                    continue;

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );
                /* every candidate starts from the same luma cbp */
                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    /* snapshot only the bottom row of the block and, for the
                     * left-hand blocks (idx 0 and 2), rows 0-6 of its last
                     * column, plus the block's four nnz cache entries.
                     * NOTE(review): presumably these are the only samples
                     * later blocks predict from -- confirm. */
                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                    if( !(idx&1) )
                        for( j=0; j<7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
                }
            }
            /* restore the state recorded for the winning mode; the updated
             * cbp is carried over to the next 8x8 block */
            a->i_cbp_i8x8_luma = cbp_luma_new;
            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
            if( !(idx&1) )
                for( j=0; j<7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}
1098
/* Set up the source-side fields of the x264_me_t pointed to by m:
 * the mv cost table, the two strides, and pointers into the three planes of
 * src at luma offset (xoff,yoff).  Planes 1 and 2 use half the luma offsets.
 * Relies on `a` (for p_cost_mv) and `h` (for the strides) being in scope at
 * the expansion site.  Expands to several statements, so it must not be used
 * as the unbraced body of an if/else/loop. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1106
/* Set up the reference-side fields of the x264_me_t pointed to by m for
 * reference `ref` of list `list`, at luma offset (xoff,yoff).
 * src[0..3] are addressed with (m)->i_stride[0]; src[4..5] with
 * (m)->i_stride[1] and halved offsets.  (m)->i_stride must therefore already
 * be valid, e.g. from LOAD_FENC.
 * NOTE(review): src[0..3] are presumably the full-pel and half-pel luma
 * planes and src[4..5] the chroma planes -- confirm.
 * p_fref_w is pointed at the same data as p_fref[0] and the weight is reset
 * to weight_none; LOAD_WPELS may override both afterwards.
 * Relies on `h` being in scope.  Expands to several statements. */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \
    (m)->i_ref = ref;
1117
/* Override the weighted-reference fields of the x264_me_t pointed to by m:
 * p_fref_w is pointed into the plane `src` at luma offset (xoff,yoff) and the
 * weight is taken from h->sh.weight[ref].
 * The weight is indexed with the `ref` argument rather than with whatever
 * variable named i_ref happens to be in scope at the expansion site, so the
 * macro no longer silently depends on the caller's local naming.
 * `list` is accepted for symmetry with LOAD_HPELS and is unused.
 * Relies on `h` being in scope and on (m)->i_stride[0] being valid.
 * Expands to several statements. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[(ref)];
1121
/* Cost charged for using reference `ref` of list `list`, looked up in the
 * analysis context's a->p_cost_ref table.  Relies on `a` being in scope. */
#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
1124
1125 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1126 {
1127     x264_me_t m;
1128     int i_ref, i_mvc;
1129     ALIGNED_4( int16_t mvc[8][2] );
1130     int i_halfpel_thresh = INT_MAX;
1131     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1132
1133     /* 16x16 Search on all ref frame */
1134     m.i_pixel = PIXEL_16x16;
1135     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1136
1137     a->l0.me16x16.cost = INT_MAX;
1138     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1139     {
1140         const int i_ref_cost = REF_COST( 0, i_ref );
1141         i_halfpel_thresh -= i_ref_cost;
1142         m.i_ref_cost = i_ref_cost;
1143
1144         /* search with ref */
1145         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1146         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1147
1148         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1149         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1150
1151         if( h->mb.ref_blind_dupe == i_ref )
1152         {
1153             CP32( m.mv, a->l0.mvc[0][0] );
1154             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1155         }
1156         else
1157             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1158
1159         /* early termination
1160          * SSD threshold would probably be better than SATD */
1161         if( i_ref == 0
1162             && a->b_try_pskip
1163             && m.cost-m.cost_mv < 300*a->i_lambda
1164             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1165               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1166             && x264_macroblock_probe_pskip( h ) )
1167         {
1168             h->mb.i_type = P_SKIP;
1169             x264_analyse_update_cache( h, a );
1170             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
1171             return;
1172         }
1173
1174         m.cost += i_ref_cost;
1175         i_halfpel_thresh += i_ref_cost;
1176
1177         if( m.cost < a->l0.me16x16.cost )
1178             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1179
1180         /* save mv for predicting neighbors */
1181         CP32( a->l0.mvc[i_ref][0], m.mv );
1182         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1183     }
1184
1185     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1186     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
1187
1188     h->mb.i_type = P_L0;
1189     if( a->i_mbrd )
1190     {
1191         x264_mb_cache_fenc_satd( h );
1192         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1193         {
1194             h->mb.i_partition = D_16x16;
1195             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1196             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1197             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1198                 h->mb.i_type = P_SKIP;
1199         }
1200     }
1201 }
1202
1203 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1204 {
1205     x264_me_t m;
1206     int i_ref;
1207     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1208     int i_halfpel_thresh = INT_MAX;
1209     int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1210     int i;
1211     int i_maxref = h->mb.pic.i_fref[0]-1;
1212
1213     h->mb.i_partition = D_8x8;
1214
1215     #define CHECK_NEIGHBOUR(i)\
1216     {\
1217         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1218         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1219             i_maxref = ref;\
1220     }
1221
1222     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1223      * than those used by the neighbors */
1224     if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1225         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1226     {
1227         i_maxref = 0;
1228         CHECK_NEIGHBOUR(  -8 - 1 );
1229         CHECK_NEIGHBOUR(  -8 + 0 );
1230         CHECK_NEIGHBOUR(  -8 + 2 );
1231         CHECK_NEIGHBOUR(  -8 + 4 );
1232         CHECK_NEIGHBOUR(   0 - 1 );
1233         CHECK_NEIGHBOUR( 2*8 - 1 );
1234     }
1235
1236     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1237         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1238
1239     for( i = 0; i < 4; i++ )
1240     {
1241         x264_me_t *l0m = &a->l0.me8x8[i];
1242         const int x8 = i%2;
1243         const int y8 = i/2;
1244
1245         m.i_pixel = PIXEL_8x8;
1246
1247         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1248         l0m->cost = INT_MAX;
1249         for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1250         {
1251             const int i_ref_cost = REF_COST( 0, i_ref );
1252             m.i_ref_cost = i_ref_cost;
1253
1254             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1255             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1256
1257             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1258             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1259             if( h->mb.ref_blind_dupe == i_ref )
1260             {
1261                 CP32( m.mv, a->l0.mvc[0][i+1] );
1262                 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1263             }
1264             else
1265                 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1266
1267             m.cost += i_ref_cost;
1268             i_halfpel_thresh += i_ref_cost;
1269             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1270
1271             if( m.cost < l0m->cost )
1272                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1273             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1274                 i_ref = h->mb.ref_blind_dupe;
1275             else
1276                 i_ref++;
1277         }
1278         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1279         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1280
1281         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1282            are effectively zero. */
1283         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1284             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1285     }
1286
1287     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1288                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1289     /* P_8x8 ref0 has no ref cost */
1290     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1291                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1292         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1293     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1294     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1295 }
1296
1297 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1298 {
1299     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1300      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1301      * don't bother analysing the dupes. */
1302     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1303     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1304     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1305     int i_mvc;
1306     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1307     int i;
1308
1309     /* XXX Needed for x264_mb_predict_mv */
1310     h->mb.i_partition = D_8x8;
1311
1312     i_mvc = 1;
1313     CP32( mvc[0], a->l0.me16x16.mv );
1314
1315     for( i = 0; i < 4; i++ )
1316     {
1317         x264_me_t *m = &a->l0.me8x8[i];
1318         const int x8 = i%2;
1319         const int y8 = i/2;
1320
1321         m->i_pixel = PIXEL_8x8;
1322         m->i_ref_cost = i_ref_cost;
1323
1324         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1325         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1326         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1327
1328         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1329         x264_me_search( h, m, mvc, i_mvc );
1330
1331         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1332
1333         CP32( mvc[i_mvc], m->mv );
1334         i_mvc++;
1335
1336         /* mb type cost */
1337         m->cost += i_ref_cost;
1338         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1339             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1340     }
1341
1342     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1343                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1344     /* theoretically this should include 4*ref_cost,
1345      * but 3 seems a better approximation of cabac. */
1346     if( h->param.b_cabac )
1347         a->l0.i_cost8x8 -= i_ref_cost;
1348     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1349     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1350 }
1351
1352 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1353 {
1354     x264_me_t m;
1355     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1356     ALIGNED_4( int16_t mvc[3][2] );
1357     int i, j;
1358
1359     /* XXX Needed for x264_mb_predict_mv */
1360     h->mb.i_partition = D_16x8;
1361
1362     for( i = 0; i < 2; i++ )
1363     {
1364         x264_me_t *l0m = &a->l0.me16x8[i];
1365         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1366         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1367         const int ref8[2] = { minref, maxref };
1368         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1369
1370         m.i_pixel = PIXEL_16x8;
1371
1372         LOAD_FENC( &m, p_fenc, 0, 8*i );
1373         l0m->cost = INT_MAX;
1374         for( j = 0; j < i_ref8s; j++ )
1375         {
1376             const int i_ref = ref8[j];
1377             const int i_ref_cost = REF_COST( 0, i_ref );
1378             m.i_ref_cost = i_ref_cost;
1379
1380             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1381             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1382             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1383             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1384
1385             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1386             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1387
1388             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1389             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1390             /* We can only take this shortcut if the first search was performed on ref0. */
1391             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1392             {
1393                 /* We can just leave the MV from the previous ref search. */
1394                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1395             }
1396             else
1397                 x264_me_search( h, &m, mvc, 3 );
1398
1399             m.cost += i_ref_cost;
1400
1401             if( m.cost < l0m->cost )
1402                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1403         }
1404         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1405         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1406     }
1407
1408     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1409 }
1410
1411 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1412 {
1413     x264_me_t m;
1414     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1415     ALIGNED_4( int16_t mvc[3][2] );
1416     int i, j;
1417
1418     /* XXX Needed for x264_mb_predict_mv */
1419     h->mb.i_partition = D_8x16;
1420
1421     for( i = 0; i < 2; i++ )
1422     {
1423         x264_me_t *l0m = &a->l0.me8x16[i];
1424         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1425         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1426         const int ref8[2] = { minref, maxref };
1427         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1428
1429         m.i_pixel = PIXEL_8x16;
1430
1431         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1432         l0m->cost = INT_MAX;
1433         for( j = 0; j < i_ref8s; j++ )
1434         {
1435             const int i_ref = ref8[j];
1436             const int i_ref_cost = REF_COST( 0, i_ref );
1437             m.i_ref_cost = i_ref_cost;
1438
1439             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1440             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1441             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1442
1443             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1444             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1445
1446             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1447             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1448             /* We can only take this shortcut if the first search was performed on ref0. */
1449             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1450             {
1451                 /* We can just leave the MV from the previous ref search. */
1452                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1453             }
1454             else
1455                 x264_me_search( h, &m, mvc, 3 );
1456
1457             m.cost += i_ref_cost;
1458
1459             if( m.cost < l0m->cost )
1460                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1461         }
1462         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1463         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1464     }
1465
1466     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1467 }
1468
1469 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1470 {
1471     ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1472     uint8_t *pix2 = pix1+8;
1473     const int i_stride = h->mb.pic.i_stride[1];
1474     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1475     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1476     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1477     const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1478     x264_weight_t *weight = h->sh.weight[i_ref];
1479
1480 #define CHROMA4x4MC( width, height, me, x, y ) \
1481     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1482     if( weight[1].weightfn ) \
1483         weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1484     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1485     if( weight[2].weightfn ) \
1486         weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1487
1488
1489     if( pixel == PIXEL_4x4 )
1490     {
1491         x264_me_t *m = a->l0.me4x4[i8x8];
1492         CHROMA4x4MC( 2,2, m[0], 0,0 );
1493         CHROMA4x4MC( 2,2, m[1], 2,0 );
1494         CHROMA4x4MC( 2,2, m[2], 0,2 );
1495         CHROMA4x4MC( 2,2, m[3], 2,2 );
1496     }
1497     else if( pixel == PIXEL_8x4 )
1498     {
1499         x264_me_t *m = a->l0.me8x4[i8x8];
1500         CHROMA4x4MC( 4,2, m[0], 0,0 );
1501         CHROMA4x4MC( 4,2, m[1], 0,2 );
1502     }
1503     else
1504     {
1505         x264_me_t *m = a->l0.me4x8[i8x8];
1506         CHROMA4x4MC( 2,4, m[0], 0,0 );
1507         CHROMA4x4MC( 2,4, m[1], 2,0 );
1508     }
1509
1510     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1511          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1512 }
1513
1514 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1515 {
1516     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1517     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1518     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1519     int i4x4;
1520
1521     /* XXX Needed for x264_mb_predict_mv */
1522     h->mb.i_partition = D_8x8;
1523
1524     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1525     {
1526         const int idx = 4*i8x8 + i4x4;
1527         const int x4 = block_idx_x[idx];
1528         const int y4 = block_idx_y[idx];
1529         const int i_mvc = (i4x4 == 0);
1530
1531         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1532
1533         m->i_pixel = PIXEL_4x4;
1534
1535         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1536         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1537         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1538
1539         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1540         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1541
1542         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1543     }
1544     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1545                             a->l0.me4x4[i8x8][1].cost +
1546                             a->l0.me4x4[i8x8][2].cost +
1547                             a->l0.me4x4[i8x8][3].cost +
1548                             REF_COST( 0, i_ref ) +
1549                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1550     if( h->mb.b_chroma_me )
1551         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1552 }
1553
1554 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1555 {
1556     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1557     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1558     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1559     int i8x4;
1560
1561     /* XXX Needed for x264_mb_predict_mv */
1562     h->mb.i_partition = D_8x8;
1563
1564     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1565     {
1566         const int idx = 4*i8x8 + 2*i8x4;
1567         const int x4 = block_idx_x[idx];
1568         const int y4 = block_idx_y[idx];
1569         const int i_mvc = (i8x4 == 0);
1570
1571         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1572
1573         m->i_pixel = PIXEL_8x4;
1574
1575         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1576         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1577         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1578
1579         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1580         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1581
1582         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1583     }
1584     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1585                             REF_COST( 0, i_ref ) +
1586                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1587     if( h->mb.b_chroma_me )
1588         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1589 }
1590
1591 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1592 {
1593     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1594     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1595     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1596     int i4x8;
1597
1598     /* XXX Needed for x264_mb_predict_mv */
1599     h->mb.i_partition = D_8x8;
1600
1601     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1602     {
1603         const int idx = 4*i8x8 + i4x8;
1604         const int x4 = block_idx_x[idx];
1605         const int y4 = block_idx_y[idx];
1606         const int i_mvc = (i4x8 == 0);
1607
1608         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1609
1610         m->i_pixel = PIXEL_4x8;
1611
1612         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1613         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1614         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1615
1616         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1617         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1618
1619         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1620     }
1621     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1622                             REF_COST( 0, i_ref ) +
1623                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1624     if( h->mb.b_chroma_me )
1625         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1626 }
1627
1628 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1629 {
1630     /* Assumes that fdec still contains the results of
1631      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1632
1633     uint8_t **p_fenc = h->mb.pic.p_fenc;
1634     uint8_t **p_fdec = h->mb.pic.p_fdec;
1635     int i;
1636
1637     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1638     for( i = 0; i < 4; i++ )
1639     {
1640         const int x = (i&1)*8;
1641         const int y = (i>>1)*8;
1642         a->i_cost16x16direct +=
1643         a->i_cost8x8direct[i] =
1644             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1645
1646         /* mb type cost */
1647         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1648     }
1649 }
1650
1651 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1652 {
1653     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1654     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1655     uint8_t *src0, *src1;
1656     int stride0 = 16, stride1 = 16;
1657
1658     x264_me_t m;
1659     int i_ref, i_mvc;
1660     ALIGNED_4( int16_t mvc[9][2] );
1661     int i_halfpel_thresh = INT_MAX;
1662     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1663
1664     /* 16x16 Search on all ref frame */
1665     m.i_pixel = PIXEL_16x16;
1666     m.weight = weight_none;
1667
1668     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1669
1670     /* ME for List 0 */
1671     a->l0.me16x16.cost = INT_MAX;
1672     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1673     {
1674         const int i_ref_cost = REF_COST( 0, i_ref );
1675         m.i_ref_cost = i_ref_cost;
1676         /* search with ref */
1677         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1678         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1679         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1680         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1681
1682         /* add ref cost */
1683         m.cost += i_ref_cost;
1684
1685         if( m.cost < a->l0.me16x16.cost )
1686         {
1687             a->l0.i_ref = i_ref;
1688             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1689         }
1690
1691         /* save mv for predicting neighbors */
1692         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1693     }
1694     a->l0.me16x16.i_ref = a->l0.i_ref;
1695
1696     /* ME for list 1 */
1697     i_halfpel_thresh = INT_MAX;
1698     p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1699     a->l1.me16x16.cost = INT_MAX;
1700     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1701     {
1702         const int i_ref_cost = REF_COST( 0, i_ref );
1703         m.i_ref_cost = i_ref_cost;
1704         /* search with ref */
1705         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1706         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1707         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1708         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1709
1710         /* add ref cost */
1711         m.cost += i_ref_cost;
1712
1713         if( m.cost < a->l1.me16x16.cost )
1714         {
1715             a->l1.i_ref = i_ref;
1716             h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1717         }
1718
1719         /* save mv for predicting neighbors */
1720         CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1721     }
1722     a->l1.me16x16.i_ref = a->l1.i_ref;
1723
1724     /* get cost of BI mode */
1725     src0 = h->mc.get_ref( pix0, &stride0,
1726                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1727                           a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
1728     src1 = h->mc.get_ref( pix1, &stride1,
1729                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1730                           a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
1731
1732     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1733
1734     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1735                      + REF_COST( 0, a->l0.i_ref )
1736                      + REF_COST( 1, a->l1.i_ref )
1737                      + a->l0.me16x16.cost_mv
1738                      + a->l1.me16x16.cost_mv;
1739
1740     /* mb type cost */
1741     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1742     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1743     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1744 }
1745
1746 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1747 {
1748     const int x = 2*(i%2);
1749     const int y = 2*(i/2);
1750
1751     switch( h->mb.i_sub_partition[i] )
1752     {
1753         case D_L0_8x8:
1754             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1755             break;
1756         case D_L0_8x4:
1757             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1758             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1759             break;
1760         case D_L0_4x8:
1761             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1762             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1763             break;
1764         case D_L0_4x4:
1765             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1766             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1767             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1768             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1769             break;
1770         default:
1771             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1772             break;
1773     }
1774 }
1775
1776 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1777     if( x264_mb_partition_listX_table[0][part] ) \
1778     { \
1779         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1780         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1781     } \
1782     else \
1783     { \
1784         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1785         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
1786         if( b_mvd ) \
1787             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1788     } \
1789     if( x264_mb_partition_listX_table[1][part] ) \
1790     { \
1791         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1792         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1793     } \
1794     else \
1795     { \
1796         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1797         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
1798         if( b_mvd ) \
1799             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1800     }
1801
1802 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1803 {
1804     int x = (i%2)*2;
1805     int y = (i/2)*2;
1806     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1807     {
1808         x264_mb_load_mv_direct8x8( h, i );
1809         if( b_mvd )
1810         {
1811             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1812             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1813             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1814         }
1815     }
1816     else
1817     {
1818         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1819     }
1820 }
1821 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1822 {
1823     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1824 }
1825 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1826 {
1827     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1828 }
1829 #undef CACHE_MV_BI
1830
1831 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1832 {
1833     uint8_t **p_fref[2] =
1834         { h->mb.pic.p_fref[0][a->l0.i_ref],
1835           h->mb.pic.p_fref[1][a->l1.i_ref] };
1836     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1837     int i, l;
1838
1839     /* XXX Needed for x264_mb_predict_mv */
1840     h->mb.i_partition = D_8x8;
1841
1842     a->i_cost8x8bi = 0;
1843
1844     for( i = 0; i < 4; i++ )
1845     {
1846         const int x8 = i%2;
1847         const int y8 = i/2;
1848         int i_part_cost;
1849         int i_part_cost_bi = 0;
1850         int stride[2] = {8,8};
1851         uint8_t *src[2];
1852
1853         for( l = 0; l < 2; l++ )
1854         {
1855             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1856             const int i_ref_cost = REF_COST( l, lX->i_ref );
1857             x264_me_t *m = &lX->me8x8[i];
1858
1859             m->i_pixel = PIXEL_8x8;
1860             m->i_ref_cost = i_ref_cost;
1861
1862             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1863             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1864
1865             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1866             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1867             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1868             m->cost += i_ref_cost;
1869
1870             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1871
1872             /* BI mode */
1873             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1874                                     m->mv[0], m->mv[1], 8, 8, weight_none );
1875             i_part_cost_bi += m->cost_mv + i_ref_cost;
1876         }
1877         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1878         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1879                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1880         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1881         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1882
1883         i_part_cost = a->l0.me8x8[i].cost;
1884         h->mb.i_sub_partition[i] = D_L0_8x8;
1885         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1886         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1887         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1888         a->i_cost8x8bi += i_part_cost;
1889
1890         /* XXX Needed for x264_mb_predict_mv */
1891         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1892     }
1893
1894     /* mb type cost */
1895     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1896 }
1897
1898 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1899 {
1900     uint8_t **p_fref[2] =
1901         { h->mb.pic.p_fref[0][a->l0.i_ref],
1902           h->mb.pic.p_fref[1][a->l1.i_ref] };
1903     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1904     ALIGNED_4( int16_t mvc[2][2] );
1905     int i, l;
1906
1907     h->mb.i_partition = D_16x8;
1908     a->i_cost16x8bi = 0;
1909
1910     for( i = 0; i < 2; i++ )
1911     {
1912         int i_part_cost;
1913         int i_part_cost_bi = 0;
1914         int stride[2] = {16,16};
1915         uint8_t *src[2];
1916
1917         /* TODO: check only the list(s) that were used in b8x8? */
1918         for( l = 0; l < 2; l++ )
1919         {
1920             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1921             const int i_ref_cost = REF_COST( l, lX->i_ref );
1922             x264_me_t *m = &lX->me16x8[i];
1923
1924             m->i_pixel = PIXEL_16x8;
1925             m->i_ref_cost = i_ref_cost;
1926
1927             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1928             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1929
1930             CP32( mvc[0], lX->me8x8[2*i].mv );
1931             CP32( mvc[1], lX->me8x8[2*i+1].mv );
1932
1933             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1934             x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1935             x264_me_search( h, m, mvc, 2 );
1936             m->cost += i_ref_cost;
1937
1938             /* BI mode */
1939             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1940                                     m->mv[0], m->mv[1], 16, 8, weight_none );
1941             i_part_cost_bi += m->cost_mv + i_ref_cost;
1942         }
1943         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1944         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1945
1946         i_part_cost = a->l0.me16x8[i].cost;
1947         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1948         if( a->l1.me16x8[i].cost < i_part_cost )
1949         {
1950             i_part_cost = a->l1.me16x8[i].cost;
1951             a->i_mb_partition16x8[i] = D_L1_8x8;
1952         }
1953         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1954         {
1955             i_part_cost = i_part_cost_bi;
1956             a->i_mb_partition16x8[i] = D_BI_8x8;
1957         }
1958         a->i_cost16x8bi += i_part_cost;
1959
1960         x264_mb_cache_mv_b16x8( h, a, i, 0 );
1961     }
1962
1963     /* mb type cost */
1964     a->i_mb_type16x8 = B_L0_L0
1965         + (a->i_mb_partition16x8[0]>>2) * 3
1966         + (a->i_mb_partition16x8[1]>>2);
1967     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1968 }
1969
1970 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1971 {
1972     uint8_t **p_fref[2] =
1973         { h->mb.pic.p_fref[0][a->l0.i_ref],
1974           h->mb.pic.p_fref[1][a->l1.i_ref] };
1975     ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
1976     ALIGNED_4( int16_t mvc[2][2] );
1977     int i, l;
1978
1979     h->mb.i_partition = D_8x16;
1980     a->i_cost8x16bi = 0;
1981
1982     for( i = 0; i < 2; i++ )
1983     {
1984         int i_part_cost;
1985         int i_part_cost_bi = 0;
1986         int stride[2] = {8,8};
1987         uint8_t *src[2];
1988
1989         for( l = 0; l < 2; l++ )
1990         {
1991             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1992             const int i_ref_cost = REF_COST( l, lX->i_ref );
1993             x264_me_t *m = &lX->me8x16[i];
1994
1995             m->i_pixel = PIXEL_8x16;
1996             m->i_ref_cost = i_ref_cost;
1997
1998             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1999             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
2000
2001             CP32( mvc[0], lX->me8x8[i].mv );
2002             CP32( mvc[1], lX->me8x8[i+2].mv );
2003
2004             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2005             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2006             x264_me_search( h, m, mvc, 2 );
2007             m->cost += i_ref_cost;
2008
2009             /* BI mode */
2010             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
2011                                     m->mv[0], m->mv[1], 8, 16, weight_none );
2012             i_part_cost_bi += m->cost_mv + i_ref_cost;
2013         }
2014
2015         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2016         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2017
2018         i_part_cost = a->l0.me8x16[i].cost;
2019         a->i_mb_partition8x16[i] = D_L0_8x8;
2020         if( a->l1.me8x16[i].cost < i_part_cost )
2021         {
2022             i_part_cost = a->l1.me8x16[i].cost;
2023             a->i_mb_partition8x16[i] = D_L1_8x8;
2024         }
2025         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2026         {
2027             i_part_cost = i_part_cost_bi;
2028             a->i_mb_partition8x16[i] = D_BI_8x8;
2029         }
2030         a->i_cost8x16bi += i_part_cost;
2031
2032         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2033     }
2034
2035     /* mb type cost */
2036     a->i_mb_type8x16 = B_L0_L0
2037         + (a->i_mb_partition8x16[0]>>2) * 3
2038         + (a->i_mb_partition8x16[1]>>2);
2039     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2040 }
2041
2042 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2043 {
2044     int thresh = i_satd * 5/4;
2045
2046     h->mb.i_type = P_L0;
2047     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2048     {
2049         h->mb.i_partition = D_16x16;
2050         x264_analyse_update_cache( h, a );
2051         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2052     }
2053
2054     if( a->l0.i_cost16x8 <= thresh )
2055     {
2056         h->mb.i_partition = D_16x8;
2057         x264_analyse_update_cache( h, a );
2058         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2059     }
2060     else
2061         a->l0.i_cost16x8 = COST_MAX;
2062
2063     if( a->l0.i_cost8x16 <= thresh )
2064     {
2065         h->mb.i_partition = D_8x16;
2066         x264_analyse_update_cache( h, a );
2067         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2068     }
2069     else
2070         a->l0.i_cost8x16 = COST_MAX;
2071
2072     if( a->l0.i_cost8x8 <= thresh )
2073     {
2074         h->mb.i_type = P_8x8;
2075         h->mb.i_partition = D_8x8;
2076         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2077         {
2078             int i;
2079             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2080             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2081             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2082             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2083             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2084              * for future blocks are those left over from previous RDO calls. */
2085             for( i = 0; i < 4; i++ )
2086             {
2087                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2088                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2089                 int subtype, btype = D_L0_8x8;
2090                 uint64_t bcost = COST_MAX64;
2091                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2092                 {
2093                     uint64_t cost;
2094                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2095                         continue;
2096                     h->mb.i_sub_partition[i] = subtype;
2097                     x264_mb_cache_mv_p8x8( h, a, i );
2098                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2099                     COPY2_IF_LT( bcost, cost, btype, subtype );
2100                 }
2101                 if( h->mb.i_sub_partition[i] != btype )
2102                 {
2103                     h->mb.i_sub_partition[i] = btype;
2104                     x264_mb_cache_mv_p8x8( h, a, i );
2105                 }
2106             }
2107         }
2108         else
2109             x264_analyse_update_cache( h, a );
2110         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2111     }
2112     else
2113         a->l0.i_cost8x8 = COST_MAX;
2114 }
2115
2116 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2117 {
2118     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2119
2120     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2121     {
2122         h->mb.i_type = B_DIRECT;
2123         /* Assumes direct/skip MC is still in fdec */
2124         /* Requires b-rdo to be done before intra analysis */
2125         h->mb.b_skip_mc = 1;
2126         x264_analyse_update_cache( h, a );
2127         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2128         h->mb.b_skip_mc = 0;
2129     }
2130
2131     //FIXME not all the update_cache calls are needed
2132     h->mb.i_partition = D_16x16;
2133     /* L0 */
2134     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2135     {
2136         h->mb.i_type = B_L0_L0;
2137         x264_analyse_update_cache( h, a );
2138         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2139     }
2140
2141     /* L1 */
2142     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2143     {
2144         h->mb.i_type = B_L1_L1;
2145         x264_analyse_update_cache( h, a );
2146         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2147     }
2148
2149     /* BI */
2150     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2151     {
2152         h->mb.i_type = B_BI_BI;
2153         x264_analyse_update_cache( h, a );
2154         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2155     }
2156
2157     /* 8x8 */
2158     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2159     {
2160         h->mb.i_type = B_8x8;
2161         h->mb.i_partition = D_8x8;
2162         x264_analyse_update_cache( h, a );
2163         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2164         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2165     }
2166
2167     /* 16x8 */
2168     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2169     {
2170         h->mb.i_type = a->i_mb_type16x8;
2171         h->mb.i_partition = D_16x8;
2172         x264_analyse_update_cache( h, a );
2173         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2174     }
2175
2176     /* 8x16 */
2177     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2178     {
2179         h->mb.i_type = a->i_mb_type8x16;
2180         h->mb.i_partition = D_8x16;
2181         x264_analyse_update_cache( h, a );
2182         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2183     }
2184 }
2185
2186 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2187 {
2188     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2189     int i;
2190
2191     if( IS_INTRA(h->mb.i_type) )
2192         return;
2193
2194     switch( h->mb.i_partition )
2195     {
2196         case D_16x16:
2197             if( h->mb.i_type == B_BI_BI )
2198                 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2199             break;
2200         case D_16x8:
2201             for( i=0; i<2; i++ )
2202                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2203                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2204             break;
2205         case D_8x16:
2206             for( i=0; i<2; i++ )
2207                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2208                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2209             break;
2210         case D_8x8:
2211             for( i=0; i<4; i++ )
2212                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2213                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2214             break;
2215     }
2216 }
2217
2218 static inline void x264_mb_analyse_transform( x264_t *h )
2219 {
2220     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2221     {
2222         int i_cost4, i_cost8;
2223         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2224         x264_mb_mc( h );
2225
2226         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2227                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2228         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2229                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2230
2231         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2232         h->mb.b_skip_mc = 1;
2233     }
2234 }
2235
2236 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2237 {
2238     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2239     {
2240         int i_rd8;
2241         x264_analyse_update_cache( h, a );
2242         h->mb.b_transform_8x8 ^= 1;
2243         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2244         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2245
2246         if( *i_rd >= i_rd8 )
2247         {
2248             if( *i_rd > 0 )
2249                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2250             *i_rd = i_rd8;
2251         }
2252         else
2253             h->mb.b_transform_8x8 ^= 1;
2254     }
2255 }
2256
2257 /* Rate-distortion optimal QP selection.
2258  * FIXME: More than half of the benefit of this function seems to be
2259  * in the way it improves the coding of chroma DC (by decimating or
2260  * finding a better way to code a single DC coefficient.)
2261  * There must be a more efficient way to get that portion of the benefit
2262  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2263  * trick. */
2264 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2265 {
2266     int bcost, cost, direction, failures, prevcost, origcost;
2267     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2268     int last_qp_tried = 0;
2269     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2270
2271     /* If CBP is already zero, don't raise the quantizer any higher. */
2272     for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2273     {
2274         /* Without psy-RD, require monotonicity when moving quant away from previous
2275          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2276          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2277          * allow 2 failures when moving quant towards previous quant.
2278          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2279         int threshold = (!!h->mb.i_psy_rd);
2280         /* Raise the threshold for failures if we're moving towards the last QP. */
2281         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2282             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2283             threshold++;
2284         h->mb.i_qp = orig_qp;
2285         failures = 0;
2286         prevcost = origcost;
2287         h->mb.i_qp += direction;
2288         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2289         {
2290             if( h->mb.i_last_qp == h->mb.i_qp )
2291                 last_qp_tried = 1;
2292             h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2293             cost = x264_rd_cost_mb( h, a->i_lambda2 );
2294             COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2295
2296             /* We can't assume that the costs are monotonic over QPs.
2297              * Tie case-as-failure seems to give better results. */
2298             if( cost < prevcost )
2299                 failures = 0;
2300             else
2301                 failures++;
2302             prevcost = cost;
2303
2304             if( failures > threshold )
2305                 break;
2306             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2307                 break;
2308             h->mb.i_qp += direction;
2309         }
2310     }
2311
2312     /* Always try the last block's QP. */
2313     if( !last_qp_tried )
2314     {
2315         h->mb.i_qp = h->mb.i_last_qp;
2316         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2317         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2318         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2319     }
2320
2321     h->mb.i_qp = bqp;
2322     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2323
2324     /* Check transform again; decision from before may no longer be optimal. */
2325     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2326         x264_mb_transform_8x8_allowed( h ) )
2327     {
2328         h->mb.b_transform_8x8 ^= 1;
2329         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2330         if( cost > bcost )
2331             h->mb.b_transform_8x8 ^= 1;
2332     }
2333 }
2334
2335 /*****************************************************************************
2336  * x264_macroblock_analyse:
2337  *****************************************************************************/
2338 void x264_macroblock_analyse( x264_t *h )
2339 {
2340     x264_mb_analysis_t analysis;
2341     int i_cost = COST_MAX;
2342     int i;
2343
2344     h->mb.i_qp = x264_ratecontrol_qp( h );
2345     if( h->param.rc.i_aq_mode )
2346     {
2347         x264_adaptive_quant( h );
2348         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2349          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2350         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2351             h->mb.i_qp = h->mb.i_last_qp;
2352     }
2353
2354     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2355
2356     /*--------------------------- Do the analysis ---------------------------*/
2357     if( h->sh.i_type == SLICE_TYPE_I )
2358     {
2359 intra_analysis:
2360         if( analysis.i_mbrd )
2361             x264_mb_cache_fenc_satd( h );
2362         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2363         if( analysis.i_mbrd )
2364             x264_intra_rd( h, &analysis, COST_MAX );
2365
2366         i_cost = analysis.i_satd_i16x16;
2367         h->mb.i_type = I_16x16;
2368         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2369         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2370         if( analysis.i_satd_pcm < i_cost )
2371             h->mb.i_type = I_PCM;
2372
2373         else if( analysis.i_mbrd >= 2 )
2374             x264_intra_rd_refine( h, &analysis );
2375     }
2376     else if( h->sh.i_type == SLICE_TYPE_P )
2377     {
2378         int b_skip = 0;
2379
2380         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2381
2382         analysis.b_try_pskip = 0;
2383         if( analysis.b_force_intra )
2384         {
2385             if( !h->param.analyse.b_psy )
2386             {
2387                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2388                 goto intra_analysis;
2389             }
2390         }
2391         else
2392         {
2393             /* Fast P_SKIP detection */
2394             if( h->param.analyse.b_fast_pskip )
2395             {
2396                 if( h->param.i_threads > 1 && !h->param.b_sliced_threads && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2397                     // FIXME don't need to check this if the reference frame is done
2398                     {}
2399                 else if( h->param.analyse.i_subpel_refine >= 3 )
2400                     analysis.b_try_pskip = 1;
2401                 else if( h->mb.i_mb_type_left == P_SKIP ||
2402                          h->mb.i_mb_type_top == P_SKIP ||
2403                          h->mb.i_mb_type_topleft == P_SKIP ||
2404                          h->mb.i_mb_type_topright == P_SKIP )
2405                     b_skip = x264_macroblock_probe_pskip( h );
2406             }
2407         }
2408
2409         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2410
2411         if( b_skip )
2412         {
2413             h->mb.i_type = P_SKIP;
2414             h->mb.i_partition = D_16x16;
2415             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
2416         }
2417         else
2418         {
2419             const unsigned int flags = h->param.analyse.inter;
2420             int i_type;
2421             int i_partition;
2422             int i_thresh16x8;
2423             int i_satd_inter, i_satd_intra;
2424
2425             x264_mb_analyse_load_costs( h, &analysis );
2426
2427             x264_mb_analyse_inter_p16x16( h, &analysis );
2428
2429             if( h->mb.i_type == P_SKIP )
2430                 return;
2431
2432             if( flags & X264_ANALYSE_PSUB16x16 )
2433             {
2434                 if( h->param.analyse.b_mixed_references )
2435                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2436                 else
2437                     x264_mb_analyse_inter_p8x8( h, &analysis );
2438             }
2439
2440             /* Select best inter mode */
2441             i_type = P_L0;
2442             i_partition = D_16x16;
2443             i_cost = analysis.l0.me16x16.cost;
2444
2445             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2446                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2447             {
2448                 i_type = P_8x8;
2449                 i_partition = D_8x8;
2450                 i_cost = analysis.l0.i_cost8x8;
2451
2452                 /* Do sub 8x8 */
2453                 if( flags & X264_ANALYSE_PSUB8x8 )
2454                 {
2455                     for( i = 0; i < 4; i++ )
2456                     {
2457                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2458                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2459                         {
2460                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2461                             h->mb.i_sub_partition[i] = D_L0_4x4;
2462
2463                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2464                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2465                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2466
2467                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2468                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2469                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2470
2471                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2472                         }
2473                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2474                     }
2475                     analysis.l0.i_cost8x8 = i_cost;
2476                 }
2477             }
2478
2479             /* Now do 16x8/8x16 */
2480             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2481             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2482                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2483             {
2484                 x264_mb_analyse_inter_p16x8( h, &analysis );
2485                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2486
2487                 x264_mb_analyse_inter_p8x16( h, &analysis );
2488                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2489             }
2490
2491             h->mb.i_partition = i_partition;
2492
2493             /* refine qpel */
2494             //FIXME mb_type costs?
2495             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2496             {
2497                 /* refine later */
2498             }
2499             else if( i_partition == D_16x16 )
2500             {
2501                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2502                 i_cost = analysis.l0.me16x16.cost;
2503             }
2504             else if( i_partition == D_16x8 )
2505             {
2506                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2507                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2508                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2509             }
2510             else if( i_partition == D_8x16 )
2511             {
2512                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2513                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2514                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2515             }
2516             else if( i_partition == D_8x8 )
2517             {
2518                 int i8x8;
2519                 i_cost = 0;
2520                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2521                 {
2522                     switch( h->mb.i_sub_partition[i8x8] )
2523                     {
2524                         case D_L0_8x8:
2525                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2526                             i_cost += analysis.l0.me8x8[i8x8].cost;
2527                             break;
2528                         case D_L0_8x4:
2529                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2530                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2531                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2532                                       analysis.l0.me8x4[i8x8][1].cost;
2533                             break;
2534                         case D_L0_4x8:
2535                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2536                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2537                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2538                                       analysis.l0.me4x8[i8x8][1].cost;
2539                             break;
2540
2541                         case D_L0_4x4:
2542                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2543                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2544                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2545                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2546                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2547                                       analysis.l0.me4x4[i8x8][1].cost +
2548                                       analysis.l0.me4x4[i8x8][2].cost +
2549                                       analysis.l0.me4x4[i8x8][3].cost;
2550                             break;
2551                         default:
2552                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2553                             break;
2554                     }
2555                 }
2556             }
2557
2558             if( h->mb.b_chroma_me )
2559             {
2560                 x264_mb_analyse_intra_chroma( h, &analysis );
2561                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2562                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2563                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2564                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2565             }
2566             else
2567                 x264_mb_analyse_intra( h, &analysis, i_cost );
2568
2569             i_satd_inter = i_cost;
2570             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2571                                       analysis.i_satd_i8x8,
2572                                       analysis.i_satd_i4x4 );
2573
2574             if( analysis.i_mbrd )
2575             {
2576                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2577                 i_type = P_L0;
2578                 i_partition = D_16x16;
2579                 i_cost = analysis.l0.i_rd16x16;
2580                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2581                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2582                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2583                 h->mb.i_type = i_type;
2584                 h->mb.i_partition = i_partition;
2585                 if( i_cost < COST_MAX )
2586                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2587                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2588             }
2589
2590             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2591             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2592             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2593             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2594
2595             if( analysis.b_force_intra && !IS_INTRA(i_type) )
2596             {
2597                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2598                  * it was an inter block. */
2599                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2600                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2601                 h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2602                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2603                 goto intra_analysis;
2604             }
2605
2606             h->mb.i_type = i_type;
2607
2608             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2609             {
2610                 if( IS_INTRA( h->mb.i_type ) )
2611                 {
2612                     x264_intra_rd_refine( h, &analysis );
2613                 }
2614                 else if( i_partition == D_16x16 )
2615                 {
2616                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2617                     analysis.l0.me16x16.cost = i_cost;
2618                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2619                 }
2620                 else if( i_partition == D_16x8 )
2621                 {
2622                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2623                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2624                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2625                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2626                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2627                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2628                 }
2629                 else if( i_partition == D_8x16 )
2630                 {
2631                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2632                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2633                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2634                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2635                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2636                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2637                 }
2638                 else if( i_partition == D_8x8 )
2639                 {
2640                     int i8x8;
2641                     x264_analyse_update_cache( h, &analysis );
2642                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2643                     {
2644                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2645                         {
2646                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2647                         }
2648                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2649                         {
2650                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2651                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2652                         }
2653                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2654                         {
2655                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2656                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2657                         }
2658                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2659                         {
2660                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2661                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2662                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2663                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2664                         }
2665                     }
2666                 }
2667             }
2668         }
2669     }
2670     else if( h->sh.i_type == SLICE_TYPE_B )
2671     {
2672         int i_bskip_cost = COST_MAX;
2673         int b_skip = 0;
2674
2675         if( analysis.i_mbrd )
2676             x264_mb_cache_fenc_satd( h );
2677
2678         h->mb.i_type = B_SKIP;
2679         if( h->mb.b_direct_auto_write )
2680         {
2681             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2682             for( i = 0; i < 2; i++ )
2683             {
2684                 int b_changed = 1;
2685                 h->sh.b_direct_spatial_mv_pred ^= 1;
2686                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2687                 if( analysis.b_direct_available )
2688                 {
2689                     if( b_changed )
2690                     {
2691                         x264_mb_mc( h );
2692                         b_skip = x264_macroblock_probe_bskip( h );
2693                     }
2694                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2695                 }
2696                 else
2697                     b_skip = 0;
2698             }
2699         }
2700         else
2701             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2702
2703         if( analysis.b_direct_available )
2704         {
2705             if( !h->mb.b_direct_auto_write )
2706                 x264_mb_mc( h );
2707             if( analysis.i_mbrd )
2708             {
2709                 i_bskip_cost = ssd_mb( h );
2710                 /* 6 = minimum cavlc cost of a non-skipped MB */
2711                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2712             }
2713             else if( !h->mb.b_direct_auto_write )
2714             {
2715                 /* Conditioning the probe on neighboring block types
2716                  * doesn't seem to help speed or quality. */
2717                 b_skip = x264_macroblock_probe_bskip( h );
2718             }
2719         }
2720
2721         if( !b_skip )
2722         {
2723             const unsigned int flags = h->param.analyse.inter;
2724             int i_type;
2725             int i_partition;
2726             int i_satd_inter;
2727             h->mb.b_skip_mc = 0;
2728
2729             x264_mb_analyse_load_costs( h, &analysis );
2730
2731             /* select best inter mode */
2732             /* direct must be first */
2733             if( analysis.b_direct_available )
2734                 x264_mb_analyse_inter_direct( h, &analysis );
2735
2736             x264_mb_analyse_inter_b16x16( h, &analysis );
2737
2738             i_type = B_L0_L0;
2739             i_partition = D_16x16;
2740             i_cost = analysis.l0.me16x16.cost;
2741             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2742             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2743             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2744
2745             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2746             {
2747                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2748                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2749                     i_bskip_cost < analysis.i_rd16x16bi &&
2750                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2751                     i_bskip_cost < analysis.l1.i_rd16x16 )
2752                 {
2753                     h->mb.i_type = B_SKIP;
2754                     x264_analyse_update_cache( h, &analysis );
2755                     return;
2756                 }
2757             }
2758
2759             if( flags & X264_ANALYSE_BSUB16x16 )
2760             {
2761                 x264_mb_analyse_inter_b8x8( h, &analysis );
2762                 if( analysis.i_cost8x8bi < i_cost )
2763                 {
2764                     i_type = B_8x8;
2765                     i_partition = D_8x8;
2766                     i_cost = analysis.i_cost8x8bi;
2767
2768                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2769                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2770                     {
2771                         x264_mb_analyse_inter_b16x8( h, &analysis );
2772                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2773                                      i_type, analysis.i_mb_type16x8,
2774                                      i_partition, D_16x8 );
2775                     }
2776                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2777                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2778                     {
2779                         x264_mb_analyse_inter_b8x16( h, &analysis );
2780                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2781                                      i_type, analysis.i_mb_type8x16,
2782                                      i_partition, D_8x16 );
2783                     }
2784                 }
2785             }
2786
2787             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2788             {
2789                 /* refine later */
2790             }
2791             /* refine qpel */
2792             else if( i_partition == D_16x16 )
2793             {
2794                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2795                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2796                 if( i_type == B_L0_L0 )
2797                 {
2798                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2799                     i_cost = analysis.l0.me16x16.cost
2800                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2801                 }
2802                 else if( i_type == B_L1_L1 )
2803                 {
2804                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2805                     i_cost = analysis.l1.me16x16.cost
2806                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2807                 }
2808                 else if( i_type == B_BI_BI )
2809                 {
2810                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2811                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2812                 }
2813             }
2814             else if( i_partition == D_16x8 )
2815             {
2816                 for( i=0; i<2; i++ )
2817                 {
2818                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2819                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2820                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2821                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2822                 }
2823             }
2824             else if( i_partition == D_8x16 )
2825             {
2826                 for( i=0; i<2; i++ )
2827                 {
2828                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2829                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2830                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2831                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2832                 }
2833             }
2834             else if( i_partition == D_8x8 )
2835             {
2836                 for( i=0; i<4; i++ )
2837                 {
2838                     x264_me_t *m;
2839                     int i_part_cost_old;
2840                     int i_type_cost;
2841                     int i_part_type = h->mb.i_sub_partition[i];
2842                     int b_bidir = (i_part_type == D_BI_8x8);
2843
2844                     if( i_part_type == D_DIRECT_8x8 )
2845                         continue;
2846                     if( x264_mb_partition_listX_table[0][i_part_type] )
2847                     {
2848                         m = &analysis.l0.me8x8[i];
2849                         i_part_cost_old = m->cost;
2850                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2851                         m->cost -= i_type_cost;
2852                         x264_me_refine_qpel( h, m );
2853                         if( !b_bidir )
2854                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2855                     }
2856                     if( x264_mb_partition_listX_table[1][i_part_type] )
2857                     {
2858                         m = &analysis.l1.me8x8[i];
2859                         i_part_cost_old = m->cost;
2860                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2861                         m->cost -= i_type_cost;
2862                         x264_me_refine_qpel( h, m );
2863                         if( !b_bidir )
2864                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2865                     }
2866                     /* TODO: update mvp? */
2867                 }
2868             }
2869
2870             i_satd_inter = i_cost;
2871
2872             if( analysis.i_mbrd )
2873             {
2874                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2875                 i_type = B_SKIP;
2876                 i_cost = i_bskip_cost;
2877                 i_partition = D_16x16;
2878                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2879                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2880                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2881                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2882                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2883                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2884                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2885
2886                 h->mb.i_type = i_type;
2887                 h->mb.i_partition = i_partition;
2888             }
2889
2890             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2891
2892             if( analysis.i_mbrd )
2893             {
2894                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2895                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2896             }
2897
2898             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2899             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2900             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2901             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2902
2903             h->mb.i_type = i_type;
2904             h->mb.i_partition = i_partition;
2905
2906             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2907                 x264_intra_rd_refine( h, &analysis );
2908             if( h->mb.i_subpel_refine >= 5 )
2909                 x264_refine_bidir( h, &analysis );
2910
2911             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2912             {
2913                 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2914                 x264_analyse_update_cache( h, &analysis );
2915
2916                 if( i_partition == D_16x16 )
2917                 {
2918                     if( i_type == B_L0_L0 )
2919                     {
2920                         analysis.l0.me16x16.cost = i_cost;
2921                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2922                     }
2923                     else if( i_type == B_L1_L1 )
2924                     {
2925                         analysis.l1.me16x16.cost = i_cost;
2926                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2927                     }
2928                     else if( i_type == B_BI_BI )
2929                         x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2930                 }
2931                 else if( i_partition == D_16x8 )
2932                 {
2933                     for( i = 0; i < 2; i++ )
2934                     {
2935                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2936                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2937                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2938                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2939                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2940                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2941                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2942                     }
2943                 }
2944                 else if( i_partition == D_8x16 )
2945                 {
2946                     for( i = 0; i < 2; i++ )
2947                     {
2948                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2949                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2950                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2951                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2952                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2953                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2954                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2955                     }
2956                 }
2957                 else if( i_partition == D_8x8 )
2958                 {
2959                     for( i = 0; i < 4; i++ )
2960                     {
2961                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2962                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2963                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2964                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2965                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2966                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
2967                     }
2968                 }
2969             }
2970         }
2971     }
2972
2973     x264_analyse_update_cache( h, &analysis );
2974
2975     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
2976      * without realizing it.  Check for this and account for it if necessary. */
2977     if( analysis.i_mbrd >= 2 )
2978     {
2979         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
2980         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
2981         int list = check_mv_lists[h->mb.i_type] - 1;
2982         if( list >= 0 && h->mb.i_partition != D_16x16 &&
2983             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
2984             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
2985                 h->mb.i_partition = D_16x16;
2986     }
2987
2988     if( !analysis.i_mbrd )
2989         x264_mb_analyse_transform( h );
2990
2991     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2992         x264_mb_analyse_qp_rd( h, &analysis );
2993
2994     h->mb.b_trellis = h->param.analyse.i_trellis;
2995     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2996     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2997         x264_psy_trellis_init( h, 0 );
2998     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2999         h->mb.i_skip_intra = 0;
3000 }
3001
3002 /*-------------------- Update MB from the analysis ----------------------*/
3003 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3004 {
3005     int i;
3006
3007     switch( h->mb.i_type )
3008     {
3009         case I_4x4:
3010             for( i = 0; i < 16; i++ )
3011                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3012
3013             x264_mb_analyse_intra_chroma( h, a );
3014             break;
3015         case I_8x8:
3016             for( i = 0; i < 4; i++ )
3017                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3018
3019             x264_mb_analyse_intra_chroma( h, a );
3020             break;
3021         case I_16x16:
3022             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3023             x264_mb_analyse_intra_chroma( h, a );
3024             break;
3025
3026         case I_PCM:
3027             break;
3028
3029         case P_L0:
3030             switch( h->mb.i_partition )
3031             {
3032                 case D_16x16:
3033                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3034                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3035                     break;
3036
3037                 case D_16x8:
3038                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3039                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3040                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3041                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3042                     break;
3043
3044                 case D_8x16:
3045                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3046                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3047                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3048                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3049                     break;
3050
3051                 default:
3052                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3053                     break;
3054             }
3055             break;
3056
3057         case P_8x8:
3058             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3059             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3060             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3061             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3062             for( i = 0; i < 4; i++ )
3063                 x264_mb_cache_mv_p8x8( h, a, i );
3064             break;
3065
3066         case P_SKIP:
3067         {
3068             h->mb.i_partition = D_16x16;
3069             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3070             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3071             break;
3072         }
3073
3074         case B_SKIP:
3075         case B_DIRECT:
3076             x264_mb_load_mv_direct8x8( h, 0 );
3077             x264_mb_load_mv_direct8x8( h, 1 );
3078             x264_mb_load_mv_direct8x8( h, 2 );
3079             x264_mb_load_mv_direct8x8( h, 3 );
3080             break;
3081
3082         case B_8x8:
3083             /* optimize: cache might not need to be rewritten */
3084             for( i = 0; i < 4; i++ )
3085                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3086             break;
3087
3088         default: /* the rest of the B types */
3089             switch( h->mb.i_partition )
3090             {
3091             case D_16x16:
3092                 switch( h->mb.i_type )
3093                 {
3094                 case B_L0_L0:
3095                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3096                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3097
3098                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3099                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3100                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3101                     break;
3102                 case B_L1_L1:
3103                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3104                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3105                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3106
3107                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3108                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3109                     break;
3110                 case B_BI_BI:
3111                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3112                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3113
3114                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3115                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3116                     break;
3117                 }
3118                 break;
3119             case D_16x8:
3120                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3121                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3122                 break;
3123             case D_8x16:
3124                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3125                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3126                 break;
3127             default:
3128                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3129                 break;
3130             }
3131     }
3132
3133 #ifndef NDEBUG
3134     if( h->param.i_threads > 1 && !h->param.b_sliced_threads && !IS_INTRA(h->mb.i_type) )
3135     {
3136         int l;
3137         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3138         {
3139             int completed;
3140             int ref = h->mb.cache.ref[l][x264_scan8[0]];
3141             if( ref < 0 )
3142                 continue;
3143             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3144             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3145             {
3146                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3147                 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3148                 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3149                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3150                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3151                 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3152                 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3153                 fprintf(stderr, "completed: %d \n", completed );
3154                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3155                 x264_mb_analyse_intra( h, a, COST_MAX );
3156                 h->mb.i_type = I_16x16;
3157                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3158                 x264_mb_analyse_intra_chroma( h, a );
3159             }
3160         }
3161     }
3162 #endif
3163 }
3164
3165 #include "slicetype.c"
3166