git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #define _ISOC99_SOURCE
  26 #include <math.h>
  27 #include <limits.h>
  28 #ifndef _MSC_VER
  29 #include <unistd.h>
  30 #endif
  31
  32 #include "common/common.h"
  33 #include "common/cpu.h"
  34 #include "macroblock.h"
  35 #include "me.h"
  36 #include "ratecontrol.h"
  37 #include "analyse.h"
  38 #include "rdo.c"
  39
  40 typedef struct
  41 {
  42     /* 16x16 */
  43     int i_ref;
  44     int       i_rd16x16;
  45     x264_me_t me16x16;
  46
  47     /* 8x8 */
  48     int       i_cost8x8;
  49     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
  50     ALIGNED_4( int16_t mvc[32][5][2] );
  51     x264_me_t me8x8[4];
  52
  53     /* Sub 4x4 */
  54     int       i_cost4x4[4]; /* cost per 8x8 partition */
  55     x264_me_t me4x4[4][4];
  56
  57     /* Sub 8x4 */
  58     int       i_cost8x4[4]; /* cost per 8x8 partition */
  59     x264_me_t me8x4[4][2];
  60
  61     /* Sub 4x8 */
  62     int       i_cost4x8[4]; /* cost per 8x8 partition */
  63     x264_me_t me4x8[4][2];
  64
  65     /* 16x8 */
  66     int       i_cost16x8;
  67     x264_me_t me16x8[2];
  68
  69     /* 8x16 */
  70     int       i_cost8x16;
  71     x264_me_t me8x16[2];
  72
  73 } x264_mb_analysis_list_t;
  74
  75 typedef struct
  76 {
  77     /* conduct the analysis using this lamda and QP */
  78     int i_lambda;
  79     int i_lambda2;
  80     int i_qp;
  81     int16_t *p_cost_mv;
  82     uint16_t *p_cost_ref0;
  83     uint16_t *p_cost_ref1;
  84     int i_mbrd;
  85
  86
  87     /* I: Intra part */
  88     /* Take some shortcuts in intra search if intra is deemed unlikely */
  89     int b_fast_intra;
  90     int b_try_pskip;
  91
  92     /* Luma part */
  93     int i_satd_i16x16;
  94     int i_satd_i16x16_dir[7];
  95     int i_predict16x16;
  96
  97     int i_satd_i8x8;
  98     int i_cbp_i8x8_luma;
  99     int i_satd_i8x8_dir[12][4];
 100     int i_predict8x8[4];
 101
 102     int i_satd_i4x4;
 103     int i_predict4x4[16];
 104
 105     int i_satd_pcm;
 106
 107     /* Chroma part */
 108     int i_satd_i8x8chroma;
 109     int i_satd_i8x8chroma_dir[4];
 110     int i_predict8x8chroma;
 111
 112     /* II: Inter part P/B frame */
 113     x264_mb_analysis_list_t l0;
 114     x264_mb_analysis_list_t l1;
 115
 116     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
 117     int i_cost16x16direct;
 118     int i_cost8x8bi;
 119     int i_cost8x8direct[4];
 120     int i_cost16x8bi;
 121     int i_cost8x16bi;
 122     int i_rd16x16bi;
 123     int i_rd16x16direct;
 124     int i_rd16x8bi;
 125     int i_rd8x16bi;
 126     int i_rd8x8bi;
 127
 128     int i_mb_partition16x8[2]; /* mb_partition_e */
 129     int i_mb_partition8x16[2];
 130     int i_mb_type16x8; /* mb_class_e */
 131     int i_mb_type8x16;
 132
 133     int b_direct_available;
 134
 135 } x264_mb_analysis_t;
 136
 137 /* lambda = pow(2,qp/6-2) */
 138 const int x264_lambda_tab[52] = {
 139    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
 140    1, 1, 1, 1,              /*  8-11 */
 141    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
 142    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
 143    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
 144   16,18,20,23,25,29,32,36,  /* 36-43 */
 145   40,45,51,57,64,72,81,91   /* 44-51 */
 146 };
 147
 148 /* lambda2 = pow(lambda,2) * .9 * 256 */
 149 const int x264_lambda2_tab[52] = {
 150     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
 151     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
 152    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
 153   3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 154  23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
 155 148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
 156 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 157 };
 158
 159 const uint8_t x264_exp2_lut[64] = {
 160       1,   4,   7,  10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,  44,  47,
 161      50,  53,  57,  60,  64,  67,  71,  74,  78,  81,  85,  89,  93,  96, 100, 104,
 162     108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
 163     177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
 164 };
 165
 166 const float x264_log2_lut[128] = {
 167     0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
 168     0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
 169     0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
 170     0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
 171     0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
 172     0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
 173     0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
 174     0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
 175     0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
 176     0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
 177     0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
 178     0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
 179     0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
 180     0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
 181     0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
 182     0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
 183 };
 184
 185 /* Avoid an int/float conversion. */
 186 const float x264_log2_lz_lut[32] = {
 187     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 188 };
 189
 190 // should the intra and inter lambdas be different?
 191 // I'm just matching the behaviour of deadzone quant.
 192 static const int x264_trellis_lambda2_tab[2][52] = {
 193     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 194     {    46,      58,      73,      92,     117,     147,
 195         185,     233,     294,     370,     466,     587,
 196         740,     932,    1174,    1480,    1864,    2349,
 197        2959,    3728,    4697,    5918,    7457,    9395,
 198       11837,   14914,   18790,   23674,   29828,   37581,
 199       47349,   59656,   75163,   94699,  119313,  150326,
 200      189399,  238627,  300652,  378798,  477255,  601304,
 201      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 202     3030384, 3818045, 4810435, 6060769 },
 203     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 204     {    27,      34,      43,      54,      68,      86,
 205         108,     136,     172,     216,     273,     343,
 206         433,     545,     687,     865,    1090,    1374,
 207        1731,    2180,    2747,    3461,    4361,    5494,
 208        6922,    8721,   10988,   13844,   17442,   21976,
 209       27688,   34885,   43953,   55377,   69771,   87906,
 210      110755,  139543,  175813,  221511,  279087,  351627,
 211      443023,  558174,  703255,  886046, 1116348, 1406511,
 212     1772093, 2232697, 2813022, 3544186 }
 213 };
 214
 215 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 216        16,    20,    25,    32,    40,    50,
 217        64,    80,   101,   128,   161,   203,
 218       256,   322,   406,   512,   645,   812,
 219      1024,  1290,  1625,  2048,  2580,  3250,
 220      4096,  5160,  6501,  8192, 10321, 13003,
 221     16384, 20642, 26007, 32768, 41285, 52015,
 222     65535
 223 };
 224
 225 /* TODO: calculate CABAC costs */
 226 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
 227     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 228 };
 229 static const int i_mb_b16x8_cost_table[17] = {
 230     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 231 };
 232 static const int i_sub_mb_b_cost_table[13] = {
 233     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 234 };
 235 static const int i_sub_mb_p_cost_table[4] = {
 236     5, 3, 3, 1
 237 };
 238
 239 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 240
 241 /* Indexed by lambda instead of qp because, due to rounding,
 242  * some quantizers share lambdas.  This saves memory. */
 243 uint16_t *x264_cost_mv_fpel[92][4];
 244 uint16_t x264_cost_ref[92][3][33];
 245
 246 /* initialize an array of lambda*nbits for all possible mvs */
 247 static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 248 {
 249     static int16_t *p_cost_mv[92];
 250     int i, j;
 251
 252     if( !p_cost_mv[a->i_lambda] )
 253     {
 254         x264_emms();
 255         /* could be faster, but isn't called many times */
 256         /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 257         CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
 258         p_cost_mv[a->i_lambda] += 2*4*2048;
 259         for( i = 0; i <= 2*4*2048; i++ )
 260         {
 261             p_cost_mv[a->i_lambda][-i] =
 262             p_cost_mv[a->i_lambda][i]  = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
 263         }
 264         for( i = 0; i < 3; i++ )
 265             for( j = 0; j < 33; j++ )
 266                 x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
 267     }
 268     a->p_cost_mv = p_cost_mv[a->i_lambda];
 269     a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 270     a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 271
 272     /* FIXME is this useful for all me methods? */
 273     if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
 274     {
 275         for( j=0; j<4; j++ )
 276         {
 277             CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
 278             x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
 279             for( i = -2*2048; i < 2*2048; i++ )
 280                 x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
 281         }
 282     }
 283     return 0;
 284 fail:
 285     return -1;
 286 }
 287
 288 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 289 {
 290     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
 291
 292     /* mbrd == 1 -> RD mode decision */
 293     /* mbrd == 2 -> RD refinement */
 294     /* mbrd == 3 -> QPRD */
 295     a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
 296
 297     /* conduct the analysis using this lamda and QP */
 298     a->i_qp = h->mb.i_qp = i_qp;
 299     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
 300
 301     a->i_lambda = x264_lambda_tab[i_qp];
 302     a->i_lambda2 = x264_lambda2_tab[i_qp];
 303
 304     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 305     if( h->param.analyse.i_trellis )
 306     {
 307         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
 308         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
 309         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
 310         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
 311     }
 312     h->mb.i_psy_rd_lambda = a->i_lambda;
 313     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 314     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
 315
 316     h->mb.i_me_method = h->param.analyse.i_me_method;
 317     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
 318     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
 319                         && h->mb.i_subpel_refine >= 5;
 320
 321     h->mb.b_transform_8x8 = 0;
 322     h->mb.b_noise_reduction = 0;
 323
 324     /* I: Intra part */
 325     a->i_satd_i16x16 =
 326     a->i_satd_i8x8   =
 327     a->i_satd_i4x4   =
 328     a->i_satd_i8x8chroma = COST_MAX;
 329
 330     /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
 331     a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
 332
 333     a->b_fast_intra = 0;
 334     h->mb.i_skip_intra =
 335         h->mb.b_lossless ? 0 :
 336         a->i_mbrd ? 2 :
 337         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
 338
 339     /* II: Inter part P/B frame */
 340     if( h->sh.i_type != SLICE_TYPE_I )
 341     {
 342         int i, j;
 343         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
 344         // limit motion search to a slightly smaller range than the theoretical limit,
 345         // since the search may go a few iterations past its given range
 346         int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 347
 348         /* Calculate max allowed MV range */
 349 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
 350         h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
 351         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
 352         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
 353         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
 354         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
 355         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
 356         if( h->mb.i_mb_x == 0)
 357         {
 358             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
 359             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
 360             int thread_mvy_range = i_fmv_range;
 361
 362             if( h->param.i_threads > 1 )
 363             {
 364                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
 365                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
 366                 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
 367                 {
 368                     x264_frame_t **fref = i ? h->fref1 : h->fref0;
 369                     int i_ref = i ? h->i_ref1 : h->i_ref0;
 370                     for( j=0; j<i_ref; j++ )
 371                     {
 372                         x264_frame_cond_wait( fref[j], thresh );
 373                         thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
 374                     }
 375                 }
 376                 if( h->param.b_deterministic )
 377                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
 378                 if( h->mb.b_interlaced )
 379                     thread_mvy_range >>= 1;
 380             }
 381
 382             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
 383             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
 384             h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
 385             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
 386             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
 387             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
 388             h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
 389         }
 390 #undef CLIP_FMV
 391
 392         a->l0.me16x16.cost =
 393         a->l0.i_rd16x16    =
 394         a->l0.i_cost8x8    = COST_MAX;
 395
 396         for( i = 0; i < 4; i++ )
 397         {
 398             a->l0.i_cost4x4[i] =
 399             a->l0.i_cost8x4[i] =
 400             a->l0.i_cost4x8[i] = COST_MAX;
 401         }
 402
 403         a->l0.i_cost16x8   =
 404         a->l0.i_cost8x16   = COST_MAX;
 405         if( h->sh.i_type == SLICE_TYPE_B )
 406         {
 407             a->l1.me16x16.cost =
 408             a->l1.i_rd16x16    =
 409             a->l1.i_cost8x8    = COST_MAX;
 410
 411             for( i = 0; i < 4; i++ )
 412             {
 413                 a->l1.i_cost4x4[i] =
 414                 a->l1.i_cost8x4[i] =
 415                 a->l1.i_cost4x8[i] =
 416                 a->i_cost8x8direct[i] = COST_MAX;
 417             }
 418
 419             a->l1.i_cost16x8   =
 420             a->l1.i_cost8x16   =
 421             a->i_rd16x16bi     =
 422             a->i_rd16x16direct =
 423             a->i_rd8x8bi       =
 424             a->i_rd16x8bi      =
 425             a->i_rd8x16bi      =
 426             a->i_cost16x16bi   =
 427             a->i_cost16x16direct =
 428             a->i_cost8x8bi     =
 429             a->i_cost16x8bi    =
 430             a->i_cost8x16bi    = COST_MAX;
 431         }
 432
 433         /* Fast intra decision */
 434         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
 435         {
 436             if(   IS_INTRA( h->mb.i_mb_type_left )
 437                || IS_INTRA( h->mb.i_mb_type_top )
 438                || IS_INTRA( h->mb.i_mb_type_topleft )
 439                || IS_INTRA( h->mb.i_mb_type_topright )
 440                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
 441                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
 442             { /* intra is likely */ }
 443             else
 444             {
 445                 a->b_fast_intra = 1;
 446             }
 447         }
 448         h->mb.b_skip_mc = 0;
 449     }
 450 }
 451
 452
 453
 454 /*
 455  * Handle intra mb
 456  */
 457 /* Max = 4 */
 458 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 459 {
 460     if( i_neighbour & MB_TOPLEFT )
 461     {
 462         /* top and left available */
 463         *mode++ = I_PRED_16x16_V;
 464         *mode++ = I_PRED_16x16_H;
 465         *mode++ = I_PRED_16x16_DC;
 466         *mode++ = I_PRED_16x16_P;
 467         *pi_count = 4;
 468     }
 469     else if( i_neighbour & MB_LEFT )
 470     {
 471         /* left available*/
 472         *mode++ = I_PRED_16x16_DC_LEFT;
 473         *mode++ = I_PRED_16x16_H;
 474         *pi_count = 2;
 475     }
 476     else if( i_neighbour & MB_TOP )
 477     {
 478         /* top available*/
 479         *mode++ = I_PRED_16x16_DC_TOP;
 480         *mode++ = I_PRED_16x16_V;
 481         *pi_count = 2;
 482     }
 483     else
 484     {
 485         /* none available */
 486         *mode = I_PRED_16x16_DC_128;
 487         *pi_count = 1;
 488     }
 489 }
 490
 491 /* Max = 4 */
 492 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 493 {
 494     if( i_neighbour & MB_TOPLEFT )
 495     {
 496         /* top and left available */
 497         *mode++ = I_PRED_CHROMA_V;
 498         *mode++ = I_PRED_CHROMA_H;
 499         *mode++ = I_PRED_CHROMA_DC;
 500         *mode++ = I_PRED_CHROMA_P;
 501         *pi_count = 4;
 502     }
 503     else if( i_neighbour & MB_LEFT )
 504     {
 505         /* left available*/
 506         *mode++ = I_PRED_CHROMA_DC_LEFT;
 507         *mode++ = I_PRED_CHROMA_H;
 508         *pi_count = 2;
 509     }
 510     else if( i_neighbour & MB_TOP )
 511     {
 512         /* top available*/
 513         *mode++ = I_PRED_CHROMA_DC_TOP;
 514         *mode++ = I_PRED_CHROMA_V;
 515         *pi_count = 2;
 516     }
 517     else
 518     {
 519         /* none available */
 520         *mode = I_PRED_CHROMA_DC_128;
 521         *pi_count = 1;
 522     }
 523 }
 524
 525 /* MAX = 9 */
 526 static void predict_4x4_mode_available( unsigned int i_neighbour,
 527                                         int *mode, int *pi_count )
 528 {
 529     int b_l = i_neighbour & MB_LEFT;
 530     int b_t = i_neighbour & MB_TOP;
 531
 532     if( b_l && b_t )
 533     {
 534         *pi_count = 6;
 535         *mode++ = I_PRED_4x4_DC;
 536         *mode++ = I_PRED_4x4_H;
 537         *mode++ = I_PRED_4x4_V;
 538         *mode++ = I_PRED_4x4_DDL;
 539         if( i_neighbour & MB_TOPLEFT )
 540         {
 541             *mode++ = I_PRED_4x4_DDR;
 542             *mode++ = I_PRED_4x4_VR;
 543             *mode++ = I_PRED_4x4_HD;
 544             *pi_count += 3;
 545         }
 546         *mode++ = I_PRED_4x4_VL;
 547         *mode++ = I_PRED_4x4_HU;
 548     }
 549     else if( b_l )
 550     {
 551         *mode++ = I_PRED_4x4_DC_LEFT;
 552         *mode++ = I_PRED_4x4_H;
 553         *mode++ = I_PRED_4x4_HU;
 554         *pi_count = 3;
 555     }
 556     else if( b_t )
 557     {
 558         *mode++ = I_PRED_4x4_DC_TOP;
 559         *mode++ = I_PRED_4x4_V;
 560         *mode++ = I_PRED_4x4_DDL;
 561         *mode++ = I_PRED_4x4_VL;
 562         *pi_count = 4;
 563     }
 564     else
 565     {
 566         *mode++ = I_PRED_4x4_DC_128;
 567         *pi_count = 1;
 568     }
 569 }
 570
 571 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 572 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 573 {
 574     ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
 575     ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
 576     ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 577     int i;
 578
 579     if( do_both_dct || h->mb.b_transform_8x8 )
 580     {
 581         h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
 582         for( i = 0; i < 4; i++ )
 583             h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
 584     }
 585     if( do_both_dct || !h->mb.b_transform_8x8 )
 586     {
 587         h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
 588         for( i = 0; i < 16; i++ )
 589             h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
 590     }
 591 }
 592
 593 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
 594 static inline void x264_mb_cache_fenc_satd( x264_t *h )
 595 {
 596     ALIGNED_16( static uint8_t zero[16] ) = {0};
 597     uint8_t *fenc;
 598     int x, y, satd_sum = 0, sa8d_sum = 0;
 599     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 600         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 601     if( !h->mb.i_psy_rd )
 602         return;
 603     for( y = 0; y < 4; y++ )
 604         for( x = 0; x < 4; x++ )
 605         {
 606             fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
 607             h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
 608                                       - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
 609             satd_sum += h->mb.pic.fenc_satd[y][x];
 610         }
 611     for( y = 0; y < 2; y++ )
 612         for( x = 0; x < 2; x++ )
 613         {
 614             fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
 615             h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
 616                                       - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
 617             sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
 618         }
 619     h->mb.pic.fenc_satd_sum = satd_sum;
 620     h->mb.pic.fenc_sa8d_sum = sa8d_sum;
 621 }
 622
 623 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 624 {
 625     int i;
 626
 627     int i_max;
 628     int predict_mode[4];
 629     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 630
 631     uint8_t *p_dstc[2], *p_srcc[2];
 632
 633     if( a->i_satd_i8x8chroma < COST_MAX )
 634         return;
 635
 636     /* 8x8 prediction selection for chroma */
 637     p_dstc[0] = h->mb.pic.p_fdec[1];
 638     p_dstc[1] = h->mb.pic.p_fdec[2];
 639     p_srcc[0] = h->mb.pic.p_fenc[1];
 640     p_srcc[1] = h->mb.pic.p_fenc[2];
 641
 642     predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 643     a->i_satd_i8x8chroma = COST_MAX;
 644     if( i_max == 4 && b_merged_satd )
 645     {
 646         int satdu[4], satdv[4];
 647         h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
 648         h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
 649         h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
 650         h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
 651         satdu[I_PRED_CHROMA_P] =
 652             h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
 653         satdv[I_PRED_CHROMA_P] =
 654             h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
 655
 656         for( i=0; i<i_max; i++ )
 657         {
 658             int i_mode = predict_mode[i];
 659             int i_satd = satdu[i_mode] + satdv[i_mode]
 660                        + a->i_lambda * bs_size_ue(i_mode);
 661
 662             a->i_satd_i8x8chroma_dir[i] = i_satd;
 663             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 664         }
 665     }
 666     else
 667     {
 668         for( i=0; i<i_max; i++ )
 669         {
 670             int i_satd;
 671             int i_mode = predict_mode[i];
 672
 673             /* we do the prediction */
 674             if( h->mb.b_lossless )
 675                 x264_predict_lossless_8x8_chroma( h, i_mode );
 676             else
 677             {
 678                 h->predict_8x8c[i_mode]( p_dstc[0] );
 679                 h->predict_8x8c[i_mode]( p_dstc[1] );
 680             }
 681
 682             /* we calculate the cost */
 683             i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
 684                                                p_srcc[0], FENC_STRIDE ) +
 685                      h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
 686                                                p_srcc[1], FENC_STRIDE ) +
 687                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 688
 689             a->i_satd_i8x8chroma_dir[i] = i_satd;
 690             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
 691         }
 692     }
 693
 694     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 695 }
 696
 697 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 698 {
 699     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 700     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 701     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 702
 703     int i, idx;
 704     int i_max;
 705     int predict_mode[9];
 706     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 707
 708     /*---------------- Try all mode and calculate their score ---------------*/
 709
 710     /* 16x16 prediction selection */
 711     predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 712
 713     if( b_merged_satd && i_max == 4 )
 714     {
 715         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 716         h->predict_16x16[I_PRED_16x16_P]( p_dst );
 717         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
 718             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 719         for( i=0; i<4; i++ )
 720         {
 721             int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
 722             COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
 723         }
 724     }
 725     else
 726     {
 727         for( i = 0; i < i_max; i++ )
 728         {
 729             int i_satd;
 730             int i_mode = predict_mode[i];
 731
 732             if( h->mb.b_lossless )
 733                 x264_predict_lossless_16x16( h, i_mode );
 734             else
 735                 h->predict_16x16[i_mode]( p_dst );
 736
 737             i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 738                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 739             COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 740             a->i_satd_i16x16_dir[i_mode] = i_satd;
 741         }
 742     }
 743
 744     if( h->sh.i_type == SLICE_TYPE_B )
 745         /* cavlc mb type prefix */
 746         a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
 747     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
 748         return;
 749
 750     /* 8x8 prediction selection */
 751     if( flags & X264_ANALYSE_I8x8 )
 752     {
 753         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
 754         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
 755         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 756         int i_cost = 0;
 757         h->mb.i_cbp_luma = 0;
 758         b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
 759
 760         // FIXME some bias like in i4x4?
 761         if( h->sh.i_type == SLICE_TYPE_B )
 762             i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 763
 764         for( idx = 0;; idx++ )
 765         {
 766             int x = idx&1;
 767             int y = idx>>1;
 768             uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 769             uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 770             int i_best = COST_MAX;
 771             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 772
 773             predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
 774             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 775
 776             if( b_merged_satd && i_max == 9 )
 777             {
 778                 int satd[9];
 779                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 780                 satd[i_pred_mode] -= 3 * a->i_lambda;
 781                 for( i=2; i>=0; i-- )
 782                 {
 783                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
 784                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 785                 }
 786                 i = 3;
 787             }
 788             else
 789                 i = 0;
 790
 791             for( ; i<i_max; i++ )
 792             {
 793                 int i_satd;
 794                 int i_mode = predict_mode[i];
 795
 796                 if( h->mb.b_lossless )
 797                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
 798                 else
 799                     h->predict_8x8[i_mode]( p_dst_by, edge );
 800
 801                 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
 802                        + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 803
 804                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 805                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
 806             }
 807             i_cost += i_best;
 808
 809             if( idx == 3 || i_cost > i_satd_thresh )
 810                 break;
 811
 812             /* we need to encode this block now (for next ones) */
 813             h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 814             x264_mb_encode_i8x8( h, idx, a->i_qp );
 815
 816             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 817         }
 818
 819         if( idx == 3 )
 820         {
 821             a->i_satd_i8x8 = i_cost;
 822             if( h->mb.i_skip_intra )
 823             {
 824                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 825                 h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
 826                 h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
 827                 h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
 828                 h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
 829                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 830                 if( h->mb.i_skip_intra == 2 )
 831                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 832             }
 833         }
 834         else
 835         {
 836             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 837             a->i_satd_i8x8 = COST_MAX;
 838             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 839         }
 840         if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
 841             return;
 842     }
 843
 844     /* 4x4 prediction selection */
 845     if( flags & X264_ANALYSE_I4x4 )
 846     {
 847         int i_cost;
 848         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
 849         h->mb.i_cbp_luma = 0;
 850         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
 851         if( a->i_mbrd )
 852             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
 853
 854         i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
 855         if( h->sh.i_type == SLICE_TYPE_B )
 856             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
 857
 858         for( idx = 0;; idx++ )
 859         {
 860             uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
 861             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
 862             int i_best = COST_MAX;
 863             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 864
 865             predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
 866
 867             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 868                 /* emulate missing topright samples */
 869                 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 870
 871             if( b_merged_satd && i_max >= 6 )
 872             {
 873                 int satd[9];
 874                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
 875                 satd[i_pred_mode] -= 3 * a->i_lambda;
 876                 for( i=2; i>=0; i-- )
 877                     COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
 878                                  a->i_predict4x4[idx], i );
 879                 i = 3;
 880             }
 881             else
 882                 i = 0;
 883
 884             for( ; i<i_max; i++ )
 885             {
 886                 int i_satd;
 887                 int i_mode = predict_mode[i];
 888                 if( h->mb.b_lossless )
 889                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
 890                 else
 891                     h->predict_4x4[i_mode]( p_dst_by );
 892
 893                 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
 894                                                    p_src_by, FENC_STRIDE )
 895                        + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 896
 897                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
 898             }
 899             i_cost += i_best;
 900
 901             if( i_cost > i_satd_thresh || idx == 15 )
 902                 break;
 903
 904             /* we need to encode this block now (for next ones) */
 905             h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
 906             x264_mb_encode_i4x4( h, idx, a->i_qp );
 907
 908             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
 909         }
 910         if( idx == 15 )
 911         {
 912             a->i_satd_i4x4 = i_cost;
 913             if( h->mb.i_skip_intra )
 914             {
 915                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 916                 h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
 917                 h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
 918                 h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
 919                 h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
 920                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
 921                 if( h->mb.i_skip_intra == 2 )
 922                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
 923             }
 924         }
 925         else
 926             a->i_satd_i4x4 = COST_MAX;
 927     }
 928 }
 929
 930 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
 931 {
 932     if( a->i_satd_i16x16 <= i_satd_thresh )
 933     {
 934         h->mb.i_type = I_16x16;
 935         x264_analyse_update_cache( h, a );
 936         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
 937     }
 938     else
 939         a->i_satd_i16x16 = COST_MAX;
 940
 941     if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
 942     {
 943         h->mb.i_type = I_4x4;
 944         x264_analyse_update_cache( h, a );
 945         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
 946     }
 947     else
 948         a->i_satd_i4x4 = COST_MAX;
 949
 950     if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
 951     {
 952         h->mb.i_type = I_8x8;
 953         x264_analyse_update_cache( h, a );
 954         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
 955         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
 956     }
 957     else
 958         a->i_satd_i8x8 = COST_MAX;
 959 }
 960
 961 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 962 {
 963     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 964
 965     int i, j, idx, x, y;
 966     int i_max, i_mode, i_thresh;
 967     uint64_t i_satd, i_best;
 968     int predict_mode[9];
 969     h->mb.i_skip_intra = 0;
 970
 971     if( h->mb.i_type == I_16x16 )
 972     {
 973         int old_pred_mode = a->i_predict16x16;
 974         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
 975         i_best = a->i_satd_i16x16;
 976         predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 977         for( i = 0; i < i_max; i++ )
 978         {
 979             int i_mode = predict_mode[i];
 980             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
 981                 continue;
 982             h->mb.i_intra16x16_pred_mode = i_mode;
 983             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
 984             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
 985         }
 986     }
 987
 988     /* RD selection for chroma prediction */
 989     predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
 990     if( i_max > 1 )
 991     {
 992         i_thresh = a->i_satd_i8x8chroma * 5/4;
 993
 994         for( i = j = 0; i < i_max; i++ )
 995             if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
 996                 predict_mode[i] != a->i_predict8x8chroma )
 997             {
 998                 predict_mode[j++] = predict_mode[i];
 999             }
1000         i_max = j;
1001
1002         if( i_max > 0 )
1003         {
1004             int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1005             int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1006             /* the previous thing encoded was x264_intra_rd(), so the pixels and
1007              * coefs for the current chroma mode are still around, so we only
1008              * have to recount the bits. */
1009             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1010             for( i = 0; i < i_max; i++ )
1011             {
1012                 i_mode = predict_mode[i];
1013                 if( h->mb.b_lossless )
1014                     x264_predict_lossless_8x8_chroma( h, i_mode );
1015                 else
1016                 {
1017                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1018                     h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1019                 }
1020                 /* if we've already found a mode that needs no residual, then
1021                  * probably any mode with a residual will be worse.
1022                  * so avoid dct on the remaining modes to improve speed. */
1023                 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1024                 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1025             }
1026             h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1027             h->mb.i_cbp_chroma = i_cbp_chroma_best;
1028         }
1029     }
1030
1031     if( h->mb.i_type == I_4x4 )
1032     {
1033         uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1034         int i_nnz = 0;
1035         for( idx = 0; idx < 16; idx++ )
1036         {
1037             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1038             i_best = COST_MAX64;
1039
1040             predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1041
1042             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1043                 /* emulate missing topright samples */
1044                 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1045
1046             for( i = 0; i < i_max; i++ )
1047             {
1048                 i_mode = predict_mode[i];
1049                 if( h->mb.b_lossless )
1050                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1051                 else
1052                     h->predict_4x4[i_mode]( p_dst_by );
1053                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1054
1055                 if( i_best > i_satd )
1056                 {
1057                     a->i_predict4x4[idx] = i_mode;
1058                     i_best = i_satd;
1059                     pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
1060                     pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
1061                     pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
1062                     pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
1063                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1064                 }
1065             }
1066
1067             *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
1068             *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
1069             *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
1070             *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
1071             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1072
1073             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1074         }
1075     }
1076     else if( h->mb.i_type == I_8x8 )
1077     {
1078         ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1079         for( idx = 0; idx < 4; idx++ )
1080         {
1081             uint64_t pels_h = 0;
1082             uint8_t pels_v[7];
1083             uint16_t i_nnz[2];
1084             uint8_t *p_dst_by;
1085             int j;
1086             int cbp_luma_new = 0;
1087             i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1088
1089             i_best = COST_MAX64;
1090             x = idx&1;
1091             y = idx>>1;
1092
1093             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1094             predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1095             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1096
1097             for( i = 0; i < i_max; i++ )
1098             {
1099                 i_mode = predict_mode[i];
1100                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1101                     continue;
1102                 if( h->mb.b_lossless )
1103                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1104                 else
1105                     h->predict_8x8[i_mode]( p_dst_by, edge );
1106                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1107                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1108
1109                 if( i_best > i_satd )
1110                 {
1111                     a->i_predict8x8[idx] = i_mode;
1112                     cbp_luma_new = h->mb.i_cbp_luma;
1113                     i_best = i_satd;
1114
1115                     pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
1116                     if( !(idx&1) )
1117                         for( j=0; j<7; j++ )
1118                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1119                     i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
1120                     i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
1121                 }
1122             }
1123             a->i_cbp_i8x8_luma = cbp_luma_new;
1124             *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
1125             if( !(idx&1) )
1126                 for( j=0; j<7; j++ )
1127                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1128             *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
1129             *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
1130
1131             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1132         }
1133     }
1134 }
1135
/* Set up the source side of motion-estimation context (m):
 *  - copies h->mb.pic.i_stride[0] and [1] into (m)->i_stride[],
 *  - points (m)->p_fenc[0] at (xoff,yoff) in plane 0 of src, and
 *    (m)->p_fenc[1..2] at (xoff/2,yoff/2) in planes 1 and 2,
 *    all addressed with FENC_STRIDE.
 * Expands to a single statement (do/while(0)), so it is safe as the body of
 * an unbraced if/else.  Uses a variable named `h` from the calling scope. */
#define LOAD_FENC( m, src, xoff, yoff) \
do { \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
} while( 0 )

/* Set up the reference side of motion-estimation context (m):
 *  - (m)->p_fref[0..3] point at (xoff,yoff) in src planes 0..3, addressed
 *    with (m)->i_stride[0],
 *  - (m)->p_fref[4..5] point at (xoff/2,yoff/2) in src planes 4..5,
 *    addressed with (m)->i_stride[1],
 *  - (m)->integral points at the same (xoff,yoff) position inside
 *    h->mb.pic.p_integral[list][ref].
 * Reads (m)->i_stride[], so those must already be set (LOAD_FENC does that).
 * Expands to a single statement; uses `h` from the calling scope. */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
do { \
    (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
} while( 0 )

/* Cost entry for reference index `ref` of list `list` (a literal 0 or 1 that
 * is pasted onto a->p_cost_ref).  Uses `a` from the calling scope. */
#define REF_COST(list, ref) \
    (a->p_cost_ref##list[ref])
1154
1155 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1156 {
1157     x264_me_t m;
1158     int i_ref, i_mvc;
1159     ALIGNED_4( int16_t mvc[8][2] );
1160     int i_halfpel_thresh = INT_MAX;
1161     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1162
1163     /* 16x16 Search on all ref frame */
1164     m.i_pixel = PIXEL_16x16;
1165     m.p_cost_mv = a->p_cost_mv;
1166     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1167
1168     a->l0.me16x16.cost = INT_MAX;
1169     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1170     {
1171         const int i_ref_cost = REF_COST( 0, i_ref );
1172         i_halfpel_thresh -= i_ref_cost;
1173         m.i_ref_cost = i_ref_cost;
1174         m.i_ref = i_ref;
1175
1176         /* search with ref */
1177         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1178         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1179         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1180         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1181
1182         /* early termination
1183          * SSD threshold would probably be better than SATD */
1184         if( i_ref == 0
1185             && a->b_try_pskip
1186             && m.cost-m.cost_mv < 300*a->i_lambda
1187             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1188               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1189             && x264_macroblock_probe_pskip( h ) )
1190         {
1191             h->mb.i_type = P_SKIP;
1192             x264_analyse_update_cache( h, a );
1193             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1194             return;
1195         }
1196
1197         m.cost += i_ref_cost;
1198         i_halfpel_thresh += i_ref_cost;
1199
1200         if( m.cost < a->l0.me16x16.cost )
1201             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1202
1203         /* save mv for predicting neighbors */
1204         *(uint32_t*)a->l0.mvc[i_ref][0] =
1205         *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1206     }
1207
1208     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1209     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1210
1211     h->mb.i_type = P_L0;
1212     if( a->i_mbrd )
1213     {
1214         x264_mb_cache_fenc_satd( h );
1215         if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
1216         {
1217             h->mb.i_partition = D_16x16;
1218             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1219             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1220         }
1221     }
1222 }
1223
1224 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1225 {
1226     x264_me_t m;
1227     int i_ref;
1228     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1229     int i_halfpel_thresh = INT_MAX;
1230     int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1231     int i;
1232     int i_maxref = h->mb.pic.i_fref[0]-1;
1233
1234     h->mb.i_partition = D_8x8;
1235
1236     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1237      * than those used by the neighbors */
1238     if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1239         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1240     {
1241         i_maxref = 0;
1242         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1243         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1244         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1245         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1246         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1247         i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
1248     }
1249
1250     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1251          *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
1252
1253     for( i = 0; i < 4; i++ )
1254     {
1255         x264_me_t *l0m = &a->l0.me8x8[i];
1256         const int x8 = i%2;
1257         const int y8 = i/2;
1258
1259         m.i_pixel = PIXEL_8x8;
1260         m.p_cost_mv = a->p_cost_mv;
1261
1262         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1263         l0m->cost = INT_MAX;
1264         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1265         {
1266             const int i_ref_cost = REF_COST( 0, i_ref );
1267             i_halfpel_thresh -= i_ref_cost;
1268             m.i_ref_cost = i_ref_cost;
1269             m.i_ref = i_ref;
1270
1271             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1272             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1273             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1274             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1275
1276             m.cost += i_ref_cost;
1277             i_halfpel_thresh += i_ref_cost;
1278             *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
1279
1280             if( m.cost < l0m->cost )
1281                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1282         }
1283         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1284         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1285
1286         /* mb type cost */
1287         l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1288     }
1289
1290     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1291                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1292     /* P_8x8 ref0 has no ref cost */
1293     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1294                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1295         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1296     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1297     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1298 }
1299
1300 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1301 {
1302     const int i_ref = a->l0.me16x16.i_ref;
1303     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1304     uint8_t  **p_fref = h->mb.pic.p_fref[0][i_ref];
1305     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1306     int i_mvc;
1307     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1308     int i;
1309
1310     /* XXX Needed for x264_mb_predict_mv */
1311     h->mb.i_partition = D_8x8;
1312
1313     i_mvc = 1;
1314     *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
1315
1316     for( i = 0; i < 4; i++ )
1317     {
1318         x264_me_t *m = &a->l0.me8x8[i];
1319         const int x8 = i%2;
1320         const int y8 = i/2;
1321
1322         m->i_pixel = PIXEL_8x8;
1323         m->p_cost_mv = a->p_cost_mv;
1324         m->i_ref_cost = i_ref_cost;
1325         m->i_ref = i_ref;
1326
1327         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1328         LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
1329         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1330         x264_me_search( h, m, mvc, i_mvc );
1331
1332         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1333
1334         *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
1335         i_mvc++;
1336
1337         /* mb type cost */
1338         m->cost += i_ref_cost;
1339         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1340     }
1341
1342     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1343                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1344     /* theoretically this should include 4*ref_cost,
1345      * but 3 seems a better approximation of cabac. */
1346     if( h->param.b_cabac )
1347         a->l0.i_cost8x8 -= i_ref_cost;
1348     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1349     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1350 }
1351
1352 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1353 {
1354     x264_me_t m;
1355     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1356     ALIGNED_4( int16_t mvc[3][2] );
1357     int i, j;
1358
1359     /* XXX Needed for x264_mb_predict_mv */
1360     h->mb.i_partition = D_16x8;
1361
1362     for( i = 0; i < 2; i++ )
1363     {
1364         x264_me_t *l0m = &a->l0.me16x8[i];
1365         const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1366         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1367
1368         m.i_pixel = PIXEL_16x8;
1369         m.p_cost_mv = a->p_cost_mv;
1370
1371         LOAD_FENC( &m, p_fenc, 0, 8*i );
1372         l0m->cost = INT_MAX;
1373         for( j = 0; j < i_ref8s; j++ )
1374         {
1375             const int i_ref = ref8[j];
1376             const int i_ref_cost = REF_COST( 0, i_ref );
1377             m.i_ref_cost = i_ref_cost;
1378             m.i_ref = i_ref;
1379
1380             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1381             *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1382             *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
1383             *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
1384
1385             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1386             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1387             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1388             x264_me_search( h, &m, mvc, 3 );
1389
1390             m.cost += i_ref_cost;
1391
1392             if( m.cost < l0m->cost )
1393                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1394         }
1395         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1396         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1397     }
1398
1399     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1400 }
1401
1402 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1403 {
1404     x264_me_t m;
1405     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1406     ALIGNED_4( int16_t mvc[3][2] );
1407     int i, j;
1408
1409     /* XXX Needed for x264_mb_predict_mv */
1410     h->mb.i_partition = D_8x16;
1411
1412     for( i = 0; i < 2; i++ )
1413     {
1414         x264_me_t *l0m = &a->l0.me8x16[i];
1415         const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1416         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1417
1418         m.i_pixel = PIXEL_8x16;
1419         m.p_cost_mv = a->p_cost_mv;
1420
1421         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1422         l0m->cost = INT_MAX;
1423         for( j = 0; j < i_ref8s; j++ )
1424         {
1425             const int i_ref = ref8[j];
1426             const int i_ref_cost = REF_COST( 0, i_ref );
1427             m.i_ref_cost = i_ref_cost;
1428             m.i_ref = i_ref;
1429
1430             *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1431             *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
1432             *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
1433
1434             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1435             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1436             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1437             x264_me_search( h, &m, mvc, 3 );
1438
1439             m.cost += i_ref_cost;
1440
1441             if( m.cost < l0m->cost )
1442                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1443         }
1444         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1445         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1446     }
1447
1448     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1449 }
1450
1451 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1452 {
1453     ALIGNED_8( uint8_t pix1[16*8] );
1454     uint8_t *pix2 = pix1+8;
1455     const int i_stride = h->mb.pic.i_stride[1];
1456     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1457     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1458
1459 #define CHROMA4x4MC( width, height, me, x, y ) \
1460     h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
1461     h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
1462
1463     if( pixel == PIXEL_4x4 )
1464     {
1465         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
1466         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
1467         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
1468         CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
1469     }
1470     else if( pixel == PIXEL_8x4 )
1471     {
1472         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
1473         CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
1474     }
1475     else
1476     {
1477         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
1478         CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
1479     }
1480
1481     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1482          + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1483 }
1484
1485 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1486 {
1487     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1488     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1489     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1490     int i4x4;
1491
1492     /* XXX Needed for x264_mb_predict_mv */
1493     h->mb.i_partition = D_8x8;
1494
1495     for( i4x4 = 0; i4x4 < 4; i4x4++ )
1496     {
1497         const int idx = 4*i8x8 + i4x4;
1498         const int x4 = block_idx_x[idx];
1499         const int y4 = block_idx_y[idx];
1500         const int i_mvc = (i4x4 == 0);
1501
1502         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1503
1504         m->i_pixel = PIXEL_4x4;
1505         m->p_cost_mv = a->p_cost_mv;
1506
1507         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1508         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1509
1510         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1511         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1512
1513         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1514     }
1515     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1516                             a->l0.me4x4[i8x8][1].cost +
1517                             a->l0.me4x4[i8x8][2].cost +
1518                             a->l0.me4x4[i8x8][3].cost +
1519                             REF_COST( 0, i_ref ) +
1520                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1521     if( h->mb.b_chroma_me )
1522         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1523 }
1524
1525 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1526 {
1527     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1528     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1529     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1530     int i8x4;
1531
1532     /* XXX Needed for x264_mb_predict_mv */
1533     h->mb.i_partition = D_8x8;
1534
1535     for( i8x4 = 0; i8x4 < 2; i8x4++ )
1536     {
1537         const int idx = 4*i8x8 + 2*i8x4;
1538         const int x4 = block_idx_x[idx];
1539         const int y4 = block_idx_y[idx];
1540         const int i_mvc = (i8x4 == 0);
1541
1542         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1543
1544         m->i_pixel = PIXEL_8x4;
1545         m->p_cost_mv = a->p_cost_mv;
1546
1547         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1548         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1549
1550         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1551         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1552
1553         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1554     }
1555     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1556                             REF_COST( 0, i_ref ) +
1557                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1558     if( h->mb.b_chroma_me )
1559         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1560 }
1561
1562 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1563 {
1564     uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1565     uint8_t  **p_fenc = h->mb.pic.p_fenc;
1566     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1567     int i4x8;
1568
1569     /* XXX Needed for x264_mb_predict_mv */
1570     h->mb.i_partition = D_8x8;
1571
1572     for( i4x8 = 0; i4x8 < 2; i4x8++ )
1573     {
1574         const int idx = 4*i8x8 + i4x8;
1575         const int x4 = block_idx_x[idx];
1576         const int y4 = block_idx_y[idx];
1577         const int i_mvc = (i4x8 == 0);
1578
1579         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1580
1581         m->i_pixel = PIXEL_4x8;
1582         m->p_cost_mv = a->p_cost_mv;
1583
1584         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1585         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1586
1587         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1588         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1589
1590         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1591     }
1592     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1593                             REF_COST( 0, i_ref ) +
1594                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1595     if( h->mb.b_chroma_me )
1596         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1597 }
1598
1599 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1600 {
1601     /* Assumes that fdec still contains the results of
1602      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1603
1604     uint8_t **p_fenc = h->mb.pic.p_fenc;
1605     uint8_t **p_fdec = h->mb.pic.p_fdec;
1606     int i;
1607
1608     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1609     for( i = 0; i < 4; i++ )
1610     {
1611         const int x = (i&1)*8;
1612         const int y = (i>>1)*8;
1613         a->i_cost16x16direct +=
1614         a->i_cost8x8direct[i] =
1615             h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1616
1617         /* mb type cost */
1618         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1619     }
1620 }
1621
/* Bi-predictive average of src1 and src2 into pix, using the weight selected
 * by the current list-0 / list-1 reference pair.
 * Relies on `h` and `a` being in scope at the point of use. */
#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
{ \
    h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, \
                     h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
}
1626
1627 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1628 {
1629     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1630     ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1631     uint8_t *src0, *src1;
1632     int stride0 = 16, stride1 = 16;
1633
1634     x264_me_t m;
1635     int i_ref, i_mvc;
1636     ALIGNED_4( int16_t mvc[9][2] );
1637     int i_halfpel_thresh = INT_MAX;
1638     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1639
1640     /* 16x16 Search on all ref frame */
1641     m.i_pixel = PIXEL_16x16;
1642     m.p_cost_mv = a->p_cost_mv;
1643     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1644
1645     /* ME for List 0 */
1646     a->l0.me16x16.cost = INT_MAX;
1647     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1648     {
1649         /* search with ref */
1650         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1651         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1652         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1653         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1654
1655         /* add ref cost */
1656         m.cost += REF_COST( 0, i_ref );
1657
1658         if( m.cost < a->l0.me16x16.cost )
1659         {
1660             a->l0.i_ref = i_ref;
1661             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1662         }
1663
1664         /* save mv for predicting neighbors */
1665         *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1666     }
1667     /* subtract ref cost, so we don't have to add it for the other MB types */
1668     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
1669
1670     /* ME for list 1 */
1671     i_halfpel_thresh = INT_MAX;
1672     p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1673     a->l1.me16x16.cost = INT_MAX;
1674     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1675     {
1676         /* search with ref */
1677         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1678         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1679         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1680         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1681
1682         /* add ref cost */
1683         m.cost += REF_COST( 1, i_ref );
1684
1685         if( m.cost < a->l1.me16x16.cost )
1686         {
1687             a->l1.i_ref = i_ref;
1688             h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1689         }
1690
1691         /* save mv for predicting neighbors */
1692         *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1693     }
1694     /* subtract ref cost, so we don't have to add it for the other MB types */
1695     a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1696
1697     /* Set global ref, needed for other modes? */
1698     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1699     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1700
1701     /* get cost of BI mode */
1702     src0 = h->mc.get_ref( pix0, &stride0,
1703                            h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1704                            a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
1705     src1 = h->mc.get_ref( pix1, &stride1,
1706                            h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1707                            a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
1708
1709     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1710
1711     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1712                      + REF_COST( 0, a->l0.i_ref )
1713                      + REF_COST( 1, a->l1.i_ref )
1714                      + a->l0.me16x16.cost_mv
1715                      + a->l1.me16x16.cost_mv;
1716
1717     /* mb type cost */
1718     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1719     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1720     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1721 }
1722
1723 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1724 {
1725     const int x = 2*(i%2);
1726     const int y = 2*(i/2);
1727
1728     switch( h->mb.i_sub_partition[i] )
1729     {
1730         case D_L0_8x8:
1731             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1732             break;
1733         case D_L0_8x4:
1734             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1735             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1736             break;
1737         case D_L0_4x8:
1738             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1739             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1740             break;
1741         case D_L0_4x4:
1742             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1743             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1744             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1745             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1746             break;
1747         default:
1748             x264_log( h, X264_LOG_ERROR, "internal error\n" );
1749             break;
1750     }
1751 }
1752
1753 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1754     if( x264_mb_partition_listX_table[0][part] ) \
1755     { \
1756         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1757         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1758     } \
1759     else \
1760     { \
1761         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1762         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
1763         if( b_mvd ) \
1764             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1765     } \
1766     if( x264_mb_partition_listX_table[1][part] ) \
1767     { \
1768         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1769         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1770     } \
1771     else \
1772     { \
1773         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1774         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
1775         if( b_mvd ) \
1776             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1777     }
1778
1779 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1780 {
1781     int x = (i%2)*2;
1782     int y = (i/2)*2;
1783     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1784     {
1785         x264_mb_load_mv_direct8x8( h, i );
1786         if( b_mvd )
1787         {
1788             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
1789             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
1790             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1791         }
1792     }
1793     else
1794     {
1795         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1796     }
1797 }
1798 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1799 {
1800     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1801 }
1802 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1803 {
1804     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1805 }
1806 #undef CACHE_MV_BI
1807
1808 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1809 {
1810     uint8_t **p_fref[2] =
1811         { h->mb.pic.p_fref[0][a->l0.i_ref],
1812           h->mb.pic.p_fref[1][a->l1.i_ref] };
1813     ALIGNED_8( uint8_t pix[2][8*8] );
1814     int i, l;
1815
1816     /* XXX Needed for x264_mb_predict_mv */
1817     h->mb.i_partition = D_8x8;
1818
1819     a->i_cost8x8bi = 0;
1820
1821     for( i = 0; i < 4; i++ )
1822     {
1823         const int x8 = i%2;
1824         const int y8 = i/2;
1825         int i_part_cost;
1826         int i_part_cost_bi = 0;
1827         int stride[2] = {8,8};
1828         uint8_t *src[2];
1829
1830         for( l = 0; l < 2; l++ )
1831         {
1832             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1833             x264_me_t *m = &lX->me8x8[i];
1834
1835             m->i_pixel = PIXEL_8x8;
1836             m->p_cost_mv = a->p_cost_mv;
1837
1838             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1839             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1840
1841             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1842             x264_me_search( h, m, &lX->me16x16.mv, 1 );
1843
1844             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1845
1846             /* BI mode */
1847             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1848                                     m->mv[0], m->mv[1], 8, 8 );
1849             i_part_cost_bi += m->cost_mv;
1850             /* FIXME: ref cost */
1851         }
1852         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1853         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1854                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1855         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1856         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1857
1858         i_part_cost = a->l0.me8x8[i].cost;
1859         h->mb.i_sub_partition[i] = D_L0_8x8;
1860         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1861         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1862         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1863         a->i_cost8x8bi += i_part_cost;
1864
1865         /* XXX Needed for x264_mb_predict_mv */
1866         x264_mb_cache_mv_b8x8( h, a, i, 0 );
1867     }
1868
1869     /* mb type cost */
1870     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1871 }
1872
1873 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1874 {
1875     uint8_t **p_fref[2] =
1876         { h->mb.pic.p_fref[0][a->l0.i_ref],
1877           h->mb.pic.p_fref[1][a->l1.i_ref] };
1878     ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1879     ALIGNED_4( int16_t mvc[2][2] );
1880     int i, l;
1881
1882     h->mb.i_partition = D_16x8;
1883     a->i_cost16x8bi = 0;
1884
1885     for( i = 0; i < 2; i++ )
1886     {
1887         int i_part_cost;
1888         int i_part_cost_bi = 0;
1889         int stride[2] = {16,16};
1890         uint8_t *src[2];
1891
1892         /* TODO: check only the list(s) that were used in b8x8? */
1893         for( l = 0; l < 2; l++ )
1894         {
1895             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1896             x264_me_t *m = &lX->me16x8[i];
1897
1898             m->i_pixel = PIXEL_16x8;
1899             m->p_cost_mv = a->p_cost_mv;
1900
1901             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1902             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1903
1904             *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
1905             *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
1906
1907             x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
1908             x264_me_search( h, m, mvc, 2 );
1909
1910             /* BI mode */
1911             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1912                                     m->mv[0], m->mv[1], 16, 8 );
1913             /* FIXME: ref cost */
1914             i_part_cost_bi += m->cost_mv;
1915         }
1916         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1917         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1918
1919         i_part_cost = a->l0.me16x8[i].cost;
1920         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1921         if( a->l1.me16x8[i].cost < i_part_cost )
1922         {
1923             i_part_cost = a->l1.me16x8[i].cost;
1924             a->i_mb_partition16x8[i] = D_L1_8x8;
1925         }
1926         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1927         {
1928             i_part_cost = i_part_cost_bi;
1929             a->i_mb_partition16x8[i] = D_BI_8x8;
1930         }
1931         a->i_cost16x8bi += i_part_cost;
1932
1933         x264_mb_cache_mv_b16x8( h, a, i, 0 );
1934     }
1935
1936     /* mb type cost */
1937     a->i_mb_type16x8 = B_L0_L0
1938         + (a->i_mb_partition16x8[0]>>2) * 3
1939         + (a->i_mb_partition16x8[1]>>2);
1940     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1941 }
1942
1943 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1944 {
1945     uint8_t **p_fref[2] =
1946         { h->mb.pic.p_fref[0][a->l0.i_ref],
1947           h->mb.pic.p_fref[1][a->l1.i_ref] };
1948     ALIGNED_8( uint8_t pix[2][8*16] );
1949     ALIGNED_4( int16_t mvc[2][2] );
1950     int i, l;
1951
1952     h->mb.i_partition = D_8x16;
1953     a->i_cost8x16bi = 0;
1954
1955     for( i = 0; i < 2; i++ )
1956     {
1957         int i_part_cost;
1958         int i_part_cost_bi = 0;
1959         int stride[2] = {8,8};
1960         uint8_t *src[2];
1961
1962         for( l = 0; l < 2; l++ )
1963         {
1964             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1965             x264_me_t *m = &lX->me8x16[i];
1966
1967             m->i_pixel = PIXEL_8x16;
1968             m->p_cost_mv = a->p_cost_mv;
1969
1970             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1971             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
1972
1973             *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
1974             *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
1975
1976             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1977             x264_me_search( h, m, mvc, 2 );
1978
1979             /* BI mode */
1980             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
1981                                     m->mv[0], m->mv[1], 8, 16 );
1982             /* FIXME: ref cost */
1983             i_part_cost_bi += m->cost_mv;
1984         }
1985
1986         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1987         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1988
1989         i_part_cost = a->l0.me8x16[i].cost;
1990         a->i_mb_partition8x16[i] = D_L0_8x8;
1991         if( a->l1.me8x16[i].cost < i_part_cost )
1992         {
1993             i_part_cost = a->l1.me8x16[i].cost;
1994             a->i_mb_partition8x16[i] = D_L1_8x8;
1995         }
1996         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1997         {
1998             i_part_cost = i_part_cost_bi;
1999             a->i_mb_partition8x16[i] = D_BI_8x8;
2000         }
2001         a->i_cost8x16bi += i_part_cost;
2002
2003         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2004     }
2005
2006     /* mb type cost */
2007     a->i_mb_type8x16 = B_L0_L0
2008         + (a->i_mb_partition8x16[0]>>2) * 3
2009         + (a->i_mb_partition8x16[1]>>2);
2010     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2011 }
2012
2013 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2014 {
2015     int thresh = i_satd * 5/4;
2016
2017     h->mb.i_type = P_L0;
2018     if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2019     {
2020         h->mb.i_partition = D_16x16;
2021         x264_analyse_update_cache( h, a );
2022         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2023     }
2024
2025     if( a->l0.i_cost16x8 <= thresh )
2026     {
2027         h->mb.i_partition = D_16x8;
2028         x264_analyse_update_cache( h, a );
2029         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2030     }
2031     else
2032         a->l0.i_cost16x8 = COST_MAX;
2033
2034     if( a->l0.i_cost8x16 <= thresh )
2035     {
2036         h->mb.i_partition = D_8x16;
2037         x264_analyse_update_cache( h, a );
2038         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2039     }
2040     else
2041         a->l0.i_cost8x16 = COST_MAX;
2042
2043     if( a->l0.i_cost8x8 <= thresh )
2044     {
2045         h->mb.i_type = P_8x8;
2046         h->mb.i_partition = D_8x8;
2047         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2048         {
2049             int i;
2050             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2051             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2052             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2053             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2054             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2055              * for future blocks are those left over from previous RDO calls. */
2056             for( i = 0; i < 4; i++ )
2057             {
2058                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2059                 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2060                 int subtype, btype = D_L0_8x8;
2061                 uint64_t bcost = COST_MAX64;
2062                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2063                 {
2064                     uint64_t cost;
2065                     if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2066                         continue;
2067                     h->mb.i_sub_partition[i] = subtype;
2068                     x264_mb_cache_mv_p8x8( h, a, i );
2069                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2070                     COPY2_IF_LT( bcost, cost, btype, subtype );
2071                 }
2072                 h->mb.i_sub_partition[i] = btype;
2073                 x264_mb_cache_mv_p8x8( h, a, i );
2074             }
2075         }
2076         else
2077             x264_analyse_update_cache( h, a );
2078         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2079     }
2080     else
2081         a->l0.i_cost8x8 = COST_MAX;
2082 }
2083
2084 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2085 {
2086     int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2087
2088     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2089     {
2090         h->mb.i_type = B_DIRECT;
2091         /* Assumes direct/skip MC is still in fdec */
2092         /* Requires b-rdo to be done before intra analysis */
2093         h->mb.b_skip_mc = 1;
2094         x264_analyse_update_cache( h, a );
2095         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2096         h->mb.b_skip_mc = 0;
2097     }
2098
2099     //FIXME not all the update_cache calls are needed
2100     h->mb.i_partition = D_16x16;
2101     /* L0 */
2102     if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2103     {
2104         h->mb.i_type = B_L0_L0;
2105         x264_analyse_update_cache( h, a );
2106         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2107     }
2108
2109     /* L1 */
2110     if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2111     {
2112         h->mb.i_type = B_L1_L1;
2113         x264_analyse_update_cache( h, a );
2114         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2115     }
2116
2117     /* BI */
2118     if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2119     {
2120         h->mb.i_type = B_BI_BI;
2121         x264_analyse_update_cache( h, a );
2122         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2123     }
2124
2125     /* 8x8 */
2126     if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2127     {
2128         h->mb.i_type = B_8x8;
2129         h->mb.i_partition = D_8x8;
2130         x264_analyse_update_cache( h, a );
2131         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2132         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2133     }
2134
2135     /* 16x8 */
2136     if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2137     {
2138         h->mb.i_type = a->i_mb_type16x8;
2139         h->mb.i_partition = D_16x8;
2140         x264_analyse_update_cache( h, a );
2141         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2142     }
2143
2144     /* 8x16 */
2145     if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2146     {
2147         h->mb.i_type = a->i_mb_type8x16;
2148         h->mb.i_partition = D_8x16;
2149         x264_analyse_update_cache( h, a );
2150         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2151     }
2152 }
2153
2154 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2155 {
2156     const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2157     int i;
2158
2159     if( IS_INTRA(h->mb.i_type) )
2160         return;
2161
2162     switch( h->mb.i_partition )
2163     {
2164         case D_16x16:
2165             if( h->mb.i_type == B_BI_BI )
2166                 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2167             break;
2168         case D_16x8:
2169             for( i=0; i<2; i++ )
2170                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2171                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2172             break;
2173         case D_8x16:
2174             for( i=0; i<2; i++ )
2175                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2176                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2177             break;
2178         case D_8x8:
2179             for( i=0; i<4; i++ )
2180                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2181                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2182             break;
2183     }
2184 }
2185
2186 static inline void x264_mb_analyse_transform( x264_t *h )
2187 {
2188     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2189     {
2190         int i_cost4, i_cost8;
2191         /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2192         x264_mb_mc( h );
2193
2194         i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2195                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2196         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2197                                              h->mb.pic.p_fdec[0], FDEC_STRIDE );
2198
2199         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2200         h->mb.b_skip_mc = 1;
2201     }
2202 }
2203
2204 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2205 {
2206     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2207     {
2208         int i_rd8;
2209         x264_analyse_update_cache( h, a );
2210         h->mb.b_transform_8x8 ^= 1;
2211         /* FIXME only luma is needed, but the score for comparison already includes chroma */
2212         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2213
2214         if( *i_rd >= i_rd8 )
2215         {
2216             if( *i_rd > 0 )
2217                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2218             *i_rd = i_rd8;
2219         }
2220         else
2221             h->mb.b_transform_8x8 ^= 1;
2222     }
2223 }
2224
2225 /* Rate-distortion optimal QP selection.
2226  * FIXME: More than half of the benefit of this function seems to be
2227  * in the way it improves the coding of chroma DC (by decimating or
2228  * finding a better way to code a single DC coefficient.)
2229  * There must be a more efficient way to get that portion of the benefit
2230  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2231  * trick. */
2232 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2233 {
2234     int bcost, cost, direction, failures, prevcost, origcost;
2235     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2236     int last_qp_tried = 0;
2237     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2238
2239     /* If CBP is already zero, don't raise the quantizer any higher. */
2240     for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2241     {
2242         /* Without psy-RD, require monotonicity when moving quant away from previous
2243          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2244          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2245          * allow 2 failures when moving quant towards previous quant.
2246          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2247         int threshold = (!!h->mb.i_psy_rd);
2248         /* Raise the threshold for failures if we're moving towards the last QP. */
2249         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2250             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2251             threshold++;
2252         h->mb.i_qp = orig_qp;
2253         failures = 0;
2254         prevcost = origcost;
2255         h->mb.i_qp += direction;
2256         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2257         {
2258             if( h->mb.i_last_qp == h->mb.i_qp )
2259                 last_qp_tried = 1;
2260             h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2261             cost = x264_rd_cost_mb( h, a->i_lambda2 );
2262             COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2263
2264             /* We can't assume that the costs are monotonic over QPs.
2265              * Tie case-as-failure seems to give better results. */
2266             if( cost < prevcost )
2267                 failures = 0;
2268             else
2269                 failures++;
2270             prevcost = cost;
2271
2272             if( failures > threshold )
2273                 break;
2274             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2275                 break;
2276             h->mb.i_qp += direction;
2277         }
2278     }
2279
2280     /* Always try the last block's QP. */
2281     if( !last_qp_tried )
2282     {
2283         h->mb.i_qp = h->mb.i_last_qp;
2284         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2285         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2286         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2287     }
2288
2289     h->mb.i_qp = bqp;
2290     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2291
2292     /* Check transform again; decision from before may no longer be optimal. */
2293     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2294         x264_mb_transform_8x8_allowed( h ) )
2295     {
2296         h->mb.b_transform_8x8 ^= 1;
2297         cost = x264_rd_cost_mb( h, a->i_lambda2 );
2298         if( cost > bcost )
2299             h->mb.b_transform_8x8 ^= 1;
2300     }
2301 }
2302
2303 /*****************************************************************************
2304  * x264_macroblock_analyse:
      *   Mode decision for the current macroblock of encoder context h.
      *   Sets h->mb.i_qp, then, depending on the slice type (I, P or B), sets
      *   h->mb.i_type and (for inter types) h->mb.i_partition and
      *   h->mb.i_sub_partition[].  The decision is normally committed through
      *   x264_analyse_update_cache() at the end, followed by the transform,
      *   QP-RD and trellis/noise-reduction setup.
      *
      *   Returns 0 on success, or -1 when x264_mb_analyse_load_costs() fails.
      *   Note the two early "return 0" paths (P_SKIP found by the 16x16 search,
      *   B_SKIP chosen by RD) which bypass the common tail of the function.
2305  *****************************************************************************/
2306 int x264_macroblock_analyse( x264_t *h )
2307 {
2308     x264_mb_analysis_t analysis;
2309     int i_cost = COST_MAX;
2310     int i;
2311
         /* Start from the QP given by ratecontrol; x264_adaptive_quant() is
          * presumably what adjusts h->mb.i_qp per macroblock (defined elsewhere). */
2312     h->mb.i_qp = x264_ratecontrol_qp( h );
2313     if( h->param.rc.i_aq_mode )
2314     {
2315         x264_adaptive_quant( h );
2316         /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2317          * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
2318         if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2319             h->mb.i_qp = h->mb.i_last_qp;
2320     }
2321
2322     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2323
2324     /*--------------------------- Do the analysis ---------------------------*/
2325     if( h->sh.i_type == SLICE_TYPE_I )
2326     {
2327         if( analysis.i_mbrd )
2328             x264_mb_cache_fenc_satd( h );
2329         x264_mb_analyse_intra( h, &analysis, COST_MAX );
2330         if( analysis.i_mbrd )
2331             x264_intra_rd( h, &analysis, COST_MAX );
2332
             /* Pick the cheapest intra type, starting from I_16x16. */
2333         i_cost = analysis.i_satd_i16x16;
2334         h->mb.i_type = I_16x16;
2335         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2336         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2337         if( analysis.i_satd_pcm < i_cost )
2338             h->mb.i_type = I_PCM;
2339
             /* RD refinement of the intra modes is skipped when PCM was chosen. */
2340         else if( analysis.i_mbrd >= 2 )
2341             x264_intra_rd_refine( h, &analysis );
2342     }
2343     else if( h->sh.i_type == SLICE_TYPE_P )
2344     {
2345         int b_skip = 0;
2346
2347         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2348
2349         /* Fast P_SKIP detection */
             /* Three outcomes when fast pskip is enabled:
              *  - multi-threaded and the skip MV's y component exceeds
              *    mv_max_spel[1]: nothing is tried here;
              *  - i_subpel_refine >= 3: only b_try_pskip is set (presumably
              *    consumed by x264_mb_analyse_inter_p16x16 - defined elsewhere);
              *  - otherwise the skip probe runs immediately, but only if one of
              *    the left/top/topleft/topright neighbours is P_SKIP. */
2350         analysis.b_try_pskip = 0;
2351         if( h->param.analyse.b_fast_pskip )
2352         {
2353             if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2354                 // FIXME don't need to check this if the reference frame is done
2355                 {}
2356             else if( h->param.analyse.i_subpel_refine >= 3 )
2357                 analysis.b_try_pskip = 1;
2358             else if( h->mb.i_mb_type_left == P_SKIP ||
2359                      h->mb.i_mb_type_top == P_SKIP ||
2360                      h->mb.i_mb_type_topleft == P_SKIP ||
2361                      h->mb.i_mb_type_topright == P_SKIP )
2362                 b_skip = x264_macroblock_probe_pskip( h );
2363         }
2364
2365         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2366
2367         if( b_skip )
2368         {
2369             h->mb.i_type = P_SKIP;
2370             h->mb.i_partition = D_16x16;
2371             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
2372         }
2373         else
2374         {
2375             const unsigned int flags = h->param.analyse.inter;
2376             int i_type;
2377             int i_partition;
2378             int i_thresh16x8;
2379             int i_satd_inter, i_satd_intra;
2380
2381             if( x264_mb_analyse_load_costs( h, &analysis ) )
2382                 return -1;
2383
2384             x264_mb_analyse_inter_p16x16( h, &analysis );
2385
                 /* The 16x16 search may itself have settled on P_SKIP; in that case
                  * return right away, bypassing x264_analyse_update_cache() and the
                  * rest of the common tail at the bottom of this function. */
2386             if( h->mb.i_type == P_SKIP )
2387                 return 0;
2388
2389             if( flags & X264_ANALYSE_PSUB16x16 )
2390             {
2391                 if( h->param.analyse.b_mixed_references )
2392                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2393                 else
2394                     x264_mb_analyse_inter_p8x8( h, &analysis );
2395             }
2396
2397             /* Select best inter mode */
2398             i_type = P_L0;
2399             i_partition = D_16x16;
2400             i_cost = analysis.l0.me16x16.cost;
2401
2402             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2403                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2404             {
2405                 i_type = P_8x8;
2406                 i_partition = D_8x8;
2407                 i_cost = analysis.l0.i_cost8x8;
2408
2409                 /* Do sub 8x8 */
2410                 if( flags & X264_ANALYSE_PSUB8x8 )
2411                 {
2412                     for( i = 0; i < 4; i++ )
2413                     {
                             /* 8x4 and 4x8 are only searched when 4x4 already beat
                              * the plain 8x8 block for this quadrant. */
2414                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
2415                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2416                         {
2417                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
2418                             h->mb.i_sub_partition[i] = D_L0_4x4;
2419
2420                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
2421                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2422                                          h->mb.i_sub_partition[i], D_L0_8x4 );
2423
2424                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
2425                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2426                                          h->mb.i_sub_partition[i], D_L0_4x8 );
2427
                                 /* Replace this quadrant's 8x8 cost by the best
                                  * sub-partition cost in the running total. */
2428                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2429                         }
2430                         x264_mb_cache_mv_p8x8( h, &analysis, i );
2431                     }
2432                     analysis.l0.i_cost8x8 = i_cost;
2433                 }
2434             }
2435
2436             /* Now do 16x8/8x16 */
                 /* Only attempted when the 8x8 cost is below the 16x16 cost plus the
                  * MV cost of 8x8 partitions 1 and 2 (used as a slack term). */
2437             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2438             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2439                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2440             {
2441                 x264_mb_analyse_inter_p16x8( h, &analysis );
2442                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2443
2444                 x264_mb_analyse_inter_p8x16( h, &analysis );
2445                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2446             }
2447
2448             h->mb.i_partition = i_partition;
2449
2450             /* refine qpel */
2451             //FIXME mb_type costs?
                 /* Without RD, refine the MVs of the winning partition and recompute
                  * i_cost as the sum of the refined partition costs. */
2452             if( analysis.i_mbrd )
2453             {
2454                 /* refine later */
2455             }
2456             else if( i_partition == D_16x16 )
2457             {
2458                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2459                 i_cost = analysis.l0.me16x16.cost;
2460             }
2461             else if( i_partition == D_16x8 )
2462             {
2463                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2464                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2465                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2466             }
2467             else if( i_partition == D_8x16 )
2468             {
2469                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2470                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2471                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2472             }
2473             else if( i_partition == D_8x8 )
2474             {
2475                 int i8x8;
2476                 i_cost = 0;
2477                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2478                 {
2479                     switch( h->mb.i_sub_partition[i8x8] )
2480                     {
2481                         case D_L0_8x8:
2482                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2483                             i_cost += analysis.l0.me8x8[i8x8].cost;
2484                             break;
2485                         case D_L0_8x4:
2486                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2487                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2488                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
2489                                       analysis.l0.me8x4[i8x8][1].cost;
2490                             break;
2491                         case D_L0_4x8:
2492                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2493                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2494                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
2495                                       analysis.l0.me4x8[i8x8][1].cost;
2496                             break;
2497
2498                         case D_L0_4x4:
2499                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2500                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2501                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2502                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2503                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
2504                                       analysis.l0.me4x4[i8x8][1].cost +
2505                                       analysis.l0.me4x4[i8x8][2].cost +
2506                                       analysis.l0.me4x4[i8x8][3].cost;
2507                             break;
2508                         default:
2509                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2510                             break;
2511                     }
2512                 }
2513             }
2514
                 /* Intra candidates, bounded by the inter cost.  With chroma ME the
                  * chroma intra cost is subtracted from the bound passed to the luma
                  * analysis and afterwards added to each luma intra cost (presumably
                  * because the inter cost then already includes chroma). */
2515             if( h->mb.b_chroma_me )
2516             {
2517                 x264_mb_analyse_intra_chroma( h, &analysis );
2518                 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2519                 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2520                 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2521                 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2522             }
2523             else
2524                 x264_mb_analyse_intra( h, &analysis, i_cost );
2525
2526             i_satd_inter = i_cost;
2527             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2528                                       analysis.i_satd_i8x8,
2529                                       analysis.i_satd_i4x4 );
2530
2531             if( analysis.i_mbrd )
2532             {
                     /* Redo the inter selection from the values left behind by
                      * x264_mb_analyse_p_rd(): the 16x16 candidate uses i_rd16x16,
                      * the others reuse the i_cost16x8/8x16/8x8 fields (presumably
                      * overwritten with RD costs by that call - defined elsewhere). */
2533                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2534                 i_type = P_L0;
2535                 i_partition = D_16x16;
2536                 i_cost = analysis.l0.i_rd16x16;
2537                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2538                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2539                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2540                 h->mb.i_type = i_type;
2541                 h->mb.i_partition = i_partition;
2542                 if( i_cost < COST_MAX )
2543                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2544                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2545             }
2546
                 /* Final inter-vs-intra choice: an intra type wins only if strictly
                  * cheaper than the best cost so far. */
2547             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2548             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2549             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2550             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2551
2552             h->mb.i_type = i_type;
2553
                 /* i_mbrd >= 2: RD refinement of the chosen mode (intra modes or the
                  * MVs of every partition of the chosen inter mode); never for PCM. */
2554             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2555             {
2556                 if( IS_INTRA( h->mb.i_type ) )
2557                 {
2558                     x264_intra_rd_refine( h, &analysis );
2559                 }
2560                 else if( i_partition == D_16x16 )
2561                 {
2562                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2563                     analysis.l0.me16x16.cost = analysis.l0.i_rd16x16;
2564                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2565                 }
2566                 else if( i_partition == D_16x8 )
2567                 {
2568                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2569                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2570                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2571                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2572                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2573                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2574                 }
2575                 else if( i_partition == D_8x16 )
2576                 {
2577                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2578                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2579                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2580                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2581                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2582                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2583                 }
2584                 else if( i_partition == D_8x8 )
2585                 {
2586                     int i8x8;
2587                     x264_analyse_update_cache( h, &analysis );
2588                     for( i8x8 = 0; i8x8 < 4; i8x8++ )
2589                     {
2590                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2591                         {
2592                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2593                         }
2594                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2595                         {
2596                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2597                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2598                         }
2599                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2600                         {
2601                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2602                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2603                         }
2604                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2605                         {
2606                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2607                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2608                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2609                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2610                         }
2611                     }
2612                 }
2613             }
2614         }
2615     }
2616     else if( h->sh.i_type == SLICE_TYPE_B )
2617     {
2618         int i_bskip_cost = COST_MAX;
2619         int b_skip = 0;
2620
2621         if( analysis.i_mbrd )
2622             x264_mb_cache_fenc_satd( h );
2623
2624         h->mb.i_type = B_SKIP;
2625         if( h->mb.b_direct_auto_write )
2626         {
2627             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
                 /* The spatial/temporal flag is toggled on each of the two passes, so
                  * both modes are probed and the flag ends up at its original value.
                  * b_changed is only requested from the predictor on the second pass
                  * (and only if the first pass was available); when it comes back 0
                  * the previous b_skip result is reused for the score. */
2628             for( i = 0; i < 2; i++ )
2629             {
2630                 int b_changed = 1;
2631                 h->sh.b_direct_spatial_mv_pred ^= 1;
2632                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2633                 if( analysis.b_direct_available )
2634                 {
2635                     if( b_changed )
2636                     {
2637                         x264_mb_mc( h );
2638                         b_skip = x264_macroblock_probe_bskip( h );
2639                     }
2640                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2641                 }
2642                 else
2643                     b_skip = 0;
2644             }
2645         }
2646         else
2647             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2648
2649         if( analysis.b_direct_available )
2650         {
2651             if( !h->mb.b_direct_auto_write )
2652                 x264_mb_mc( h );
2653             if( analysis.i_mbrd )
2654             {
                     /* With RD the skip decision is an SSD threshold rather than
                      * the probe; the same result is also stored in b_skip_mc. */
2655                 i_bskip_cost = ssd_mb( h );
2656                 /* 6 = minimum cavlc cost of a non-skipped MB */
2657                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2658             }
2659             else if( !h->mb.b_direct_auto_write )
2660             {
2661                 /* Conditioning the probe on neighboring block types
2662                  * doesn't seem to help speed or quality. */
2663                 b_skip = x264_macroblock_probe_bskip( h );
2664             }
2665         }
2666
             /* If b_skip is set, h->mb.i_type stays B_SKIP (assigned above). */
2667         if( !b_skip )
2668         {
2669             const unsigned int flags = h->param.analyse.inter;
2670             int i_type;
2671             int i_partition;
2672             int i_satd_inter;
2673             h->mb.b_skip_mc = 0;
2674
2675             if( x264_mb_analyse_load_costs( h, &analysis ) )
2676                 return -1;
2677
2678             /* select best inter mode */
2679             /* direct must be first */
2680             if( analysis.b_direct_available )
2681                 x264_mb_analyse_inter_direct( h, &analysis );
2682
2683             x264_mb_analyse_inter_b16x16( h, &analysis );
2684
2685             i_type = B_L0_L0;
2686             i_partition = D_16x16;
2687             i_cost = analysis.l0.me16x16.cost;
2688             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2689             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2690             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2691
                 /* Early termination: when direct is within 33/32 of the best 16x16
                  * cost, run RD now, and if the skip cost beats all four 16x16 RD
                  * costs commit B_SKIP and return without further analysis. */
2692             if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2693             {
2694                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2695                 if( i_bskip_cost < analysis.i_rd16x16direct &&
2696                     i_bskip_cost < analysis.i_rd16x16bi &&
2697                     i_bskip_cost < analysis.l0.i_rd16x16 &&
2698                     i_bskip_cost < analysis.l1.i_rd16x16 )
2699                 {
2700                     h->mb.i_type = B_SKIP;
2701                     x264_analyse_update_cache( h, &analysis );
2702                     return 0;
2703                 }
2704             }
2705
2706             if( flags & X264_ANALYSE_BSUB16x16 )
2707             {
2708                 x264_mb_analyse_inter_b8x8( h, &analysis );
2709                 if( analysis.i_cost8x8bi < i_cost )
2710                 {
2711                     i_type = B_8x8;
2712                     i_partition = D_8x8;
2713                     i_cost = analysis.i_cost8x8bi;
2714
                         /* 16x8 (resp. 8x16) is only tried if at least one pair of
                          * horizontally (resp. vertically) adjacent 8x8 blocks chose
                          * the same sub-partition type. */
2715                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2716                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2717                     {
2718                         x264_mb_analyse_inter_b16x8( h, &analysis );
2719                         COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2720                                      i_type, analysis.i_mb_type16x8,
2721                                      i_partition, D_16x8 );
2722                     }
2723                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2724                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2725                     {
2726                         x264_mb_analyse_inter_b8x16( h, &analysis );
2727                         COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2728                                      i_type, analysis.i_mb_type8x16,
2729                                      i_partition, D_8x16 );
2730                     }
2731                 }
2732             }
2733
2734             if( analysis.i_mbrd )
2735             {
2736                 /* refine later */
2737             }
2738             /* refine qpel */
2739             else if( i_partition == D_16x16 )
2740             {
                     /* The MB-type cost is removed from both lists' costs before
                      * refinement and added back when forming i_cost.  For B_BI_BI
                      * both MVs are refined but i_cost is left as it was; for
                      * B_DIRECT nothing is refined. */
2741                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2742                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2743                 if( i_type == B_L0_L0 )
2744                 {
2745                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2746                     i_cost = analysis.l0.me16x16.cost
2747                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2748                 }
2749                 else if( i_type == B_L1_L1 )
2750                 {
2751                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2752                     i_cost = analysis.l1.me16x16.cost
2753                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2754                 }
2755                 else if( i_type == B_BI_BI )
2756                 {
2757                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2758                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2759                 }
2760             }
2761             else if( i_partition == D_16x8 )
2762             {
                     /* Refine list0 unless the half is L1-only, and list1 unless it
                      * is L0-only; i_cost is not updated here. */
2763                 for( i=0; i<2; i++ )
2764                 {
2765                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2766                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2767                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2768                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2769                 }
2770             }
2771             else if( i_partition == D_8x16 )
2772             {
2773                 for( i=0; i<2; i++ )
2774                 {
2775                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2776                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2777                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2778                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2779                 }
2780             }
2781             else if( i_partition == D_8x8 )
2782             {
2783                 for( i=0; i<4; i++ )
2784                 {
2785                     x264_me_t *m;
2786                     int i_part_cost_old;
2787                     int i_type_cost;
2788                     int i_part_type = h->mb.i_sub_partition[i];
2789                     int b_bidir = (i_part_type == D_BI_8x8);
2790
                         /* Direct sub-blocks have no MV to refine.  For the others,
                          * refine each list the sub-partition uses; the cost change
                          * is folded into i_cost8x8bi only for single-list blocks. */
2791                     if( i_part_type == D_DIRECT_8x8 )
2792                         continue;
2793                     if( x264_mb_partition_listX_table[0][i_part_type] )
2794                     {
2795                         m = &analysis.l0.me8x8[i];
2796                         i_part_cost_old = m->cost;
2797                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2798                         m->cost -= i_type_cost;
2799                         x264_me_refine_qpel( h, m );
2800                         if( !b_bidir )
2801                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2802                     }
2803                     if( x264_mb_partition_listX_table[1][i_part_type] )
2804                     {
2805                         m = &analysis.l1.me8x8[i];
2806                         i_part_cost_old = m->cost;
2807                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2808                         m->cost -= i_type_cost;
2809                         x264_me_refine_qpel( h, m );
2810                         if( !b_bidir )
2811                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2812                     }
2813                     /* TODO: update mvp? */
2814                 }
2815             }
2816
2817             i_satd_inter = i_cost;
2818
2819             if( analysis.i_mbrd )
2820             {
                     /* Re-select type and partition from the RD costs, with plain
                      * skip (i_bskip_cost) as the starting candidate. */
2821                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2822                 i_type = B_SKIP;
2823                 i_cost = i_bskip_cost;
2824                 i_partition = D_16x16;
2825                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2826                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2827                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2828                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2829                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2830                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2831                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2832
2833                 h->mb.i_type = i_type;
2834                 h->mb.i_partition = i_partition;
2835             }
2836
2837             x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2838
2839             if( analysis.i_mbrd )
2840             {
2841                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2842                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2843             }
2844
                 /* Final inter-vs-intra choice. */
2845             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2846             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2847             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2848             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2849
2850             h->mb.i_type = i_type;
2851             h->mb.i_partition = i_partition;
2852
2853             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2854                 x264_intra_rd_refine( h, &analysis );
2855             if( h->mb.i_subpel_refine >= 5 )
2856                 x264_refine_bidir( h, &analysis );
2857
                 /* i_mbrd >= 2: RD refinement of the MVs, restricted to types strictly
                  * between B_DIRECT and B_SKIP in the enum order (which types that
                  * covers depends on the enum, declared elsewhere).  Single-list
                  * partitions use qpel RD, bi-predicted ones the bidir RD search. */
2858             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2859             {
2860                 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2861                 x264_analyse_update_cache( h, &analysis );
2862
2863                 if( i_partition == D_16x16 )
2864                 {
2865                     if( i_type == B_L0_L0 )
2866                     {
2867                         analysis.l0.me16x16.cost = analysis.l0.i_rd16x16;
2868                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2869                     }
2870                     else if( i_type == B_L1_L1 )
2871                     {
2872                         analysis.l1.me16x16.cost = analysis.l1.i_rd16x16;
2873                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2874                     }
2875                     else if( i_type == B_BI_BI )
2876                         x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2877                 }
2878                 else if( i_partition == D_16x8 )
2879                 {
2880                     for( i = 0; i < 2; i++ )
2881                     {
2882                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2883                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2884                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2885                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2886                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2887                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2888                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2889                     }
2890                 }
2891                 else if( i_partition == D_8x16 )
2892                 {
2893                     for( i = 0; i < 2; i++ )
2894                     {
2895                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2896                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2897                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2898                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2899                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2900                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2901                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2902                     }
2903                 }
2904                 else if( i_partition == D_8x8 )
2905                 {
2906                     for( i = 0; i < 4; i++ )
2907                     {
2908                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2909                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2910                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2911                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2912                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2913                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
2914                     }
2915                 }
2916             }
2917         }
2918     }
2919
         /* Common tail: commit the decision to the MB cache, then pick the
          * transform (non-RD only) and, at i_mbrd == 3, run the per-MB QP search
          * for non-skipped macroblocks. */
2920     x264_analyse_update_cache( h, &analysis );
2921
2922     if( !analysis.i_mbrd )
2923         x264_mb_analyse_transform( h );
2924
2925     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2926         x264_mb_analyse_qp_rd( h, &analysis );
2927
         /* Per-MB trellis / noise-reduction flags; i_skip_intra is cleared
          * whenever trellis level 1 or noise reduction is active. */
2928     h->mb.b_trellis = h->param.analyse.i_trellis;
2929     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2930     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2931         x264_psy_trellis_init( h, 0 );
2932     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2933         h->mb.i_skip_intra = 0;
2934     return 0;
2935 }
2936
2937 /*-------------------- Update MB from the analysis ----------------------*/
/* x264_analyse_update_cache:
 *   Stores the outcome of macroblock analysis into the encoder's per-MB
 *   state.  Dispatches on h->mb.i_type and, for partitioned inter types,
 *   on h->mb.i_partition.
 *     h: encoder context.  h->mb.i_type / h->mb.i_partition are read as the
 *        already-selected mode; h->mb.cache and a few other h->mb fields
 *        are written.
 *     a: analysis results.  Supplies the intra prediction modes
 *        (i_predict4x4 / i_predict8x8 / i_predict16x16) and the i_ref / mv
 *        values held in a->l0 and a->l1.
 *   Returns nothing.  Unless NDEBUG is defined, a trailing sanity check can
 *   replace an inter macroblock with I_16x16 (see the end of the function).
 *   NOTE(review): the x264_macroblock_cache_*() helpers are defined outside
 *   this chunk.  Judging from the call sites their arguments look like
 *   (x, y, width, height) in units of 4x4 blocks, followed by the list
 *   index and the value to store -- confirm against their definitions. */
2938 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
2939 {
2940     int i;
2941
2942     switch( h->mb.i_type )
2943     {
             /* Intra types: store the luma prediction mode(s), then call
              * x264_mb_analyse_intra_chroma() (every intra case except I_PCM
              * does this). */
2944         case I_4x4:
                 /* One mode per 4x4 block, written at the block's
                  * x264_scan8[] position in the cache. */
2945             for( i = 0; i < 16; i++ )
2946                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
2947
2948             x264_mb_analyse_intra_chroma( h, a );
2949             break;
2950         case I_8x8:
                 /* (2*(i&1), 2*(i>>1)) visits the four 8x8 blocks as
                  * (0,0), (2,0), (0,2), (2,2). */
2951             for( i = 0; i < 4; i++ )
2952                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
2953
2954             x264_mb_analyse_intra_chroma( h, a );
2955             break;
2956         case I_16x16:
2957             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2958             x264_mb_analyse_intra_chroma( h, a );
2959             break;
2960
             /* I_PCM: nothing is written by this function. */
2961         case I_PCM:
2962             break;
2963
             /* P_L0: list-0 ref and mv for each partition of the chosen
              * partitioning, taken from the matching a->l0.me* entry. */
2964         case P_L0:
2965             switch( h->mb.i_partition )
2966             {
2967                 case D_16x16:
2968                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
2969                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
2970                     break;
2971
                     /* Two partitions at y = 0 and y = 2. */
2972                 case D_16x8:
2973                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
2974                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
2975                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
2976                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
2977                     break;
2978
                     /* Two partitions at x = 0 and x = 2. */
2979                 case D_8x16:
2980                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
2981                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
2982                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
2983                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
2984                     break;
2985
                     /* Any other partition value is only logged; nothing is
                      * cached for it. */
2986                 default:
2987                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
2988                     break;
2989             }
2990             break;
2991
             /* P_8x8: all four refs are stored first, then the mvs are
              * stored through x264_mb_cache_mv_p8x8() (defined outside this
              * chunk). */
2992         case P_8x8:
2993             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2994             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2995             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2996             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2997             for( i = 0; i < 4; i++ )
2998                 x264_mb_cache_mv_p8x8( h, a, i );
2999             break;
3000
             /* P_SKIP: force a single 16x16 partition using list-0 ref 0
              * and the mv held in h->mb.cache.pskip_mv. */
3001         case P_SKIP:
3002         {
3003             h->mb.i_partition = D_16x16;
3004             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3005             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3006             break;
3007         }
3008
             /* B_SKIP and B_DIRECT share one path: the direct-mode loader
              * is called once for each of the four 8x8 indices. */
3009         case B_SKIP:
3010         case B_DIRECT:
3011             x264_mb_load_mv_direct8x8( h, 0 );
3012             x264_mb_load_mv_direct8x8( h, 1 );
3013             x264_mb_load_mv_direct8x8( h, 2 );
3014             x264_mb_load_mv_direct8x8( h, 3 );
3015             break;
3016
             /* NOTE(review): the meaning of the trailing 1 passed to the
              * x264_mb_cache_mv_b*() helpers is not visible in this chunk. */
3017         case B_8x8:
3018             /* optimize: cache might not need to be rewritten */
3019             for( i = 0; i < 4; i++ )
3020                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3021             break;
3022
3023         default: /* the rest of the B types */
3024             switch( h->mb.i_partition )
3025             {
                 /* 16x16: the B type selects which list(s) receive the
                  * analysed ref/mv.  Types not listed in the inner switch
                  * write nothing (it has no default). */
3026             case D_16x16:
3027                 switch( h->mb.i_type )
3028                 {
3029                 case B_L0_L0:
3030                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3031                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3032
                         /* List 1 gets ref -1 with mv and mvd set to 0. */
3033                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3034                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3035                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3036                     break;
3037                 case B_L1_L1:
                         /* Mirror of B_L0_L0: list 0 gets ref -1 with mv
                          * and mvd set to 0, list 1 gets the analysed data. */
3038                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3039                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3040                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3041
3042                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3043                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3044                     break;
3045                 case B_BI_BI:
                         /* Both lists receive their analysed ref and mv. */
3046                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3047                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3048
3049                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3050                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3051                     break;
3052                 }
3053                 break;
                 /* 16x8 and 8x16: one helper call per partition index
                  * (0 and 1); the helpers are defined outside this chunk. */
3054             case D_16x8:
3055                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3056                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3057                 break;
3058             case D_8x16:
3059                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3060                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3061                 break;
3062             default:
3063                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3064                 break;
3065             }
3066     }
3067
     /* Debug-only sanity check (compiled out when NDEBUG is defined).
      * Runs when more than one thread is configured and the macroblock is
      * not intra.  For list 0, and also list 1 in B slices, it compares the
      * cached mv of block 15 against the reference frame's
      * i_lines_completed counter. */
3068 #ifndef NDEBUG
3069     if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3070     {
3071         int l;
3072         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3073         {
3074             int completed;
3075             int ref = h->mb.cache.ref[l][x264_scan8[0]];
                 /* A negative ref is skipped: there is no frame to check. */
3076             if( ref < 0 )
3077                 continue;
                 /* The ref index is shifted right by b_interlaced before it
                  * indexes the frame list -- presumably field refs map two
                  * to one onto frames; TODO confirm. */
3078             completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
                 /* Vertical mv component, scaled down by (2 - b_interlaced)
                  * bits, plus this MB's top row (i_mb_y*16).  Presumably
                  * this converts quarter-pel units to picture lines --
                  * TODO confirm. */
3079             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3080             {
                     /* Diagnostics: the details are printed directly to
                      * stderr rather than through x264_log(). */
3081                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3082                 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3083                 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3084                                 h->mb.cache.mv[l][x264_scan8[15]][0],
3085                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
3086                 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3087                 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3088                 fprintf(stderr, "completed: %d \n", completed );
                     /* Recovery: re-run intra analysis and force the MB to
                      * I_16x16 with the 16x16 mode it produced.
                      * NOTE(review): the loop is not exited afterwards, so
                      * in B slices the list-1 iteration still runs against
                      * the unchanged cache. */
3089                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3090                 x264_mb_analyse_intra( h, a, COST_MAX );
3091                 h->mb.i_type = I_16x16;
3092                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3093                 x264_mb_analyse_intra_chroma( h, a );
3094             }
3095         }
3096     }
3097 #endif
3098 }
3099
3100 #include "slicetype.c"
3101