git.sesse.net Git - x264/blob - encoder/analyse.c

   1 /*****************************************************************************
   2  * analyse.c: macroblock analysis
   3  *****************************************************************************
   4  * Copyright (C) 2003-2015 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *
  24  * This program is also available under a commercial proprietary license.
  25  * For more information, contact us at licensing@x264.com.
  26  *****************************************************************************/
  27
  28 #define _ISOC99_SOURCE
  29
  30 #include "common/common.h"
  31 #include "macroblock.h"
  32 #include "me.h"
  33 #include "ratecontrol.h"
  34 #include "analyse.h"
  35 #include "rdo.c"
  36
/* Inter-prediction analysis results for one reference list.
 * x264_mb_analysis_t holds one instance per list (its l0 and l1 members).
 * For every partition shape the struct keeps a cost plus the x264_me_t
 * motion-search state of each partition of that shape.  The scalar costs are
 * reset to COST_MAX by x264_mb_analyse_init() before a macroblock is analysed. */
typedef struct
{
    /* 16x16 */
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];     /* one search per 8x8 partition */

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];  /* [8x8 partition][4x4 block within it] */

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];  /* [8x8 partition][8x4 half within it] */

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];  /* [8x8 partition][4x8 half within it] */

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];    /* one search per 16x8 partition */

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];    /* one search per 8x16 partition */

} x264_mb_analysis_list_t;
  71
/* Working state for the analysis of one macroblock: the lambda/QP the
 * decision is made under, the intra costs and chosen prediction modes, and
 * the inter search results for both reference lists.
 * Filled in by x264_mb_analyse_init() and x264_mb_analyse_init_qp(). */
typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;               /* x264_lambda_tab[qp] */
    int i_lambda2;              /* x264_lambda2_tab[qp] */
    int i_qp;                   /* QP actually used (clamped to QP_MAX_SPEC) */
    uint16_t *p_cost_mv;        /* h->cost_mv[i_qp], set by x264_mb_analyse_load_costs() */
    uint16_t *p_cost_ref[2];    /* per-list row of x264_cost_ref[i_qp] */
    int i_mbrd;                 /* 0: none, 1: RD mode decision, 2: RD refinement, 3: QPRD */


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;             /* PCM cost, or COST_MAX when PCM is not considered */

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;      /* set when i_subpel_refine < 11 */

} x264_mb_analysis_t;
 138
 139 /* lambda = pow(2,qp/6-2) */
 140 const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
 141 {
 142    1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
 143    1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
 144    2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
 145    4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
 146   10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
 147   25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
 148   64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
 149  161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
 150  406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
 151 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
 152 2048,2299,                               /* 80-81 */
 153 };
 154
/* lambda2 = pow(lambda,2) * .9 * 256, with lambda = pow(2,qp/6-2) taken
 * before rounding.  One entry per QP in [0, QP_MAX_MAX]. */
/* Capped to avoid overflow: every entry from QP 70 upwards is 2^27-1. */
const int x264_lambda2_tab[QP_MAX_MAX+1] =
{
       14,       18,       22,       28,       36,       45,      57,      72, /*  0- 7 */
       91,      115,      145,      182,      230,      290,     365,     460, /*  8-15 */
      580,      731,      921,     1161,     1462,     1843,    2322,    2925, /* 16-23 */
     3686,     4644,     5851,     7372,     9289,    11703,   14745,   18578, /* 24-31 */
    23407,    29491,    37156,    46814,    58982,    74313,   93628,  117964, /* 32-39 */
   148626,   187257,   235929,   297252,   374514,   471859,  594505,  749029, /* 40-47 */
   943718,  1189010,  1498059,  1887436,  2378021,  2996119, 3774873, 4756042, /* 48-55 */
  5992238,  7549747,  9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
 38048341, 47937906, 60397977, 76096683, 95875813,120795955,                   /* 64-69 */
134217727,134217727,134217727,134217727,134217727,134217727,                   /* 70-75 */
134217727,134217727,134217727,134217727,134217727,134217727,                   /* 76-81 */
};
 171
 172 const uint8_t x264_exp2_lut[64] =
 173 {
 174       0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
 175      48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
 176     106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
 177     175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 178 };
 179
/* 128-entry fractional log2 table.  The values correspond to
 * log2(1 + i/128) to five decimals (entry 64 is log2(1.5) = 0.58496), so
 * the table covers the mantissa range [1,2) in steps of 1/128. */
const float x264_log2_lut[128] =
{
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
 199
 200 /* Avoid an int/float conversion. */
 201 const float x264_log2_lz_lut[32] =
 202 {
 203     31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 204 };
 205
 206 // should the intra and inter lambdas be different?
 207 // I'm just matching the behaviour of deadzone quant.
 208 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
 209 {
 210     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 211     {
 212                46,       58,       73,       92,      117,      147,
 213               185,      233,      294,      370,      466,      587,
 214               740,      932,     1174,     1480,     1864,     2349,
 215              2959,     3728,     4697,     5918,     7457,     9395,
 216             11837,    14914,    18790,    23674,    29828,    37581,
 217             47349,    59656,    75163,    94699,   119313,   150326,
 218            189399,   238627,   300652,   378798,   477255,   601304,
 219            757596,   954511,  1202608,  1515192,  1909022,  2405217,
 220           3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
 221          12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
 222          48486154, 61088726, 76966972, 96972308,
 223         122177453,134217727,134217727,134217727,134217727,134217727,
 224         134217727,134217727,134217727,134217727,134217727,134217727,
 225     },
 226     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 227     {
 228                27,       34,       43,       54,       68,       86,
 229               108,      136,      172,      216,      273,      343,
 230               433,      545,      687,      865,     1090,     1374,
 231              1731,     2180,     2747,     3461,     4361,     5494,
 232              6922,     8721,    10988,    13844,    17442,    21976,
 233             27688,    34885,    43953,    55377,    69771,    87906,
 234            110755,   139543,   175813,   221511,   279087,   351627,
 235            443023,   558174,   703255,   886046,  1116348,  1406511,
 236           1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
 237           7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
 238          28353495, 35723165, 45008368, 56706990,
 239          71446330, 90016736,113413980,134217727,134217727,134217727,
 240         134217727,134217727,134217727,134217727,134217727,134217727,
 241         134217727,134217727,134217727,134217727,134217727,134217727,
 242     }
 243 };
 244
/* Chroma lambda2 scale factors in 8.8 fixed point.  The values correspond to
 * 256 * 2^((i-12)/3): index 12 is the neutral factor 256 and the last entry
 * is clipped to 65535 so it fits in uint16_t.
 * x264_mb_analyse_init_qp() indexes the table with
 * min( qp - effective_chroma_qp + 12, MAX_CHROMA_LAMBDA_OFFSET ). */
#define MAX_CHROMA_LAMBDA_OFFSET 36
static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
 256
/* Fixed cost estimates for signalling macroblock and sub-macroblock types.
 * NOTE(review): the values look like approximate bit counts; how they are
 * scaled and which enum indexes each table is not visible here -- confirm
 * against the users of these tables. */
/* TODO: calculate CABAC costs */
/* One entry per macroblock type (X264_MBTYPE_MAX entries). */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
/* 17 entries; the first eight are zero.
 * NOTE(review): presumably indexed by a B 16x8/8x16 partition-pair code -- confirm. */
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
/* 13 entries.  NOTE(review): presumably indexed by B sub-partition type -- confirm. */
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
/* 4 entries.  NOTE(review): presumably indexed by P sub-partition type -- confirm. */
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};
 274
/* Forward declaration; being static, the definition has to appear elsewhere
 * in this translation unit. */
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

/* Reference-index costs, [qp][row][ref index].  Filled by init_costs() while
 * holding cost_ref_mutex; x264_mb_analyse_load_costs() picks the row with
 * clip( number of active references - 1, 0, 2 ). */
static uint16_t x264_cost_ref[QP_MAX+1][3][33];
/* Serializes the writes to x264_cost_ref in init_costs(). */
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* Backing storage for the intra 4x4 mode costs: init_costs() rounds the base
 * address up to a 64-byte boundary and then uses 32 entries per QP.  The
 * array is sized for QP_MAX+2 groups rather than QP_MAX+1, which leaves room
 * for that alignment shift. */
static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
 280
 281 static int init_costs( x264_t *h, float *logs, int qp )
 282 {
 283     int lambda = x264_lambda_tab[qp];
 284     if( h->cost_mv[qp] )
 285         return 0;
 286     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
 287     CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
 288     h->cost_mv[qp] += 2*4*2048;
 289     for( int i = 0; i <= 2*4*2048; i++ )
 290     {
 291         h->cost_mv[qp][-i] =
 292         h->cost_mv[qp][i]  = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
 293     }
 294     x264_pthread_mutex_lock( &cost_ref_mutex );
 295     for( int i = 0; i < 3; i++ )
 296         for( int j = 0; j < 33; j++ )
 297             x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
 298     x264_pthread_mutex_unlock( &cost_ref_mutex );
 299     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
 300     {
 301         for( int j = 0; j < 4; j++ )
 302         {
 303             CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
 304             h->cost_mv_fpel[qp][j] += 2*2048;
 305             for( int i = -2*2048; i < 2*2048; i++ )
 306                 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
 307         }
 308     }
 309     uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
 310     for( int i = 0; i < 17; i++ )
 311         cost_i4x4_mode[i] = 3*lambda*(i!=8);
 312     return 0;
 313 fail:
 314     return -1;
 315 }
 316
 317 int x264_analyse_init_costs( x264_t *h )
 318 {
 319     float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
 320     if( !logs )
 321         return -1;
 322
 323     logs[0] = 0.718f;
 324     for( int i = 1; i <= 2*4*2048; i++ )
 325         logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
 326
 327     for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
 328         if( init_costs( h, logs, qp ) )
 329             goto fail;
 330
 331     if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
 332         goto fail;
 333
 334     x264_free( logs );
 335     return 0;
 336 fail:
 337     x264_free( logs );
 338     return -1;
 339 }
 340
 341 void x264_analyse_free_costs( x264_t *h )
 342 {
 343     for( int i = 0; i < QP_MAX+1; i++ )
 344     {
 345         if( h->cost_mv[i] )
 346             x264_free( h->cost_mv[i] - 2*4*2048 );
 347         if( h->cost_mv_fpel[i][0] )
 348             for( int j = 0; j < 4; j++ )
 349                 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
 350     }
 351 }
 352
 353 void x264_analyse_weight_frame( x264_t *h, int end )
 354 {
 355     for( int j = 0; j < h->i_ref[0]; j++ )
 356     {
 357         if( h->sh.weight[j][0].weightfn )
 358         {
 359             x264_frame_t *frame = h->fref[0][j];
 360             int width = frame->i_width[0] + 2*PADH;
 361             int i_padv = PADV << PARAM_INTERLACED;
 362             int offset, height;
 363             pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
 364             height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
 365             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
 366             h->fenc->i_lines_weighted += height;
 367             if( height )
 368                 for( int k = j; k < h->i_ref[0]; k++ )
 369                     if( h->sh.weight[k][0].weightfn )
 370                     {
 371                         pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
 372                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
 373                                                  src + offset, frame->i_stride[0],
 374                                                  width, height, &h->sh.weight[k][0] );
 375                     }
 376             break;
 377         }
 378     }
 379 }
 380
 381 /* initialize an array of lambda*nbits for all possible mvs */
 382 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 383 {
 384     a->p_cost_mv = h->cost_mv[a->i_qp];
 385     a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
 386     a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 387 }
 388
 389 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
 390 {
 391     int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
 392     a->i_lambda = x264_lambda_tab[qp];
 393     a->i_lambda2 = x264_lambda2_tab[qp];
 394
 395     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
 396     if( h->param.analyse.i_trellis )
 397     {
 398         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
 399         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
 400         h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
 401         h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
 402     }
 403     h->mb.i_psy_rd_lambda = a->i_lambda;
 404     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
 405     int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
 406     h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
 407
 408     if( qp > QP_MAX_SPEC )
 409     {
 410         h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
 411         h->nr_residual_sum = h->nr_residual_sum_buf[1];
 412         h->nr_count = h->nr_count_buf[1];
 413         h->mb.b_noise_reduction = 1;
 414         qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
 415     }
 416     else
 417     {
 418         h->nr_offset = h->nr_offset_denoise;
 419         h->nr_residual_sum = h->nr_residual_sum_buf[0];
 420         h->nr_count = h->nr_count_buf[0];
 421         h->mb.b_noise_reduction = 0;
 422     }
 423
 424     a->i_qp = h->mb.i_qp = qp;
 425     h->mb.i_chroma_qp = h->chroma_qp_table[qp];
 426 }
 427
/* Prepare the analysis state `a` and the macroblock context in `h` for the
 * analysis of the current macroblock at quantizer `qp`.
 *
 * Steps, in order:
 *  - derive the RD level (a->i_mbrd) and related switches from subme;
 *  - load the QP-dependent values via x264_mb_analyse_init_qp();
 *  - reset the intra costs and decide whether PCM is a candidate;
 *  - for non-I slices: compute the allowed motion vector range, reset the
 *    inter costs, and decide on fast-intra / forced-intra handling. */
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    /* B slices are analysed as if subme were one level lower. */
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    /* Must run after i_mbrd is set: init_qp reads it to decide on trellis. */
    x264_mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    /* 0 when lossless, 2 with RD mode decision, otherwise 1 only if neither
     * trellis nor noise reduction is enabled. */
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        /* MV range in quarter-pel units. */
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        /* Horizontal limits: up to 24 pixels beyond the frame edge on either side. */
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        /* Full-pel search limits, pulled in by the safety border. */
        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        /* Vertical limits are recomputed only at the first macroblock of a row
         * (and, when interlaced, only on even rows). */
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                /* Wait until every reference frame has completed enough rows,
                 * and limit the downward range to what is actually available. */
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
                    }

                /* In deterministic mode the range is fixed rather than taken
                 * from the rows that happen to be completed. */
                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
                h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
                h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            /* Select the per-row limits matching this macroblock's field/frame mode. */
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
        }
#undef CLIP_FMV

        /* Reset the inter costs so any real result compares as better. */
        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            /* Intra counts as likely when a neighbour (or, in P slices, the
             * co-located macroblock of the first reference) is intra, or when
             * intra macroblocks already make up more than a third of the
             * macroblocks coded so far in this slice. */
            if( h->mb.i_subpel_refine > 2 &&
              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        /* Macroblocks inside the periodic-intra-refresh column range of a
         * P slice are forced to intra; the last column must also avoid
         * predicting from its top-right neighbour. */
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}
 612
/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
/* Intra 16x16 candidates; the row is selected by predict_16x16_mode_available(). */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};
 624
/* Chroma intra candidates per neighbour combination, each list terminated
 * by -1.  Rows in order: no neighbors, left, top, top/left,
 * top/left/topleft; selected by predict_chroma_mode_available(). */
static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};
 633
/* Intra 8x8 candidates, [avoid_topright][neighbour combination], each list
 * terminated by -1.  Rows in order: no neighbors, left, top, top/left,
 * top/left/topleft; selected by predict_8x8_mode_available().
 * The second table is used when the top-right pixels must not be read; it
 * offers no mode at all when only the top neighbour is available. */
static const int8_t i8x8_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    }
};
 651
/* Intra 4x4 candidates, [avoid_topright][neighbour combination], each list
 * terminated by -1.  Rows in order: no neighbors, left, top, top/left,
 * top/left/topleft; selected by predict_4x4_mode_available().
 * The second table leaves out DDL and VL, for blocks whose top-right pixels
 * must not be read. */
static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};
 669
 670 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
 671 {
 672     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 673     idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
 674     return i16x16_mode_available[idx];
 675 }
 676
 677 static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
 678 {
 679     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 680     idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
 681     return chroma_mode_available[idx];
 682 }
 683
 684 static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
 685 {
 686     int avoid_topright = force_intra && (i&1);
 687     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 688     idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
 689     return i8x8_mode_available[avoid_topright][idx];
 690 }
 691
 692 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
 693 {
 694     int avoid_topright = force_intra && ((i&5) == 5);
 695     int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
 696     idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
 697     return i4x4_mode_available[avoid_topright][idx];
 698 }
 699
 700 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 701 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 702 {
 703     ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
 704
 705     if( do_both_dct || h->mb.b_transform_8x8 )
 706         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
 707     if( do_both_dct || !h->mb.b_transform_8x8 )
 708         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 709 }
 710
 711 /* Reset fenc satd scores cache for psy RD */
 712 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 713 {
 714     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
 715         x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
 716     if( !h->mb.i_psy_rd )
 717         return;
 718     /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
 719     h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
 720     if( b_satd )
 721         h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 722 }
 723
 724 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 725 {
 726     if( a->i_satd_chroma < COST_MAX )
 727         return;
 728
 729     if( CHROMA444 )
 730     {
 731         if( !h->mb.b_chroma_me )
 732         {
 733             a->i_satd_chroma = 0;
 734             return;
 735         }
 736
 737         /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
 738         if( h->mb.b_lossless )
 739         {
 740             x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
 741             x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
 742         }
 743         else
 744         {
 745             h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
 746             h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
 747         }
 748         a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
 749                          + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 750         return;
 751     }
 752
 753     const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
 754     int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
 755
 756     /* Prediction selection for chroma */
 757     if( predict_mode[3] >= 0 && !h->mb.b_lossless )
 758     {
 759         int satdu[4], satdv[4];
 760         h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
 761         h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
 762         h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
 763         h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
 764         satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
 765         satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
 766
 767         for( ; *predict_mode >= 0; predict_mode++ )
 768         {
 769             int i_mode = *predict_mode;
 770             int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 771
 772             a->i_satd_chroma_dir[i_mode] = i_satd;
 773             COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
 774         }
 775     }
 776     else
 777     {
 778         for( ; *predict_mode >= 0; predict_mode++ )
 779         {
 780             int i_satd;
 781             int i_mode = *predict_mode;
 782
 783             /* we do the prediction */
 784             if( h->mb.b_lossless )
 785                 x264_predict_lossless_chroma( h, i_mode );
 786             else
 787             {
 788                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
 789                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
 790             }
 791
 792             /* we calculate the cost */
 793             i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
 794                      h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
 795                      a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
 796
 797             a->i_satd_chroma_dir[i_mode] = i_satd;
 798             COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
 799         }
 800     }
 801
 802     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 803 }
 804
 805 /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
     /* Estimate the luma cost of coding the current macroblock as I_16x16, I_8x8
      * and I_4x4 and pick the prediction mode(s) for each type.
      *
      * Results are written to a->i_satd_i16x16 / a->i_predict16x16,
      * a->i_satd_i8x8 / a->i_predict8x8[] and a->i_satd_i4x4 / a->i_predict4x4[].
      * A type whose block loop is abandoned early gets COST_MAX (8x8 and 4x4).
      *
      * i_satd_inter is the cost to compare against: it feeds the thresholds that
      * cut the analysis short.  The function may return before the 8x8 and/or 4x4
      * passes have run, leaving their outputs untouched.
      *
      * The 8x8 and 4x4 passes are only run when the matching X264_ANALYSE_I8x8 /
      * X264_ANALYSE_I4x4 bit is set in the flags of the current slice type, and
      * they encode blocks into fdec as they go, so h->mb.i_cbp_luma, the nnz
      * cache and the intra pred mode cache are modified. */
 806 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 807 {
 808     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
 809     pixel *p_src = h->mb.pic.p_fenc[0];
 810     pixel *p_dst = h->mb.pic.p_fdec[0];
         /* Reduced, -1 terminated lists of directional modes to try after DC/H/V.
          * Indexed as [a->b_avoid_topright][available-mode list has nine entries][favor_vertical]. */
 811     static const int8_t intra_analysis_shortcut[2][2][2][5] =
 812     {
 813         {{{I_PRED_4x4_HU, -1, -1, -1, -1},
 814           {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
 815          {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
 816           {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
 817         {{{I_PRED_4x4_HU, -1, -1, -1, -1},
 818           {-1, -1, -1, -1, -1}},
 819          {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
 820           {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
 821     };
 822
 823     int idx;
 824     int lambda = a->i_lambda;
 825
 826     /*---------------- Try all mode and calculate their score ---------------*/
 827     /* Disabled i16x16 for AVC-Intra compat */
 828     if( !h->param.i_avcintra_class )
 829     {
 830         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 831
 832         /* Not heavily tuned */
             /* NOTE(review): the table has 11 entries, so this assumes
              * h->mb.i_subpel_refine is in 0..10 -- confirm where it is set. */
 833         static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
             /* With fast intra, i16x16 must come within lut/2 times the inter cost;
              * otherwise no limit applies. */
 834         int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
 835
             /* All four modes available and not lossless: modes 0-2 are scored by a
              * single x3 call, the plane mode (3) is handled separately below. */
 836         if( !h->mb.b_lossless && predict_mode[3] >= 0 )
 837         {
 838             h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
 839             a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
 840             a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
 841             a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
 842             COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
 843             COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
 844             COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
 845
 846             /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
 847             if( a->i_satd_i16x16 <= i16x16_thresh )
 848             {
 849                 h->predict_16x16[I_PRED_16x16_P]( p_dst );
 850                 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
 851                 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
 852                 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
 853             }
 854         }
 855         else
 856         {
                 /* Generic path: predict and score each available mode in turn. */
 857             for( ; *predict_mode >= 0; predict_mode++ )
 858             {
 859                 int i_satd;
 860                 int i_mode = *predict_mode;
 861
 862                 if( h->mb.b_lossless )
 863                     x264_predict_lossless_16x16( h, 0, i_mode );
 864                 else
 865                     h->predict_16x16[i_mode]( p_dst );
 866
 867                 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
 868                          lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
 869                 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
 870                 a->i_satd_i16x16_dir[i_mode] = i_satd;
 871             }
 872         }
 873
 874         if( h->sh.i_type == SLICE_TYPE_B )
 875             /* cavlc mb type prefix */
 876             a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
 877
             /* i16x16 too far above the threshold: skip the 8x8 and 4x4 passes entirely. */
 878         if( a->i_satd_i16x16 > i16x16_thresh )
 879             return;
 880     }
 881
         /* Mode cost table for the current QP, taken from the 64-byte aligned base
          * of x264_cost_i4x4_mode.  It is passed to the x9 functions below offset
          * by -i_pred_mode. */
 882     uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
 883     /* 8x8 prediction selection */
 884     if( flags & X264_ANALYSE_I8x8 )
 885     {
 886         ALIGNED_ARRAY_32( pixel, edge,[36] );
             /* Use sa8d for 8x8 blocks when the configured comparison function is satd. */
 887         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
             /* The running cost is only checked against a limit when RD is off (i_mbrd == 0). */
 888         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
 889
 890         // FIXME some bias like in i4x4?
 891         int i_cost = lambda * 4; /* base predmode costs */
 892         h->mb.i_cbp_luma = 0;
 893
 894         if( h->sh.i_type == SLICE_TYPE_B )
 895             i_cost += lambda * i_mb_b_cost_table[I_8x8];
 896
             /* One iteration per 8x8 block.  The loop exits via break, either after
              * the last block (idx == 3) or once the running cost exceeds the limit. */
 897         for( idx = 0;; idx++ )
 898         {
 899             int x = idx&1;
 900             int y = idx>>1;
 901             pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
 902             pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 903             int i_best = COST_MAX;
 904             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 905
 906             const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
 907             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 908
                 /* x9 path: needs the optional function pointer and a nine-entry mode list. */
 909             if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
 910             {
 911                 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                     /* The return value carries the cost in its low 16 bits and the
                      * best mode in the bits above. */
 912                 i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
 913                 i_cost += i_best & 0xffff;
 914                 i_best >>= 16;
 915                 a->i_predict8x8[idx] = i_best;
 916                 if( idx == 3 || i_cost > i_satd_thresh )
 917                     break;
 918                 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
 919             }
 920             else
 921             {
                     /* At least six modes available and not lossless: score modes 0-2
                      * with one x3 call, then decide which directional modes remain. */
 922                 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
 923                 {
 924                     ALIGNED_ARRAY_16( int32_t, satd,[9] );
 925                     h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
 926                     int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                         /* Discount the predicted mode by 3*lambda. */
 927                     satd[i_pred_mode] -= 3 * lambda;
 928                     for( int i = 2; i >= 0; i-- )
 929                     {
 930                         int cost = satd[i];
 931                         a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
 932                         COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
 933                     }
 934
 935                     /* Take analysis shortcuts: don't analyse modes that are too
 936                      * far away direction-wise from the favored mode. */
 937                     if( a->i_mbrd < 1 + a->b_fast_intra )
 938                         predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
 939                     else
 940                         predict_mode += 3;
 941                 }
 942
                     /* Score the remaining candidates one by one.  Unless i_mbrd >= 2,
                      * stop as soon as the best cost has gone negative. */
 943                 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
 944                 {
 945                     int i_satd;
 946                     int i_mode = *predict_mode;
 947
 948                     if( h->mb.b_lossless )
 949                         x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
 950                     else
 951                         h->predict_8x8[i_mode]( p_dst_by, edge );
 952
 953                     i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
 954                     if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
 955                         i_satd -= 3 * lambda;
 956
 957                     COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
 958                     a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
 959                 }
 960                 i_cost += i_best + 3*lambda;
 961
 962                 if( idx == 3 || i_cost > i_satd_thresh )
 963                     break;
                     /* Redo the prediction with the winning mode before encoding the block. */
 964                 if( h->mb.b_lossless )
 965                     x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
 966                 else
 967                     h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
 968                 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
 969             }
 970             /* we need to encode this block now (for next ones) */
 971             x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
 972         }
 973
             /* idx == 3: every block was analysed, so i_cost is the full i8x8 cost. */
 974         if( idx == 3 )
 975         {
 976             a->i_satd_i8x8 = i_cost;
                 /* Keep a copy of the reconstruction, nnz and cbp (and, for
                  * i_skip_intra == 2, the DCT coefficients) in the i8x8_* buffers. */
 977             if( h->mb.i_skip_intra )
 978             {
 979                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
 980                 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
 981                 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
 982                 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
 983                 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
 984                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
 985                 if( h->mb.i_skip_intra == 2 )
 986                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
 987             }
 988         }
 989         else
 990         {
                 /* Abandoned after idx+1 blocks: scale the partial cost by roughly
                  * 4/(idx+1) (x/256 fixed point) for use in the threshold test below. */
 991             static const uint16_t cost_div_fix8[3] = {1024,512,341};
 992             a->i_satd_i8x8 = COST_MAX;
 993             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
 994         }
 995         /* Not heavily tuned */
             /* NOTE(review): 11 entries, so this also assumes i_subpel_refine is in 0..10. */
 996         static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
             /* Skip the 4x4 pass when both i8x8 and i16x16 are well above the inter cost. */
 997         if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
 998             return;
 999     }
1000
1001     /* 4x4 prediction selection */
1002     if( flags & X264_ANALYSE_I4x4 )
1003     {
1004         int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
1005         int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
1006         h->mb.i_cbp_luma = 0;
1007
             /* With RD enabled the limit is relaxed to 10/8 (9/8 with fast intra). */
1008         if( a->b_early_terminate && a->i_mbrd )
1009             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
1010
1011         if( h->sh.i_type == SLICE_TYPE_B )
1012             i_cost += lambda * i_mb_b_cost_table[I_4x4];
1013
             /* One iteration per 4x4 block.  The loop exits via break, either after
              * the last block (idx == 15) or once the running cost exceeds the limit. */
1014         for( idx = 0;; idx++ )
1015         {
1016             pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
1017             pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1018             int i_best = COST_MAX;
1019             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
1020
1021             const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1022
1023             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1024                 /* emulate missing topright samples */
1025                 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1026
                 /* x9 path: needs the optional function pointer and a nine-entry mode list. */
1027             if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
1028             {
1029                 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                     /* Cost in the low 16 bits of the return value, best mode above. */
1030                 i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
1031                 i_cost += i_best & 0xffff;
1032                 i_best >>= 16;
1033                 a->i_predict4x4[idx] = i_best;
1034                 if( i_cost > i_satd_thresh || idx == 15 )
1035                     break;
1036                 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
1037             }
1038             else
1039             {
                     /* At least six modes available and not lossless: score DC/H/V with
                      * one x3 call, then decide which directional modes remain. */
1040                 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
1041                 {
1042                     ALIGNED_ARRAY_16( int32_t, satd,[9] );
1043                     h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
1044                     int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                         /* Discount the predicted mode by 3*lambda. */
1045                     satd[i_pred_mode] -= 3 * lambda;
1046                     i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
1047                     COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
1048                     COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
1049
1050                     /* Take analysis shortcuts: don't analyse modes that are too
1051                      * far away direction-wise from the favored mode. */
1052                     if( a->i_mbrd < 1 + a->b_fast_intra )
1053                         predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
1054                     else
1055                         predict_mode += 3;
1056                 }
1057
                     /* No further candidates are tried once the best cost is already <= 0. */
1058                 if( i_best > 0 )
1059                 {
1060                     for( ; *predict_mode >= 0; predict_mode++ )
1061                     {
1062                         int i_satd;
1063                         int i_mode = *predict_mode;
1064
1065                         if( h->mb.b_lossless )
1066                             x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
1067                         else
1068                             h->predict_4x4[i_mode]( p_dst_by );
1069
1070                         i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
1071                         if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
1072                         {
1073                             i_satd -= lambda * 3;
                                 /* The predicted mode reached a cost <= 0: take it
                                  * and stop searching. */
1074                             if( i_satd <= 0 )
1075                             {
1076                                 i_best = i_satd;
1077                                 a->i_predict4x4[idx] = i_mode;
1078                                 break;
1079                             }
1080                         }
1081
1082                         COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
1083                     }
1084                 }
1085
1086                 i_cost += i_best + 3 * lambda;
1087                 if( i_cost > i_satd_thresh || idx == 15 )
1088                     break;
                     /* Redo the prediction with the winning mode before encoding the block. */
1089                 if( h->mb.b_lossless )
1090                     x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
1091                 else
1092                     h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
1093                 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1094             }
1095             /* we need to encode this block now (for next ones) */
1096             x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
1097         }
             /* idx == 15: every block was analysed, so i_cost is the full i4x4 cost. */
1098         if( idx == 15 )
1099         {
1100             a->i_satd_i4x4 = i_cost;
                 /* Keep a copy of the reconstruction, nnz and cbp (and, for
                  * i_skip_intra == 2, the DCT coefficients) in the i4x4_* buffers. */
1101             if( h->mb.i_skip_intra )
1102             {
1103                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
1104                 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1105                 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1106                 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1107                 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1108                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
1109                 if( h->mb.i_skip_intra == 2 )
1110                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
1111             }
1112         }
1113         else
1114             a->i_satd_i4x4 = COST_MAX;
1115     }
1116 }
1117
1118 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1119 {
1120     if( !a->b_early_terminate )
1121         i_satd_thresh = COST_MAX;
1122
1123     if( a->i_satd_i16x16 < i_satd_thresh )
1124     {
1125         h->mb.i_type = I_16x16;
1126         x264_analyse_update_cache( h, a );
1127         a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1128     }
1129     else
1130         a->i_satd_i16x16 = COST_MAX;
1131
1132     if( a->i_satd_i4x4 < i_satd_thresh )
1133     {
1134         h->mb.i_type = I_4x4;
1135         x264_analyse_update_cache( h, a );
1136         a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
1137     }
1138     else
1139         a->i_satd_i4x4 = COST_MAX;
1140
1141     if( a->i_satd_i8x8 < i_satd_thresh )
1142     {
1143         h->mb.i_type = I_8x8;
1144         x264_analyse_update_cache( h, a );
1145         a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1146         a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1147     }
1148     else
1149         a->i_satd_i8x8 = COST_MAX;
1150 }
1151
/* Refine the intra prediction modes of the current macroblock using RD cost.
 *
 * Depending on h->mb.i_type this re-evaluates the 16x16 luma mode, the 4x4
 * modes or the 8x8 modes, and (unless CHROMA444) the chroma mode.  The chosen
 * modes are written back to a->i_predict16x16, a->i_predict4x4[],
 * a->i_predict8x8[] and a->i_predict8x8chroma / h->mb.i_chroma_pred_mode.
 * h->mb.i_skip_intra is cleared on entry.
 * With a->b_early_terminate set, candidates whose stored SATD cost is too far
 * above that of the current mode are skipped (16x16, chroma and 8x8 only). */
1152 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1153 {
1154     uint64_t i_satd, i_best;
         /* For 4:4:4 the per-block save/restore below covers all three planes. */
1155     int plane_count = CHROMA444 ? 3 : 1;
1156     h->mb.i_skip_intra = 0;
1157
1158     if( h->mb.i_type == I_16x16 )
1159     {
1160         int old_pred_mode = a->i_predict16x16;
1161         const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
             /* Only retry modes whose stored cost is within 9/8 of the current mode's. */
1162         int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
1163         i_best = a->i_satd_i16x16;
1164         for( ; *predict_mode >= 0; predict_mode++ )
1165         {
1166             int i_mode = *predict_mode;
1167             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1168                 continue;
1169             h->mb.i_intra16x16_pred_mode = i_mode;
1170             i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1171             COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1172         }
1173     }
1174
1175     /* RD selection for chroma prediction */
1176     if( !CHROMA444 )
1177     {
1178         const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
             /* Nothing to choose between when only one mode is available. */
1179         if( predict_mode[1] >= 0 )
1180         {
                 /* Candidate list; despite the name it is filled in table order,
                  * no sorting is performed here. */
1181             int8_t predict_mode_sorted[4];
1182             int i_max;
1183             int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
1184
                 /* Collect every mode other than the current one whose stored cost
                  * is below the threshold. */
1185             for( i_max = 0; *predict_mode >= 0; predict_mode++ )
1186             {
1187                 int i_mode = *predict_mode;
1188                 if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
1189                     predict_mode_sorted[i_max++] = i_mode;
1190             }
1191
1192             if( i_max > 0 )
1193             {
1194                 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1195                 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1196                 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1197                  * coefs for the current chroma mode are still around, so we only
1198                  * have to recount the bits. */
1199                 i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1200                 for( int i = 0; i < i_max; i++ )
1201                 {
1202                     int i_mode = predict_mode_sorted[i];
1203                     if( h->mb.b_lossless )
1204                         x264_predict_lossless_chroma( h, i_mode );
1205                     else
1206                     {
1207                         h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
1208                         h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
1209                     }
1210                     /* if we've already found a mode that needs no residual, then
1211                      * probably any mode with a residual will be worse.
1212                      * so avoid dct on the remaining modes to improve speed. */
1213                     i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1214                     COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1215                 }
1216                 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1217                 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1218             }
1219         }
1220     }
1221
1222     if( h->mb.i_type == I_4x4 )
1223     {
             /* Snapshot of the best mode's 4x4 pixels (four rows per plane) and nnz. */
1224         pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
1225         int nnz[3] = {0};
1226         for( int idx = 0; idx < 16; idx++ )
1227         {
1228             pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
1229                              h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
1230                              h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
1231             i_best = COST_MAX64;
1232
1233             const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1234
1235             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1236                 for( int p = 0; p < plane_count; p++ )
1237                     /* emulate missing topright samples */
1238                     MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
1239
                 /* Every available mode is tried; there is no SATD-based pruning here. */
1240             for( ; *predict_mode >= 0; predict_mode++ )
1241             {
1242                 int i_mode = *predict_mode;
1243                 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1244
1245                 if( i_best > i_satd )
1246                 {
1247                     a->i_predict4x4[idx] = i_mode;
1248                     i_best = i_satd;
                         /* Remember this trial's block contents and nnz; presumably a
                          * later x264_rd_cost_i4x4 call overwrites them -- confirm. */
1249                     for( int p = 0; p < plane_count; p++ )
1250                     {
1251                         pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
1252                         pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
1253                         pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
1254                         pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
1255                         nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
1256                     }
1257                 }
1258             }
1259
                 /* Put the best mode's pixels and nnz back in place for the following blocks. */
1260             for( int p = 0; p < plane_count; p++ )
1261             {
1262                 MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
1263                 MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
1264                 MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
1265                 MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
1266                 h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
1267             }
1268
1269             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1270         }
1271     }
1272     else if( h->mb.i_type == I_8x8 )
1273     {
1274         ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
             /* Snapshot of the best mode's bottom row (pels_h), the upper seven pixels
              * of its rightmost column (pels_v) and its nnz, per plane. */
1275         pixel4 pels_h[3][2] = {{0}};
1276         pixel pels_v[3][7] = {{0}};
1277         uint16_t nnz[3][2] = {{0}}; //shut up gcc
1278         for( int idx = 0; idx < 4; idx++ )
1279         {
1280             int x = idx&1;
1281             int y = idx>>1;
1282             int s8 = X264_SCAN8_0 + 2*x + 16*y;
1283             pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
1284                              h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
1285                              h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
1286             int cbp_luma_new = 0;
                 /* Only retry modes whose stored cost is within 11/8 of the current mode's. */
1287             int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
1288
1289             i_best = COST_MAX64;
1290
1291             const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
1292             for( int p = 0; p < plane_count; p++ )
1293                 h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1294
1295             for( ; *predict_mode >= 0; predict_mode++ )
1296             {
1297                 int i_mode = *predict_mode;
1298                 if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
1299                     continue;
1300
                     /* Start every trial from the same luma cbp. */
1301                 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1302                 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
1303
1304                 if( i_best > i_satd )
1305                 {
1306                     a->i_predict8x8[idx] = i_mode;
1307                     cbp_luma_new = h->mb.i_cbp_luma;
1308                     i_best = i_satd;
1309
                         /* Only the bottom row and, for the left-hand blocks (even idx),
                          * the rightmost column are saved -- presumably the only pixels
                          * later blocks predict from; confirm. */
1310                     for( int p = 0; p < plane_count; p++ )
1311                     {
1312                         pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
1313                         pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
1314                         if( !(idx&1) )
1315                             for( int j = 0; j < 7; j++ )
1316                                 pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
1317                         nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
1318                         nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
1319                     }
1320                 }
1321             }
                 /* Commit the best mode's cbp, then restore its saved pixels and nnz. */
1322             a->i_cbp_i8x8_luma = cbp_luma_new;
1323             for( int p = 0; p < plane_count; p++ )
1324             {
1325                 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
1326                 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
1327                 if( !(idx&1) )
1328                     for( int j = 0; j < 7; j++ )
1329                         dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
1330                 M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
1331                 M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
1332             }
1333
1334             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1335         }
1336     }
1337 }
1338
/* Initialise the source-side fields of the motion-estimation context m:
 * the MV cost table (taken from a), the three plane strides (taken from h)
 * and the p_fenc[] pointers into src at offset (xoff,yoff).
 * The offsets are given in luma pixels; for planes 1 and 2 they are shifted
 * right by CHROMA_H_SHIFT / CHROMA_V_SHIFT.
 * Expands to a braced block and relies on `h` and `a` being in scope at the
 * expansion site. */
1339 #define LOAD_FENC(m, src, xoff, yoff) \
1340 { \
1341     (m)->p_cost_mv = a->p_cost_mv; \
1342     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1343     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1344     (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
1345     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1346     (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1347     (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1348 }
1349
/* Initialise the reference-side fields of the motion-estimation context m
 * from the plane array src, at offset (xoff,yoff):
 *  - p_fref[0..3] use the stride in m->i_stride[0]; p_fref_w is set to the
 *    same address as p_fref[0].
 *  - with CHROMA444, p_fref[4..7] and p_fref[8..11] are set the same way
 *    using m->i_stride[1] and m->i_stride[2];
 *    otherwise only p_fref[4] is set, with yoff shifted by CHROMA_V_SHIFT
 *    and xoff used unshifted.
 *  - integral points into h->mb.pic.p_integral[list][ref], weight is reset
 *    to x264_weight_none and i_ref is set to ref.
 * The macro reads m->i_stride[], so the strides must already be set (LOAD_FENC
 * does that).  Expands to a braced block and relies on `h` being in scope. */
1350 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1351 { \
1352     (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1353     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1354     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1355     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1356     if( CHROMA444 ) \
1357     { \
1358         (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
1359         (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
1360         (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
1361         (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
1362         (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
1363         (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
1364         (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
1365         (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
1366     } \
1367     else \
1368         (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
1369     (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1370     (m)->weight = x264_weight_none; \
1371     (m)->i_ref = ref; \
1372 }
1373
/* Point an x264_me_t at the weighted reference plane `src` at offset
 * (xoff,yoff) and select the weight table entry for reference `ref`.
 * The weight is taken from the macro's own `ref` argument rather than from a
 * caller-local variable that happens to be called i_ref, and the body is
 * braced like LOAD_FENC / LOAD_HPELS so it behaves as a single statement.
 * `list` is accepted for symmetry with LOAD_HPELS but is not used.
 * Requires `h` in scope and (m)->i_stride[0] already filled in. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[ref]; \
}
1377
/* Cost of signalling reference `ref` of list `list`, read from the analysis
 * context `a` that must be in scope at the point of use. */
#define REF_COST(list, ref) (a->p_cost_ref[list][ref])
1380
1381 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1382 {
1383     x264_me_t m;
1384     int i_mvc;
1385     ALIGNED_4( int16_t mvc[8][2] );
1386     int i_halfpel_thresh = INT_MAX;
1387     int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1388
1389     /* 16x16 Search on all ref frame */
1390     m.i_pixel = PIXEL_16x16;
1391     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1392
1393     a->l0.me16x16.cost = INT_MAX;
1394     for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1395     {
1396         m.i_ref_cost = REF_COST( 0, i_ref );
1397         i_halfpel_thresh -= m.i_ref_cost;
1398
1399         /* search with ref */
1400         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1401         LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1402
1403         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1404
1405         if( h->mb.ref_blind_dupe == i_ref )
1406         {
1407             CP32( m.mv, a->l0.mvc[0][0] );
1408             x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1409         }
1410         else
1411         {
1412             x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1413             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1414         }
1415
1416         /* save mv for predicting neighbors */
1417         CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1418         CP32( a->l0.mvc[i_ref][0], m.mv );
1419
1420         /* early termination
1421          * SSD threshold would probably be better than SATD */
1422         if( i_ref == 0
1423             && a->b_try_skip
1424             && m.cost-m.cost_mv < 300*a->i_lambda
1425             &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1426               + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1427             && x264_macroblock_probe_pskip( h ) )
1428         {
1429             h->mb.i_type = P_SKIP;
1430             x264_analyse_update_cache( h, a );
1431             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1432             return;
1433         }
1434
1435         m.cost += m.i_ref_cost;
1436         i_halfpel_thresh += m.i_ref_cost;
1437
1438         if( m.cost < a->l0.me16x16.cost )
1439             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1440     }
1441
1442     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1443     assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1444
1445     h->mb.i_type = P_L0;
1446     if( a->i_mbrd )
1447     {
1448         x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1449         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1450         {
1451             h->mb.i_partition = D_16x16;
1452             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1453             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1454             if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1455                 h->mb.i_type = P_SKIP;
1456         }
1457     }
1458 }
1459
1460 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1461 {
1462     x264_me_t m;
1463     pixel **p_fenc = h->mb.pic.p_fenc;
1464     int i_maxref = h->mb.pic.i_fref[0]-1;
1465
1466     h->mb.i_partition = D_8x8;
1467
1468     #define CHECK_NEIGHBOUR(i)\
1469     {\
1470         int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1471         if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1472             i_maxref = ref;\
1473     }
1474
1475     /* early termination: if 16x16 chose ref 0, then evalute no refs older
1476      * than those used by the neighbors */
1477     if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1478         h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
1479     {
1480         i_maxref = 0;
1481         CHECK_NEIGHBOUR(  -8 - 1 );
1482         CHECK_NEIGHBOUR(  -8 + 0 );
1483         CHECK_NEIGHBOUR(  -8 + 2 );
1484         CHECK_NEIGHBOUR(  -8 + 4 );
1485         CHECK_NEIGHBOUR(   0 - 1 );
1486         CHECK_NEIGHBOUR( 2*8 - 1 );
1487     }
1488     #undef CHECK_NEIGHBOUR
1489
1490     for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1491         CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1492
1493     for( int i = 0; i < 4; i++ )
1494     {
1495         x264_me_t *l0m = &a->l0.me8x8[i];
1496         int x8 = i&1;
1497         int y8 = i>>1;
1498
1499         m.i_pixel = PIXEL_8x8;
1500
1501         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1502         l0m->cost = INT_MAX;
1503         for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1504         {
1505             m.i_ref_cost = REF_COST( 0, i_ref );
1506
1507             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1508             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1509
1510             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1511             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1512             if( h->mb.ref_blind_dupe == i_ref )
1513             {
1514                 CP32( m.mv, a->l0.mvc[0][i+1] );
1515                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1516             }
1517             else
1518                 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1519
1520             m.cost += m.i_ref_cost;
1521
1522             CP32( a->l0.mvc[i_ref][i+1], m.mv );
1523
1524             if( m.cost < l0m->cost )
1525                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1526             if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1527                 i_ref = h->mb.ref_blind_dupe;
1528             else
1529                 i_ref++;
1530         }
1531         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1532         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1533
1534         a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1535
1536         /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1537            are effectively zero. */
1538         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1539             l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1540     }
1541
1542     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1543                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1544     /* P_8x8 ref0 has no ref cost */
1545     if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1546                                a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1547         a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1548     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1549     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1550 }
1551
1552 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1553 {
1554     /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1555      * reference frame flags.  Thus, if we're not doing mixedrefs, just
1556      * don't bother analysing the dupes. */
1557     const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1558     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1559     pixel **p_fenc = h->mb.pic.p_fenc;
1560     int i_mvc;
1561     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1562
1563     /* XXX Needed for x264_mb_predict_mv */
1564     h->mb.i_partition = D_8x8;
1565
1566     i_mvc = 1;
1567     CP32( mvc[0], a->l0.me16x16.mv );
1568
1569     for( int i = 0; i < 4; i++ )
1570     {
1571         x264_me_t *m = &a->l0.me8x8[i];
1572         int x8 = i&1;
1573         int y8 = i>>1;
1574
1575         m->i_pixel = PIXEL_8x8;
1576         m->i_ref_cost = i_ref_cost;
1577
1578         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1579         LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1580         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1581
1582         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1583         x264_me_search( h, m, mvc, i_mvc );
1584
1585         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1586
1587         CP32( mvc[i_mvc], m->mv );
1588         i_mvc++;
1589
1590         a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1591
1592         /* mb type cost */
1593         m->cost += i_ref_cost;
1594         if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1595             m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1596     }
1597
1598     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1599                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1600     /* theoretically this should include 4*ref_cost,
1601      * but 3 seems a better approximation of cabac. */
1602     if( h->param.b_cabac )
1603         a->l0.i_cost8x8 -= i_ref_cost;
1604     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1605     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1606 }
1607
1608 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1609 {
1610     x264_me_t m;
1611     pixel **p_fenc = h->mb.pic.p_fenc;
1612     ALIGNED_4( int16_t mvc[3][2] );
1613
1614     /* XXX Needed for x264_mb_predict_mv */
1615     h->mb.i_partition = D_16x8;
1616
1617     for( int i = 0; i < 2; i++ )
1618     {
1619         x264_me_t *l0m = &a->l0.me16x8[i];
1620         const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1621         const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1622         const int ref8[2] = { minref, maxref };
1623         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1624
1625         m.i_pixel = PIXEL_16x8;
1626
1627         LOAD_FENC( &m, p_fenc, 0, 8*i );
1628         l0m->cost = INT_MAX;
1629         for( int j = 0; j < i_ref8s; j++ )
1630         {
1631             const int i_ref = ref8[j];
1632             m.i_ref_cost = REF_COST( 0, i_ref );
1633
1634             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1635             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1636             CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1637             CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1638
1639             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1640             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1641
1642             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1643             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1644             /* We can only take this shortcut if the first search was performed on ref0. */
1645             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1646             {
1647                 /* We can just leave the MV from the previous ref search. */
1648                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1649             }
1650             else
1651                 x264_me_search( h, &m, mvc, 3 );
1652
1653             m.cost += m.i_ref_cost;
1654
1655             if( m.cost < l0m->cost )
1656                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1657         }
1658
1659         /* Early termination based on the current SATD score of partition[0]
1660            plus the estimated SATD score of partition[1] */
1661         if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1662         {
1663             a->l0.i_cost16x8 = COST_MAX;
1664             return;
1665         }
1666
1667         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1668         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1669     }
1670
1671     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1672 }
1673
1674 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1675 {
1676     x264_me_t m;
1677     pixel **p_fenc = h->mb.pic.p_fenc;
1678     ALIGNED_4( int16_t mvc[3][2] );
1679
1680     /* XXX Needed for x264_mb_predict_mv */
1681     h->mb.i_partition = D_8x16;
1682
1683     for( int i = 0; i < 2; i++ )
1684     {
1685         x264_me_t *l0m = &a->l0.me8x16[i];
1686         const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1687         const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1688         const int ref8[2] = { minref, maxref };
1689         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1690
1691         m.i_pixel = PIXEL_8x16;
1692
1693         LOAD_FENC( &m, p_fenc, 8*i, 0 );
1694         l0m->cost = INT_MAX;
1695         for( int j = 0; j < i_ref8s; j++ )
1696         {
1697             const int i_ref = ref8[j];
1698             m.i_ref_cost = REF_COST( 0, i_ref );
1699
1700             CP32( mvc[0], a->l0.mvc[i_ref][0] );
1701             CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1702             CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1703
1704             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1705             LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1706
1707             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1708             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1709             /* We can only take this shortcut if the first search was performed on ref0. */
1710             if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1711             {
1712                 /* We can just leave the MV from the previous ref search. */
1713                 x264_me_refine_qpel_refdupe( h, &m, NULL );
1714             }
1715             else
1716                 x264_me_search( h, &m, mvc, 3 );
1717
1718             m.cost += m.i_ref_cost;
1719
1720             if( m.cost < l0m->cost )
1721                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1722         }
1723
1724         /* Early termination based on the current SATD score of partition[0]
1725            plus the estimated SATD score of partition[1] */
1726         if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1727         {
1728             a->l0.i_cost8x16 = COST_MAX;
1729             return;
1730         }
1731
1732         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1733         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1734     }
1735
1736     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1737 }
1738
1739 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
1740                                                                      pixel **p_fref, int i8x8, int size, int chroma )
1741 {
1742     ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1743     pixel *pix2 = pix1+8;
1744     int i_stride = h->mb.pic.i_stride[1];
1745     int chroma_h_shift = chroma <= CHROMA_422;
1746     int chroma_v_shift = chroma == CHROMA_420;
1747     int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
1748     int i_ref = a->l0.me8x8[i8x8].i_ref;
1749     int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1750     x264_weight_t *weight = h->sh.weight[i_ref];
1751
1752     // FIXME weight can be done on 4x4 blocks even if mc is smaller
1753 #define CHROMA4x4MC( width, height, me, x, y ) \
1754     if( chroma == CHROMA_444 ) \
1755     { \
1756         int mvx = (me).mv[0] + 4*2*x; \
1757         int mvy = (me).mv[1] + 4*2*y; \
1758         h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
1759                        mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
1760         h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
1761                        mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
1762     } \
1763     else \
1764     { \
1765         int offset = x + (2>>chroma_v_shift)*16*y; \
1766         int chroma_height = (2>>chroma_v_shift)*height; \
1767         h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
1768                          (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
1769         if( weight[1].weightfn ) \
1770             weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
1771         if( weight[2].weightfn ) \
1772             weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
1773     }
1774
1775     if( size == PIXEL_4x4 )
1776     {
1777         x264_me_t *m = a->l0.me4x4[i8x8];
1778         CHROMA4x4MC( 2,2, m[0], 0,0 );
1779         CHROMA4x4MC( 2,2, m[1], 2,0 );
1780         CHROMA4x4MC( 2,2, m[2], 0,2 );
1781         CHROMA4x4MC( 2,2, m[3], 2,2 );
1782     }
1783     else if( size == PIXEL_8x4 )
1784     {
1785         x264_me_t *m = a->l0.me8x4[i8x8];
1786         CHROMA4x4MC( 4,2, m[0], 0,0 );
1787         CHROMA4x4MC( 4,2, m[1], 0,2 );
1788     }
1789     else
1790     {
1791         x264_me_t *m = a->l0.me4x8[i8x8];
1792         CHROMA4x4MC( 2,4, m[0], 0,0 );
1793         CHROMA4x4MC( 2,4, m[1], 2,0 );
1794     }
1795 #undef CHROMA4x4MC
1796
1797     int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
1798     int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
1799     return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1800          + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1801 }
1802
1803 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1804 {
1805     if( CHROMA_FORMAT == CHROMA_444 )
1806         return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
1807     else if( CHROMA_FORMAT == CHROMA_422 )
1808         return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
1809     else
1810         return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
1811 }
1812
1813 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1814 {
1815     pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1816     pixel **p_fenc = h->mb.pic.p_fenc;
1817     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1818
1819     /* XXX Needed for x264_mb_predict_mv */
1820     h->mb.i_partition = D_8x8;
1821
1822     for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1823     {
1824         const int idx = 4*i8x8 + i4x4;
1825         const int x4 = block_idx_x[idx];
1826         const int y4 = block_idx_y[idx];
1827         const int i_mvc = (i4x4 == 0);
1828
1829         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1830
1831         m->i_pixel = PIXEL_4x4;
1832
1833         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1834         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1835         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1836
1837         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1838         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1839
1840         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1841     }
1842     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1843                             a->l0.me4x4[i8x8][1].cost +
1844                             a->l0.me4x4[i8x8][2].cost +
1845                             a->l0.me4x4[i8x8][3].cost +
1846                             REF_COST( 0, i_ref ) +
1847                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1848     if( h->mb.b_chroma_me )
1849         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1850 }
1851
1852 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1853 {
1854     pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1855     pixel **p_fenc = h->mb.pic.p_fenc;
1856     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1857
1858     /* XXX Needed for x264_mb_predict_mv */
1859     h->mb.i_partition = D_8x8;
1860
1861     for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1862     {
1863         const int idx = 4*i8x8 + 2*i8x4;
1864         const int x4 = block_idx_x[idx];
1865         const int y4 = block_idx_y[idx];
1866         const int i_mvc = (i8x4 == 0);
1867
1868         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1869
1870         m->i_pixel = PIXEL_8x4;
1871
1872         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1873         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1874         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1875
1876         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1877         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1878
1879         x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1880     }
1881     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1882                             REF_COST( 0, i_ref ) +
1883                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1884     if( h->mb.b_chroma_me )
1885         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1886 }
1887
1888 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1889 {
1890     pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1891     pixel **p_fenc = h->mb.pic.p_fenc;
1892     const int i_ref = a->l0.me8x8[i8x8].i_ref;
1893
1894     /* XXX Needed for x264_mb_predict_mv */
1895     h->mb.i_partition = D_8x8;
1896
1897     for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1898     {
1899         const int idx = 4*i8x8 + i4x8;
1900         const int x4 = block_idx_x[idx];
1901         const int y4 = block_idx_y[idx];
1902         const int i_mvc = (i4x8 == 0);
1903
1904         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1905
1906         m->i_pixel = PIXEL_4x8;
1907
1908         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1909         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1910         LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1911
1912         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1913         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1914
1915         x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1916     }
1917     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1918                             REF_COST( 0, i_ref ) +
1919                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1920     if( h->mb.b_chroma_me )
1921         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1922 }
1923
1924 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1925 {
1926     ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
1927     ALIGNED_ARRAY_N( pixel,  bi, [2],[16*16] );
1928     int i_chroma_cost = 0;
1929     int chromapix = h->luma2chroma_pixel[i_pixel];
1930
1931 #define COST_BI_CHROMA( m0, m1, width, height ) \
1932 { \
1933     if( CHROMA444 ) \
1934     { \
1935         h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
1936                        m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1937         h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
1938                        m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1939         h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
1940                        m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1941         h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
1942                        m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1943     } \
1944     else \
1945     { \
1946         int v_shift = CHROMA_V_SHIFT; \
1947         int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1948         int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1949         h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
1950                          m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1951         h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
1952                          m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1953     } \
1954     h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1955     h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1956     i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
1957                   + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
1958 }
1959
1960     if( i_pixel == PIXEL_16x16 )
1961         COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
1962     else if( i_pixel == PIXEL_16x8 )
1963         COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
1964     else if( i_pixel == PIXEL_8x16 )
1965         COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
1966     else
1967         COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
1968
1969     return i_chroma_cost;
1970 }
1971
1972 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1973 {
1974     /* Assumes that fdec still contains the results of
1975      * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1976
1977     pixel *p_fenc = h->mb.pic.p_fenc[0];
1978     pixel *p_fdec = h->mb.pic.p_fdec[0];
1979
1980     a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1981     if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1982     {
1983         int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
1984
1985         for( int i = 0; i < 4; i++ )
1986         {
1987             const int x = (i&1)*8;
1988             const int y = (i>>1)*8;
1989             a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1990                                                               &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1991             if( h->mb.b_chroma_me )
1992             {
1993                 int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
1994                 int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
1995                 a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
1996                                                                    &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
1997                                        + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
1998                                                                    &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
1999             }
2000             a->i_cost16x16direct += a->i_cost8x8direct[i];
2001
2002             /* mb type cost */
2003             a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
2004         }
2005     }
2006     else
2007     {
2008         a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
2009         if( h->mb.b_chroma_me )
2010         {
2011             int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2012             a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
2013                                  +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
2014         }
2015     }
2016 }
2017
2018 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
2019 {
2020     ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
2021     ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
2022     pixel *src0, *src1;
2023     intptr_t stride0 = 16, stride1 = 16;
2024     int i_ref, i_mvc;
2025     ALIGNED_4( int16_t mvc[9][2] );
2026     int try_skip = a->b_try_skip;
2027     int list1_skipped = 0;
2028     int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
2029     int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
2030                                 (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
2031
2032     x264_me_t m;
2033     m.i_pixel = PIXEL_16x16;
2034
2035     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
2036
2037     /* 16x16 Search on list 0 and list 1 */
2038     a->l0.me16x16.cost = INT_MAX;
2039     a->l1.me16x16.cost = INT_MAX;
2040     for( int l = 1; l >= 0; )
2041     {
2042         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2043
2044         /* This loop is extremely munged in order to facilitate the following order of operations,
2045          * necessary for an efficient fast skip.
2046          * 1.  Search list1 ref0.
2047          * 2.  Search list0 ref0.
2048          * 3.  Try skip.
2049          * 4.  Search the rest of list0.
2050          * 5.  Go back and finish list1.
2051          */
2052         for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
2053         {
2054             if( try_skip && l == 1 && i_ref > 0 )
2055             {
2056                 list1_skipped = 1;
2057                 break;
2058             }
2059
2060             m.i_ref_cost = REF_COST( l, i_ref );
2061
2062             /* search with ref */
2063             LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
2064             x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
2065             x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
2066             x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
2067
2068             /* add ref cost */
2069             m.cost += m.i_ref_cost;
2070
2071             if( m.cost < lX->me16x16.cost )
2072                 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
2073
2074             /* save mv for predicting neighbors */
2075             CP32( lX->mvc[i_ref][0], m.mv );
2076             CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
2077
2078             /* Fast skip detection. */
2079             if( i_ref == 0 && try_skip )
2080             {
2081                 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
2082                     abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
2083                 {
2084                     try_skip = 0;
2085                 }
2086                 else if( !l )
2087                 {
2088                     /* We already tested skip */
2089                     h->mb.i_type = B_SKIP;
2090                     x264_analyse_update_cache( h, a );
2091                     return;
2092                 }
2093             }
2094         }
2095         if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
2096             break;
2097         if( list1_skipped && l == 0 )
2098             l = 1;
2099         else
2100             l--;
2101     }
2102
2103     /* get cost of BI mode */
2104     h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
2105     h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
2106     int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
2107     src0 = h->mc.get_ref( pix0, &stride0,
2108                           h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
2109                           a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
2110     src1 = h->mc.get_ref( pix1, &stride1,
2111                           h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
2112                           a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
2113
2114     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2115
2116     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2117                      + ref_costs
2118                      + a->l0.bi16x16.cost_mv
2119                      + a->l1.bi16x16.cost_mv;
2120
2121     if( h->mb.b_chroma_me )
2122         a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
2123
2124     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
2125     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
2126     {
2127         int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
2128                        + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
2129         int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
2130                        + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
2131         h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2132                                 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2133                                 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2134         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2135                    + ref_costs + l0_mv_cost + l1_mv_cost;
2136
2137         if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
2138         {
2139             ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
2140
2141             if( CHROMA444 )
2142             {
2143                 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2144                                         h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2145                                         h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2146                 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
2147                 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2148                                         h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2149                                         h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2150                 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
2151             }
2152             else
2153             {
2154                 ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
2155                 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2156                 int v_shift = CHROMA_V_SHIFT;
2157
2158                 if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
2159                 {
2160                     int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2161                     h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2162                                      h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
2163                 }
2164                 else
2165                     h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2166                                                          h->mb.pic.i_stride[1], 16>>v_shift );
2167
2168                 if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
2169                 {
2170                     int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2171                     h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2172                                      h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
2173                 }
2174                 else
2175                     h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2176                                                          h->mb.pic.i_stride[1], 16>>v_shift );
2177
2178                 h->mc.avg[chromapix]( bi,   FENC_STRIDE, pixuv[0],   FENC_STRIDE, pixuv[1],   FENC_STRIDE,
2179                                       h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2180                 h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
2181                                       h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2182
2183                 cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi,   FENC_STRIDE )
2184                        +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
2185             }
2186         }
2187
2188         if( cost00 < a->i_cost16x16bi )
2189         {
2190             M32( a->l0.bi16x16.mv ) = 0;
2191             M32( a->l1.bi16x16.mv ) = 0;
2192             a->l0.bi16x16.cost_mv = l0_mv_cost;
2193             a->l1.bi16x16.cost_mv = l1_mv_cost;
2194             a->i_cost16x16bi = cost00;
2195         }
2196     }
2197
2198     /* mb type cost */
2199     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
2200     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
2201     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
2202 }
2203
2204 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2205 {
2206     int x = 2*(i&1);
2207     int y = i&2;
2208
2209     switch( h->mb.i_sub_partition[i] )
2210     {
2211         case D_L0_8x8:
2212             x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2213             break;
2214         case D_L0_8x4:
2215             x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2216             x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2217             break;
2218         case D_L0_4x8:
2219             x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2220             x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2221             break;
2222         case D_L0_4x4:
2223             x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2224             x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2225             x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2226             x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2227             break;
2228         default:
2229             x264_log( h, X264_LOG_ERROR, "internal error\n" );
2230             break;
2231     }
2232 }
2233
2234 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2235 {
2236     int x = 2*(idx&1);
2237     int y = idx&2;
2238     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2239     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2240     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2241     x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2242 }
2243
2244 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2245     if( x264_mb_partition_listX_table[0][part] ) \
2246     { \
2247         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2248         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2249     } \
2250     else \
2251     { \
2252         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2253         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
2254         if( b_mvd ) \
2255             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2256     } \
2257     if( x264_mb_partition_listX_table[1][part] ) \
2258     { \
2259         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2260         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2261     } \
2262     else \
2263     { \
2264         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2265         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
2266         if( b_mvd ) \
2267             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2268     }
2269
2270 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2271 {
2272     int x = 2*(i&1);
2273     int y = i&2;
2274     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2275     {
2276         x264_mb_load_mv_direct8x8( h, i );
2277         if( b_mvd )
2278         {
2279             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
2280             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
2281             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2282         }
2283     }
2284     else
2285     {
2286         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2287     }
2288 }
2289 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2290 {
2291     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2292 }
2293 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2294 {
2295     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
2296 }
2297 #undef CACHE_MV_BI
2298
/* B-frame 8x8 analysis in which every 8x8 block may use its own reference.
 * For each of the four blocks, refs 0..i_maxref[l] of both lists are searched
 * and the cheapest result per list is kept in lX->me8x8[i].  The two per-list
 * winners are then averaged to form a BI candidate, and the cheapest of
 * L0 / L1 / BI / direct is stored in h->mb.i_sub_partition[i].
 * On return a->i_cost8x8bi holds the sum of the four block costs plus the
 * B_8x8 macroblock type cost, and a->i_satd8x8[0..2][i] hold the per-block
 * L0 / L1 / BI scores. */
2299 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2300 {
2301     ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
         /* per list: highest ref index that will be searched below */
2302     int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2303
2304     /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2305      * than those used by the neighbors */
         /* CHECK_NEIGHBOUR(i): raise i_maxref[l] to the ref cached at offset i from
          * X264_SCAN8_0 when that ref is larger.  `l` is taken from the enclosing loop.
          * NOTE(review): the offsets used below look like they assume 8 cache
          * entries per row (row above at -8, left column at -1) - confirm. */
2306     #define CHECK_NEIGHBOUR(i)\
2307     {\
2308         int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2309         if( ref > i_maxref[l] )\
2310             i_maxref[l] = ref;\
2311     }
2312
2313     for( int l = 0; l < 2; l++ )
2314     {
2315         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2316         if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2317             h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2318         {
                 /* restart from ref 0 and grow only as far as the neighbours require */
2319             i_maxref[l] = 0;
2320             CHECK_NEIGHBOUR(  -8 - 1 );
2321             CHECK_NEIGHBOUR(  -8 + 0 );
2322             CHECK_NEIGHBOUR(  -8 + 2 );
2323             CHECK_NEIGHBOUR(  -8 + 4 );
2324             CHECK_NEIGHBOUR(   0 - 1 );
2325             CHECK_NEIGHBOUR( 2*8 - 1 );
2326         }
2327     }
2328
2329     /* XXX Needed for x264_mb_predict_mv */
2330     h->mb.i_partition = D_8x8;
2331
2332     a->i_cost8x8bi = 0;
2333
2334     for( int i = 0; i < 4; i++ )
2335     {
             /* block position inside the macroblock, in units of 8 pixels */
2336         int x8 = i&1;
2337         int y8 = i>>1;
2338         int i_part_cost;
2339         int i_part_cost_bi;
2340         intptr_t stride[2] = {8,8};
2341         pixel *src[2];
             /* scratch search state, reused for every list/ref candidate of this block */
2342         x264_me_t m;
2343         m.i_pixel = PIXEL_8x8;
2344         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2345
2346         for( int l = 0; l < 2; l++ )
2347         {
2348             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2349
2350             lX->me8x8[i].cost = INT_MAX;
2351             for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2352             {
2353                 m.i_ref_cost = REF_COST( l, i_ref );
2354
2355                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2356
                     /* the candidate ref is written to the cache before the MV predictor
                      * is computed; NOTE(review): presumably x264_mb_predict_mv reads it
                      * from there - confirm */
2357                 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2358                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                     /* candidate MVs: the first i+1 entries saved for this ref */
2359                 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2360                 m.cost += m.i_ref_cost;
2361
2362                 if( m.cost < lX->me8x8[i].cost )
2363                 {
2364                     h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
                         /* keep the score with the MV and ref costs taken back out */
2365                     a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2366                 }
2367
2368                 /* save mv for predicting other partitions within this MB */
2369                 CP32( lX->mvc[i_ref][i+1], m.mv );
2370             }
2371         }
2372
2373         /* BI mode */
             /* fetch both per-list winners and average them with the bipred weight
              * selected by the two ref indices; the result lands in pix[0] */
2374         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2375                                 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
2376         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2377                                 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
2378         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2379                                 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2380
2381         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2382         i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2383                          + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2384                          + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2385
2386         if( h->mb.b_chroma_me )
2387         {
2388             int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2389             i_part_cost_bi += i_chroma_cost;
2390             a->i_satd8x8[2][i] += i_chroma_cost;
2391         }
2392
             /* add the sub-macroblock type cost to each single-list candidate */
2393         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2394         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2395
             /* choose the cheapest of L0, L1, BI and direct; ties keep the earlier one */
2396         i_part_cost = a->l0.me8x8[i].cost;
2397         h->mb.i_sub_partition[i] = D_L0_8x8;
2398         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2399         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2400         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2401         a->i_cost8x8bi += i_part_cost;
2402
2403         /* XXX Needed for x264_mb_predict_mv */
2404         x264_mb_cache_mv_b8x8( h, a, i, 0 );
2405     }
2406
2407     /* mb type cost */
2408     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2409 }
2410
/* B-frame 8x8 analysis restricted to one reference per list: each list reuses
 * the ref index its 16x16 search selected (lX->me16x16.i_ref).
 * For each of the four 8x8 blocks both lists are searched, the two results are
 * averaged for a BI candidate, and the cheapest of L0 / L1 / BI / direct is
 * stored in h->mb.i_sub_partition[i].
 * On return a->i_cost8x8bi holds the sum of the four block costs plus the
 * B_8x8 macroblock type cost. */
2411 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2412 {
         /* reference planes of the ref each list picked at 16x16 */
2413     pixel **p_fref[2] =
2414         { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2415           h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2416     ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2417
2418     /* XXX Needed for x264_mb_predict_mv */
2419     h->mb.i_partition = D_8x8;
2420
2421     a->i_cost8x8bi = 0;
2422
2423     for( int i = 0; i < 4; i++ )
2424     {
             /* block position inside the macroblock, in units of 8 pixels */
2425         int x8 = i&1;
2426         int y8 = i>>1;
2427         int i_part_cost;
             /* accumulates MV + ref cost of both lists, then the BI distortion */
2428         int i_part_cost_bi = 0;
2429         intptr_t stride[2] = {8,8};
2430         pixel *src[2];
2431
2432         for( int l = 0; l < 2; l++ )
2433         {
2434             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
                 /* search directly into the stored result; there is only one ref to try */
2435             x264_me_t *m = &lX->me8x8[i];
2436             m->i_pixel = PIXEL_8x8;
2437             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2438
2439             m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2440             m->i_ref = lX->me16x16.i_ref;
2441
2442             LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2443
2444             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2445             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
                 /* the 16x16 MV of this list is the only candidate handed to the search */
2446             x264_me_search( h, m, &lX->me16x16.mv, 1 );
                 /* recorded before the ref cost is added, so only the MV cost is removed */
2447             a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2448             m->cost += m->i_ref_cost;
2449
2450             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2451
2452             /* save mv for predicting other partitions within this MB */
2453             CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2454
2455             /* BI mode */
2456             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2457                                     m->mv[0], m->mv[1], 8, 8, x264_weight_none );
2458             i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2459         }
             /* weighted average of the two list predictions goes into pix[0] */
2460         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2461         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2462         i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
             /* add the sub-macroblock type cost to each single-list candidate */
2463         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2464         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2465
2466         if( h->mb.b_chroma_me )
2467         {
2468             int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2469             i_part_cost_bi += i_chroma_cost;
2470             a->i_satd8x8[2][i] += i_chroma_cost;
2471         }
2472
             /* choose the cheapest of L0, L1, BI and direct; ties keep the earlier one */
2473         i_part_cost = a->l0.me8x8[i].cost;
2474         h->mb.i_sub_partition[i] = D_L0_8x8;
2475         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2476         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2477         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2478         a->i_cost8x8bi += i_part_cost;
2479
2480         /* XXX Needed for x264_mb_predict_mv */
2481         x264_mb_cache_mv_b8x8( h, a, i, 0 );
2482     }
2483
2484     /* mb type cost */
2485     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2486 }
2487
/* B-frame 16x8 analysis.  For each of the two 16x8 partitions (i = 0 at pixel
 * row 0, i = 1 at pixel row 8) both lists are searched using only the refs that
 * the two 8x8 blocks covering that partition selected, a BI candidate is built
 * from the per-list winners, and the cheapest of L0 / L1 / BI is recorded in
 * a->i_mb_partition16x8[i].
 * i_best_satd is the best score found so far and is used only for the early
 * termination test.
 * On return a->i_cost16x8bi holds the total cost including the macroblock
 * type cost, or COST_MAX if the analysis terminated early; a->i_mb_type16x8 is
 * set only when both partitions were analysed. */
2488 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2489 {
2490     ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
2491     ALIGNED_4( int16_t mvc[3][2] );
2492
2493     h->mb.i_partition = D_16x8;
2494     a->i_cost16x8bi = 0;
2495
2496     for( int i = 0; i < 2; i++ )
2497     {
2498         int i_part_cost;
2499         int i_part_cost_bi = 0;
2500         intptr_t stride[2] = {16,16};
2501         pixel *src[2];
             /* scratch search state, reused for every list/ref candidate of this partition */
2502         x264_me_t m;
2503         m.i_pixel = PIXEL_16x8;
2504         LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2505
2506         for( int l = 0; l < 2; l++ )
2507         {
2508             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
                 /* refs chosen by 8x8 blocks 2*i and 2*i+1; searched once if they agree */
2509             int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2510             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2511             lX->me16x8[i].cost = INT_MAX;
2512             for( int j = 0; j < i_ref8s; j++ )
2513             {
2514                 int i_ref = ref8[j];
2515                 m.i_ref_cost = REF_COST( l, i_ref );
2516
2517                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2518
                     /* candidates for this ref: the 16x16 slot plus the slots of
                      * 8x8 blocks 2*i and 2*i+1 */
2519                 CP32( mvc[0], lX->mvc[i_ref][0] );
2520                 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2521                 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2522
2523                 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2524                 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2525                 x264_me_search( h, &m, mvc, 3 );
2526                 m.cost += m.i_ref_cost;
2527
2528                 if( m.cost < lX->me16x8[i].cost )
2529                     h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2530             }
2531         }
2532
2533         /* BI mode */
             /* average the two per-list winners into pix[0] and score the result */
2534         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2535                                 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
2536         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2537                                 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
2538         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2539                                 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2540
2541         i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2542                         + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2543                         + a->l1.me16x8[i].i_ref_cost;
2544
2545         if( h->mb.b_chroma_me )
2546             i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2547
2548         i_part_cost = a->l0.me16x8[i].cost;
2549         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2550
2551         if( a->l1.me16x8[i].cost < i_part_cost )
2552         {
2553             i_part_cost = a->l1.me16x8[i].cost;
2554             a->i_mb_partition16x8[i] = D_L1_8x8;
2555         }
             /* BI is taken only if it beats the best single list by more than one
              * lambda; the lambda margin is not added to the stored cost */
2556         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2557         {
2558             i_part_cost = i_part_cost_bi;
2559             a->i_mb_partition16x8[i] = D_BI_8x8;
2560         }
2561         a->i_cost16x8bi += i_part_cost;
2562
2563         /* Early termination based on the current SATD score of partition[0]
2564            plus the estimated SATD score of partition[1] */
             /* checked after the first partition only; the allowed margin over
              * i_best_satd grows by 1/16 each when i_mbrd or i_psy_rd is non-zero */
2565         if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2566             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2567         {
2568             a->i_cost16x8bi = COST_MAX;
2569             return;
2570         }
2571
2572         x264_mb_cache_mv_b16x8( h, a, i, 0 );
2573     }
2574
2575     /* mb type cost */
         /* the macroblock type is derived arithmetically from the two per-partition
          * choices; NOTE(review): relies on the numeric layout of the D_*_8x8 and
          * B_*_* enums, which are defined elsewhere - confirm before reordering them */
2576     a->i_mb_type16x8 = B_L0_L0
2577         + (a->i_mb_partition16x8[0]>>2) * 3
2578         + (a->i_mb_partition16x8[1]>>2);
2579     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2580 }
2581
/* B-frame 8x16 analysis.  For each of the two 8x16 partitions (i = 0 at pixel
 * column 0, i = 1 at pixel column 8) both lists are searched using only the
 * refs that the two 8x8 blocks covering that partition selected, a BI candidate
 * is built from the per-list winners, and the cheapest of L0 / L1 / BI is
 * recorded in a->i_mb_partition8x16[i].
 * i_best_satd is the best score found so far and is used only for the early
 * termination test.
 * On return a->i_cost8x16bi holds the total cost including the macroblock
 * type cost, or COST_MAX if the analysis terminated early; a->i_mb_type8x16 is
 * set only when both partitions were analysed. */
2582 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2583 {
2584     ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2585     ALIGNED_4( int16_t mvc[3][2] );
2586
2587     h->mb.i_partition = D_8x16;
2588     a->i_cost8x16bi = 0;
2589
2590     for( int i = 0; i < 2; i++ )
2591     {
2592         int i_part_cost;
2593         int i_part_cost_bi = 0;
2594         intptr_t stride[2] = {8,8};
2595         pixel *src[2];
             /* scratch search state, reused for every list/ref candidate of this partition */
2596         x264_me_t m;
2597         m.i_pixel = PIXEL_8x16;
2598         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2599
2600         for( int l = 0; l < 2; l++ )
2601         {
2602             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
                 /* refs chosen by 8x8 blocks i and i+2; searched once if they agree */
2603             int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2604             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2605             lX->me8x16[i].cost = INT_MAX;
2606             for( int j = 0; j < i_ref8s; j++ )
2607             {
2608                 int i_ref = ref8[j];
2609                 m.i_ref_cost = REF_COST( l, i_ref );
2610
2611                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2612
                     /* candidates for this ref: the 16x16 slot plus the slots of
                      * 8x8 blocks i and i+2 */
2613                 CP32( mvc[0], lX->mvc[i_ref][0] );
2614                 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2615                 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2616
2617                 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2618                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2619                 x264_me_search( h, &m, mvc, 3 );
2620                 m.cost += m.i_ref_cost;
2621
2622                 if( m.cost < lX->me8x16[i].cost )
2623                     h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2624             }
2625         }
2626
2627         /* BI mode */
             /* average the two per-list winners into pix[0] and score the result */
2628         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2629                                 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
2630         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2631                                 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
2632         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2633
2634         i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2635                         + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2636                         + a->l1.me8x16[i].i_ref_cost;
2637
2638         if( h->mb.b_chroma_me )
2639             i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2640
2641         i_part_cost = a->l0.me8x16[i].cost;
             /* the D_*_8x8 constants are reused here only to name the list choice */
2642         a->i_mb_partition8x16[i] = D_L0_8x8;
2643
2644         if( a->l1.me8x16[i].cost < i_part_cost )
2645         {
2646             i_part_cost = a->l1.me8x16[i].cost;
2647             a->i_mb_partition8x16[i] = D_L1_8x8;
2648         }
             /* BI is taken only if it beats the best single list by more than one
              * lambda; the lambda margin is not added to the stored cost */
2649         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2650         {
2651             i_part_cost = i_part_cost_bi;
2652             a->i_mb_partition8x16[i] = D_BI_8x8;
2653         }
2654         a->i_cost8x16bi += i_part_cost;
2655
2656         /* Early termination based on the current SATD score of partition[0]
2657            plus the estimated SATD score of partition[1] */
             /* checked after the first partition only; the allowed margin over
              * i_best_satd grows by 1/16 each when i_mbrd or i_psy_rd is non-zero */
2658         if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2659             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2660         {
2661             a->i_cost8x16bi = COST_MAX;
2662             return;
2663         }
2664
2665         x264_mb_cache_mv_b8x16( h, a, i, 0 );
2666     }
2667
2668     /* mb type cost */
         /* the macroblock type is derived arithmetically from the two per-partition
          * choices and costed with the same table the 16x8 analysis uses;
          * NOTE(review): relies on the numeric layout of the D_*_8x8 and B_*_*
          * enums, which are defined elsewhere - confirm before reordering them */
2669     a->i_mb_type8x16 = B_L0_L0
2670         + (a->i_mb_partition8x16[0]>>2) * 3
2671         + (a->i_mb_partition8x16[1]>>2);
2672     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2673 }
2674
2675 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2676 {
2677     int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
2678
2679     h->mb.i_type = P_L0;
2680     if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
2681     {
2682         h->mb.i_partition = D_16x16;
2683         x264_analyse_update_cache( h, a );
2684         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2685     }
2686
2687     if( a->l0.i_cost16x8 < thresh )
2688     {
2689         h->mb.i_partition = D_16x8;
2690         x264_analyse_update_cache( h, a );
2691         a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2692     }
2693     else
2694         a->l0.i_cost16x8 = COST_MAX;
2695
2696     if( a->l0.i_cost8x16 < thresh )
2697     {
2698         h->mb.i_partition = D_8x16;
2699         x264_analyse_update_cache( h, a );
2700         a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2701     }
2702     else
2703         a->l0.i_cost8x16 = COST_MAX;
2704
2705     if( a->l0.i_cost8x8 < thresh )
2706     {
2707         h->mb.i_type = P_8x8;
2708         h->mb.i_partition = D_8x8;
2709         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2710         {
2711             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2712             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2713             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2714             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2715             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2716              * for future blocks are those left over from previous RDO calls. */
2717             for( int i = 0; i < 4; i++ )
2718             {
2719                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2720                 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
2721                 int subtype, btype = D_L0_8x8;
2722                 uint64_t bcost = COST_MAX64;
2723                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2724                 {
2725                     uint64_t cost;
2726                     if( costs[subtype] > sub8x8_thresh )
2727                         continue;
2728                     h->mb.i_sub_partition[i] = subtype;
2729                     x264_mb_cache_mv_p8x8( h, a, i );
2730                     if( subtype == btype )
2731                         continue;
2732                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2733                     COPY2_IF_LT( bcost, cost, btype, subtype );
2734                 }
2735                 if( h->mb.i_sub_partition[i] != btype )
2736                 {
2737                     h->mb.i_sub_partition[i] = btype;
2738                     x264_mb_cache_mv_p8x8( h, a, i );
2739                 }
2740             }
2741         }
2742         else
2743             x264_analyse_update_cache( h, a );
2744         a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2745     }
2746     else
2747         a->l0.i_cost8x8 = COST_MAX;
2748 }
2749
2750 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2751 {
2752     int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
2753
2754     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2755     {
2756         h->mb.i_type = B_DIRECT;
2757         /* Assumes direct/skip MC is still in fdec */
2758         /* Requires b-rdo to be done before intra analysis */
2759         h->mb.b_skip_mc = 1;
2760         x264_analyse_update_cache( h, a );
2761         a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2762         h->mb.b_skip_mc = 0;
2763     }
2764
2765     //FIXME not all the update_cache calls are needed
2766     h->mb.i_partition = D_16x16;
2767     /* L0 */
2768     if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
2769     {
2770         h->mb.i_type = B_L0_L0;
2771         x264_analyse_update_cache( h, a );
2772         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2773     }
2774
2775     /* L1 */
2776     if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
2777     {
2778         h->mb.i_type = B_L1_L1;
2779         x264_analyse_update_cache( h, a );
2780         a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2781     }
2782
2783     /* BI */
2784     if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
2785     {
2786         h->mb.i_type = B_BI_BI;
2787         x264_analyse_update_cache( h, a );
2788         a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2789     }
2790
2791     /* 8x8 */
2792     if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
2793     {
2794         h->mb.i_type = B_8x8;
2795         h->mb.i_partition = D_8x8;
2796         x264_analyse_update_cache( h, a );
2797         a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2798         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2799     }
2800
2801     /* 16x8 */
2802     if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
2803     {
2804         h->mb.i_type = a->i_mb_type16x8;
2805         h->mb.i_partition = D_16x8;
2806         x264_analyse_update_cache( h, a );
2807         a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2808     }
2809
2810     /* 8x16 */
2811     if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
2812     {
2813         h->mb.i_type = a->i_mb_type8x16;
2814         h->mb.i_partition = D_8x16;
2815         x264_analyse_update_cache( h, a );
2816         a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2817     }
2818 }
2819
2820 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2821 {
2822     int i_biweight;
2823
2824     if( IS_INTRA(h->mb.i_type) )
2825         return;
2826
2827     switch( h->mb.i_partition )
2828     {
2829         case D_16x16:
2830             if( h->mb.i_type == B_BI_BI )
2831             {
2832                 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2833                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2834             }
2835             break;
2836         case D_16x8:
2837             for( int i = 0; i < 2; i++ )
2838                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2839                 {
2840                     i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2841                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2842                 }
2843             break;
2844         case D_8x16:
2845             for( int i = 0; i < 2; i++ )
2846                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2847                 {
2848                     i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2849                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2850                 }
2851             break;
2852         case D_8x8:
2853             for( int i = 0; i < 4; i++ )
2854                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2855                 {
2856                     i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2857                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2858                 }
2859             break;
2860     }
2861 }
2862
2863 static inline void x264_mb_analyse_transform( x264_t *h )
2864 {
2865     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2866     {
2867         /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
2868         x264_mb_mc( h );
2869
2870         int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
2871         int i_cost8 = 0, i_cost4 = 0;
2872         /* Not all platforms have a merged SATD function */
2873         if( h->pixf.sa8d_satd[PIXEL_16x16] )
2874         {
2875             uint64_t cost = 0;
2876             for( int p = 0; p < plane_count; p++ )
2877             {
2878                 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2879                                                         h->mb.pic.p_fdec[p], FDEC_STRIDE );
2880
2881             }
2882             i_cost8 = (uint32_t)cost;
2883             i_cost4 = (uint32_t)(cost >> 32);
2884         }
2885         else
2886         {
2887             for( int p = 0; p < plane_count; p++ )
2888             {
2889                 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2890                                                       h->mb.pic.p_fdec[p], FDEC_STRIDE );
2891                 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2892                                                       h->mb.pic.p_fdec[p], FDEC_STRIDE );
2893             }
2894         }
2895
2896         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2897         h->mb.b_skip_mc = 1;
2898     }
2899 }
2900
2901 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2902 {
2903     if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
2904     {
2905         uint32_t subpart_bak = M32( h->mb.i_sub_partition );
2906         /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
2907         if( h->mb.i_type == P_8x8 )
2908             M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
2909         else if( !x264_transform_allowed[h->mb.i_type] )
2910             return;
2911
2912         x264_analyse_update_cache( h, a );
2913         h->mb.b_transform_8x8 ^= 1;
2914         /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
2915         int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2916
2917         if( *i_rd >= i_rd8 )
2918         {
2919             if( *i_rd > 0 )
2920                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2921             *i_rd = i_rd8;
2922         }
2923         else
2924         {
2925             h->mb.b_transform_8x8 ^= 1;
2926             M32( h->mb.i_sub_partition ) = subpart_bak;
2927         }
2928     }
2929 }
2930
2931 /* Rate-distortion optimal QP selection.
2932  * FIXME: More than half of the benefit of this function seems to be
2933  * in the way it improves the coding of chroma DC (by decimating or
2934  * finding a better way to code a single DC coefficient.)
2935  * There must be a more efficient way to get that portion of the benefit
2936  * without doing full QP-RD, but RD-decimation doesn't seem to do the
2937  * trick. */
2938 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2939 {
2940     int bcost, cost, failures, prevcost, origcost;
2941     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2942     int last_qp_tried = 0;
2943     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2944     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2945
2946     /* If CBP is already zero, don't raise the quantizer any higher. */
2947     for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2948     {
2949         /* Without psy-RD, require monotonicity when moving quant away from previous
2950          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2951          * With psy-RD, allow 1 failure when moving quant away from previous quant,
2952          * allow 2 failures when moving quant towards previous quant.
2953          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2954         int threshold = (!!h->mb.i_psy_rd);
2955         /* Raise the threshold for failures if we're moving towards the last QP. */
2956         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2957             ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
2958             threshold++;
2959         h->mb.i_qp = orig_qp;
2960         failures = 0;
2961         prevcost = origcost;
2962
2963         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2964          * (up to a point) will too.  So, jump down to where the threshold will kick in
2965          * and check the QP there.  If the CBP is still empty, skip the main loop.
2966          * If it isn't empty, we would have ended up having to check this QP anyways,
2967          * so as long as we store it for later lookup, we lose nothing. */
2968         int already_checked_qp = -1;
2969         int already_checked_cost = COST_MAX;
2970         if( direction == -1 )
2971         {
2972             if( !origcbp )
2973             {
2974                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
2975                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2976                 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2977                 if( !h->mb.cbp[h->mb.i_mb_xy] )
2978                 {
2979                     /* If our empty-CBP block is lower QP than the last QP,
2980                      * the last QP almost surely doesn't have a CBP either. */
2981                     if( h->mb.i_last_qp > h->mb.i_qp )
2982                         last_qp_tried = 1;
2983                     break;
2984                 }
2985                 already_checked_qp = h->mb.i_qp;
2986                 h->mb.i_qp = orig_qp;
2987             }
2988         }
2989
2990         h->mb.i_qp += direction;
2991         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2992         {
2993             if( h->mb.i_last_qp == h->mb.i_qp )
2994                 last_qp_tried = 1;
2995             if( h->mb.i_qp == already_checked_qp )
2996                 cost = already_checked_cost;
2997             else
2998             {
2999                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3000                 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3001                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3002             }
3003
3004             /* We can't assume that the costs are monotonic over QPs.
3005              * Tie case-as-failure seems to give better results. */
3006             if( cost < prevcost )
3007                 failures = 0;
3008             else
3009                 failures++;
3010             prevcost = cost;
3011
3012             if( failures > threshold )
3013                 break;
3014             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
3015                 break;
3016             h->mb.i_qp += direction;
3017         }
3018     }
3019
3020     /* Always try the last block's QP. */
3021     if( !last_qp_tried )
3022     {
3023         h->mb.i_qp = h->mb.i_last_qp;
3024         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3025         cost = x264_rd_cost_mb( h, a->i_lambda2 );
3026         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3027     }
3028
3029     h->mb.i_qp = bqp;
3030     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3031
3032     /* Check transform again; decision from before may no longer be optimal. */
3033     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
3034         x264_mb_transform_8x8_allowed( h ) )
3035     {
3036         h->mb.b_transform_8x8 ^= 1;
3037         cost = x264_rd_cost_mb( h, a->i_lambda2 );
3038         if( cost > bcost )
3039             h->mb.b_transform_8x8 ^= 1;
3040     }
3041 }
3042
3043 /*****************************************************************************
3044  * x264_macroblock_analyse:
3045  *****************************************************************************/
3046 void x264_macroblock_analyse( x264_t *h )
3047 {
3048     x264_mb_analysis_t analysis;
3049     int i_cost = COST_MAX;
3050
3051     h->mb.i_qp = x264_ratecontrol_mb_qp( h );
3052     /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
3053      * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
3054     if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
3055         h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
3056
3057     if( h->param.analyse.b_mb_info )
3058         h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
3059     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
3060
3061     /*--------------------------- Do the analysis ---------------------------*/
3062     if( h->sh.i_type == SLICE_TYPE_I )
3063     {
3064 intra_analysis:
3065         if( analysis.i_mbrd )
3066             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3067         x264_mb_analyse_intra( h, &analysis, COST_MAX );
3068         if( analysis.i_mbrd )
3069             x264_intra_rd( h, &analysis, COST_MAX );
3070
3071         i_cost = analysis.i_satd_i16x16;
3072         h->mb.i_type = I_16x16;
3073         COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
3074         COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
3075         if( analysis.i_satd_pcm < i_cost )
3076             h->mb.i_type = I_PCM;
3077
3078         else if( analysis.i_mbrd >= 2 )
3079             x264_intra_rd_refine( h, &analysis );
3080     }
3081     else if( h->sh.i_type == SLICE_TYPE_P )
3082     {
3083         int b_skip = 0;
3084
3085         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
3086
3087         analysis.b_try_skip = 0;
3088         if( analysis.b_force_intra )
3089         {
3090             if( !h->param.analyse.b_psy )
3091             {
3092                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3093                 goto intra_analysis;
3094             }
3095         }
3096         else
3097         {
3098             /* Special fast-skip logic using information from mb_info. */
3099             if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
3100             {
3101                 if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
3102                     h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
3103                 {
3104                     h->mb.i_partition = D_16x16;
3105                     /* Use the P-SKIP MV if we can... */
3106                     if( !M32(h->mb.cache.pskip_mv) )
3107                     {
3108                         b_skip = 1;
3109                         h->mb.i_type = P_SKIP;
3110                     }
3111                     /* Otherwise, just force a 16x16 block. */
3112                     else
3113                     {
3114                         h->mb.i_type = P_L0;
3115                         analysis.l0.me16x16.i_ref = 0;
3116                         M32( analysis.l0.me16x16.mv ) = 0;
3117                     }
3118                     goto skip_analysis;
3119                 }
3120                 /* Reset the information accordingly */
3121                 else if( h->param.analyse.b_mb_info_update )
3122                     h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
3123             }
3124
3125             int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
3126             /* If the current macroblock is off the frame, just skip it. */
3127             if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
3128                 b_skip = 1;
3129             /* Fast P_SKIP detection */
3130             else if( h->param.analyse.b_fast_pskip )
3131             {
3132                 if( skip_invalid )
3133                     // FIXME don't need to check this if the reference frame is done
3134                     {}
3135                 else if( h->param.analyse.i_subpel_refine >= 3 )
3136                     analysis.b_try_skip = 1;
3137                 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
3138                          h->mb.i_mb_type_top == P_SKIP ||
3139                          h->mb.i_mb_type_topleft == P_SKIP ||
3140                          h->mb.i_mb_type_topright == P_SKIP )
3141                     b_skip = x264_macroblock_probe_pskip( h );
3142             }
3143         }
3144
3145         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
3146
3147         if( b_skip )
3148         {
3149             h->mb.i_type = P_SKIP;
3150             h->mb.i_partition = D_16x16;
3151             assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
3152 skip_analysis:
3153             /* Set up MVs for future predictors */
3154             for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3155                 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3156         }
3157         else
3158         {
3159             const unsigned int flags = h->param.analyse.inter;
3160             int i_type;
3161             int i_partition;
3162             int i_satd_inter, i_satd_intra;
3163
3164             x264_mb_analyse_load_costs( h, &analysis );
3165
3166             x264_mb_analyse_inter_p16x16( h, &analysis );
3167
3168             if( h->mb.i_type == P_SKIP )
3169             {
3170                 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3171                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3172                 return;
3173             }
3174
3175             if( flags & X264_ANALYSE_PSUB16x16 )
3176             {
3177                 if( h->param.analyse.b_mixed_references )
3178                     x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
3179                 else
3180                     x264_mb_analyse_inter_p8x8( h, &analysis );
3181             }
3182
3183             /* Select best inter mode */
3184             i_type = P_L0;
3185             i_partition = D_16x16;
3186             i_cost = analysis.l0.me16x16.cost;
3187
3188             if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3189                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
3190             {
3191                 i_type = P_8x8;
3192                 i_partition = D_8x8;
3193                 i_cost = analysis.l0.i_cost8x8;
3194
3195                 /* Do sub 8x8 */
3196                 if( flags & X264_ANALYSE_PSUB8x8 )
3197                 {
3198                     for( int i = 0; i < 4; i++ )
3199                     {
3200                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
3201                         int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
3202                         if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
3203                         {
3204                             int i_cost8x8 = analysis.l0.i_cost4x4[i];
3205                             h->mb.i_sub_partition[i] = D_L0_4x4;
3206
3207                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
3208                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
3209                                          h->mb.i_sub_partition[i], D_L0_8x4 );
3210
3211                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
3212                             COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
3213                                          h->mb.i_sub_partition[i], D_L0_4x8 );
3214
3215                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
3216                         }
3217                         x264_mb_cache_mv_p8x8( h, &analysis, i );
3218                     }
3219                     analysis.l0.i_cost8x8 = i_cost;
3220                 }
3221             }
3222
3223             /* Now do 16x8/8x16 */
3224             int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
3225             if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3226                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
3227             {
3228                 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
3229                                       + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3230                 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3231
3232                 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
3233                 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
3234
3235                 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
3236                                   + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3237                 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3238
3239                 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
3240                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
3241             }
3242
3243             h->mb.i_partition = i_partition;
3244
3245             /* refine qpel */
3246             //FIXME mb_type costs?
3247             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3248             {
3249                 /* refine later */
3250             }
3251             else if( i_partition == D_16x16 )
3252             {
3253                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3254                 i_cost = analysis.l0.me16x16.cost;
3255             }
3256             else if( i_partition == D_16x8 )
3257             {
3258                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
3259                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
3260                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
3261             }
3262             else if( i_partition == D_8x16 )
3263             {
3264                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
3265                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
3266                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
3267             }
3268             else if( i_partition == D_8x8 )
3269             {
3270                 i_cost = 0;
3271                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3272                 {
3273                     switch( h->mb.i_sub_partition[i8x8] )
3274                     {
3275                         case D_L0_8x8:
3276                             x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3277                             i_cost += analysis.l0.me8x8[i8x8].cost;
3278                             break;
3279                         case D_L0_8x4:
3280                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3281                             x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3282                             i_cost += analysis.l0.me8x4[i8x8][0].cost +
3283                                       analysis.l0.me8x4[i8x8][1].cost;
3284                             break;
3285                         case D_L0_4x8:
3286                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3287                             x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3288                             i_cost += analysis.l0.me4x8[i8x8][0].cost +
3289                                       analysis.l0.me4x8[i8x8][1].cost;
3290                             break;
3291
3292                         case D_L0_4x4:
3293                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3294                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3295                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3296                             x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3297                             i_cost += analysis.l0.me4x4[i8x8][0].cost +
3298                                       analysis.l0.me4x4[i8x8][1].cost +
3299                                       analysis.l0.me4x4[i8x8][2].cost +
3300                                       analysis.l0.me4x4[i8x8][3].cost;
3301                             break;
3302                         default:
3303                             x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3304                             break;
3305                     }
3306                 }
3307             }
3308
3309             if( h->mb.b_chroma_me )
3310             {
3311                 if( CHROMA444 )
3312                 {
3313                     x264_mb_analyse_intra( h, &analysis, i_cost );
3314                     x264_mb_analyse_intra_chroma( h, &analysis );
3315                 }
3316                 else
3317                 {
3318                     x264_mb_analyse_intra_chroma( h, &analysis );
3319                     x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
3320                 }
3321                 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3322                 analysis.i_satd_i8x8   += analysis.i_satd_chroma;
3323                 analysis.i_satd_i4x4   += analysis.i_satd_chroma;
3324             }
3325             else
3326                 x264_mb_analyse_intra( h, &analysis, i_cost );
3327
3328             i_satd_inter = i_cost;
3329             i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3330                                       analysis.i_satd_i8x8,
3331                                       analysis.i_satd_i4x4 );
3332
3333             if( analysis.i_mbrd )
3334             {
3335                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3336                 i_type = P_L0;
3337                 i_partition = D_16x16;
3338                 i_cost = analysis.l0.i_rd16x16;
3339                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3340                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3341                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3342                 h->mb.i_type = i_type;
3343                 h->mb.i_partition = i_partition;
3344                 if( i_cost < COST_MAX )
3345                     x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3346                 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
3347             }
3348
3349             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3350             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3351             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3352             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3353
3354             h->mb.i_type = i_type;
3355
3356             if( analysis.b_force_intra && !IS_INTRA(i_type) )
3357             {
3358                 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3359                  * it was an inter block. */
3360                 x264_analyse_update_cache( h, &analysis );
3361                 x264_macroblock_encode( h );
3362                 for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
3363                     h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
3364                 if( !CHROMA444 )
3365                 {
3366                     int height = 16 >> CHROMA_V_SHIFT;
3367                     h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
3368                     h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
3369                 }
3370                 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3371                 goto intra_analysis;
3372             }
3373
3374             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
3375             {
3376                 if( IS_INTRA( h->mb.i_type ) )
3377                 {
3378                     x264_intra_rd_refine( h, &analysis );
3379                 }
3380                 else if( i_partition == D_16x16 )
3381                 {
3382                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
3383                     analysis.l0.me16x16.cost = i_cost;
3384                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3385                 }
3386                 else if( i_partition == D_16x8 )
3387                 {
3388                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
3389                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
3390                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
3391                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
3392                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
3393                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
3394                 }
3395                 else if( i_partition == D_8x16 )
3396                 {
3397                     h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
3398                     h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
3399                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
3400                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
3401                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
3402                     x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
3403                 }
3404                 else if( i_partition == D_8x8 )
3405                 {
3406                     x264_analyse_update_cache( h, &analysis );
3407                     for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3408                     {
3409                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
3410                         {
3411                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
3412                         }
3413                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
3414                         {
3415                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3416                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
3417                         }
3418                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
3419                         {
3420                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3421                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
3422                         }
3423                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
3424                         {
3425                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3426                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
3427                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
3428                             x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
3429                         }
3430                     }
3431                 }
3432             }
3433         }
3434     }
3435     else if( h->sh.i_type == SLICE_TYPE_B )
3436     {
3437         int i_bskip_cost = COST_MAX;
3438         int b_skip = 0;
3439
3440         if( analysis.i_mbrd )
3441             x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3442
3443         h->mb.i_type = B_SKIP;
3444         if( h->mb.b_direct_auto_write )
3445         {
3446             /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
3447             for( int i = 0; i < 2; i++ )
3448             {
3449                 int b_changed = 1;
3450                 h->sh.b_direct_spatial_mv_pred ^= 1;
3451                 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
3452                 if( analysis.b_direct_available )
3453                 {
3454                     if( b_changed )
3455                     {
3456                         x264_mb_mc( h );
3457                         b_skip = x264_macroblock_probe_bskip( h );
3458                     }
3459                     h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
3460                 }
3461                 else
3462                     b_skip = 0;
3463             }
3464         }
3465         else
3466             analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
3467
3468         analysis.b_try_skip = 0;
3469         if( analysis.b_direct_available )
3470         {
3471             if( !h->mb.b_direct_auto_write )
3472                 x264_mb_mc( h );
3473             /* If the current macroblock is off the frame, just skip it. */
3474             if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
3475                 b_skip = 1;
3476             else if( analysis.i_mbrd )
3477             {
3478                 i_bskip_cost = ssd_mb( h );
3479                 /* 6 = minimum cavlc cost of a non-skipped MB */
3480                 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
3481             }
3482             else if( !h->mb.b_direct_auto_write )
3483             {
3484                 /* Conditioning the probe on neighboring block types
3485                  * doesn't seem to help speed or quality. */
3486                 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
3487                 if( h->param.analyse.i_subpel_refine < 3 )
3488                     b_skip = analysis.b_try_skip;
3489             }
3490             /* Set up MVs for future predictors */
3491             if( b_skip )
3492             {
3493                 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3494                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3495                 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
3496                     M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
3497             }
3498         }
3499
3500         if( !b_skip )
3501         {
3502             const unsigned int flags = h->param.analyse.inter;
3503             int i_type;
3504             int i_partition;
3505             int i_satd_inter;
3506             h->mb.b_skip_mc = 0;
3507             h->mb.i_type = B_DIRECT;
3508
3509             x264_mb_analyse_load_costs( h, &analysis );
3510
3511             /* select best inter mode */
3512             /* direct must be first */
3513             if( analysis.b_direct_available )
3514                 x264_mb_analyse_inter_direct( h, &analysis );
3515
3516             x264_mb_analyse_inter_b16x16( h, &analysis );
3517
3518             if( h->mb.i_type == B_SKIP )
3519             {
3520                 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3521                     M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3522                 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
3523                     M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
3524                 return;
3525             }
3526
3527             i_type = B_L0_L0;
3528             i_partition = D_16x16;
3529             i_cost = analysis.l0.me16x16.cost;
3530             COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3531             COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3532             COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
3533
3534             if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
3535             {
3536                 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3537                 if( i_bskip_cost < analysis.i_rd16x16direct &&
3538                     i_bskip_cost < analysis.i_rd16x16bi &&
3539                     i_bskip_cost < analysis.l0.i_rd16x16 &&
3540                     i_bskip_cost < analysis.l1.i_rd16x16 )
3541                 {
3542                     h->mb.i_type = B_SKIP;
3543                     x264_analyse_update_cache( h, &analysis );
3544                     return;
3545                 }
3546             }
3547
3548             if( flags & X264_ANALYSE_BSUB16x16 )
3549             {
3550                 if( h->param.analyse.b_mixed_references )
3551                     x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3552                 else
3553                     x264_mb_analyse_inter_b8x8( h, &analysis );
3554
3555                 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3556
3557                 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
3558                 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3559                 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3560                 for( int i = 0; i < 2; i++ )
3561                 {
3562                     int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3563                     int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3564                     // 16x8
3565                     i_best_cost = COST_MAX;
3566                     i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3567                     i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3568                     i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3569                     avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3570                                          + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3571                     avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3572                                          + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3573                     COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3574                     COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3575                     COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3576                     analysis.i_cost_est16x8[i] = i_best_cost;
3577
3578                     // 8x16
3579                     i_best_cost = COST_MAX;
3580                     i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3581                     i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3582                     i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3583                     avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3584                                          + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3585                     avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3586                                          + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3587                     COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3588                     COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3589                     COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3590                     analysis.i_cost_est8x16[i] = i_best_cost;
3591                 }
3592                 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3593                 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3594                 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3595                 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3596                 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3597                 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
3598
3599                 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3600                 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3601                 if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
3602                 {
3603                     x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3604                     COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3605                 }
3606                 if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
3607                 {
3608                     x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3609                     COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3610                 }
3611                 if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
3612                 {
3613                     x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3614                     COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3615                 }
3616             }
3617
3618             if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3619             {
3620                 /* refine later */
3621             }
3622             /* refine qpel */
3623             else if( i_partition == D_16x16 )
3624             {
3625                 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3626                 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3627                 if( i_type == B_L0_L0 )
3628                 {
3629                     x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3630                     i_cost = analysis.l0.me16x16.cost
3631                            + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3632                 }
3633                 else if( i_type == B_L1_L1 )
3634                 {
3635                     x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3636                     i_cost = analysis.l1.me16x16.cost
3637                            + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3638                 }
3639                 else if( i_type == B_BI_BI )
3640                 {
3641                     x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3642                     x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3643                 }
3644             }
3645             else if( i_partition == D_16x8 )
3646             {
3647                 for( int i = 0; i < 2; i++ )
3648                 {
3649                     if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3650                         x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3651                     if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3652                         x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3653                 }
3654             }
3655             else if( i_partition == D_8x16 )
3656             {
3657                 for( int i = 0; i < 2; i++ )
3658                 {
3659                     if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3660                         x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3661                     if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3662                         x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3663                 }
3664             }
3665             else if( i_partition == D_8x8 )
3666             {
3667                 for( int i = 0; i < 4; i++ )
3668                 {
3669                     x264_me_t *m;
3670                     int i_part_cost_old;
3671                     int i_type_cost;
3672                     int i_part_type = h->mb.i_sub_partition[i];
3673                     int b_bidir = (i_part_type == D_BI_8x8);
3674
3675                     if( i_part_type == D_DIRECT_8x8 )
3676                         continue;
3677                     if( x264_mb_partition_listX_table[0][i_part_type] )
3678                     {
3679                         m = &analysis.l0.me8x8[i];
3680                         i_part_cost_old = m->cost;
3681                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3682                         m->cost -= i_type_cost;
3683                         x264_me_refine_qpel( h, m );
3684                         if( !b_bidir )
3685                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3686                     }
3687                     if( x264_mb_partition_listX_table[1][i_part_type] )
3688                     {
3689                         m = &analysis.l1.me8x8[i];
3690                         i_part_cost_old = m->cost;
3691                         i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3692                         m->cost -= i_type_cost;
3693                         x264_me_refine_qpel( h, m );
3694                         if( !b_bidir )
3695                             analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3696                     }
3697                     /* TODO: update mvp? */
3698                 }
3699             }
3700
3701             i_satd_inter = i_cost;
3702
3703             if( analysis.i_mbrd )
3704             {
3705                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3706                 i_type = B_SKIP;
3707                 i_cost = i_bskip_cost;
3708                 i_partition = D_16x16;
3709                 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3710                 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3711                 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3712                 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3713                 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3714                 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3715                 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3716
3717                 h->mb.i_type = i_type;
3718                 h->mb.i_partition = i_partition;
3719             }
3720
3721             if( h->mb.b_chroma_me )
3722             {
3723                 if( CHROMA444 )
3724                 {
3725                     x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3726                     x264_mb_analyse_intra_chroma( h, &analysis );
3727                 }
3728                 else
3729                 {
3730                     x264_mb_analyse_intra_chroma( h, &analysis );
3731                     x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
3732                 }
3733                 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3734                 analysis.i_satd_i8x8   += analysis.i_satd_chroma;
3735                 analysis.i_satd_i4x4   += analysis.i_satd_chroma;
3736             }
3737             else
3738                 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3739
3740             if( analysis.i_mbrd )
3741             {
3742                 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3743                 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
3744             }
3745
3746             COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3747             COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3748             COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3749             COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3750
3751             h->mb.i_type = i_type;
3752             h->mb.i_partition = i_partition;
3753
3754             if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3755                 x264_intra_rd_refine( h, &analysis );
3756             if( h->mb.i_subpel_refine >= 5 )
3757                 x264_refine_bidir( h, &analysis );
3758
3759             if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3760             {
3761                 int i_biweight;
3762                 x264_analyse_update_cache( h, &analysis );
3763
3764                 if( i_partition == D_16x16 )
3765                 {
3766                     if( i_type == B_L0_L0 )
3767                     {
3768                         analysis.l0.me16x16.cost = i_cost;
3769                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3770                     }
3771                     else if( i_type == B_L1_L1 )
3772                     {
3773                         analysis.l1.me16x16.cost = i_cost;
3774                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3775                     }
3776                     else if( i_type == B_BI_BI )
3777                     {
3778                         i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3779                         x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3780                     }
3781                 }
3782                 else if( i_partition == D_16x8 )
3783                 {
3784                     for( int i = 0; i < 2; i++ )
3785                     {
3786                         h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3787                         if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3788                             x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3789                         else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3790                             x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3791                         else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3792                         {
3793                             i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3794                             x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3795                         }
3796                     }
3797                 }
3798                 else if( i_partition == D_8x16 )
3799                 {
3800                     for( int i = 0; i < 2; i++ )
3801                     {
3802                         h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3803                         if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3804                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3805                         else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3806                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3807                         else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3808                         {
3809                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3810                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3811                         }
3812                     }
3813                 }
3814                 else if( i_partition == D_8x8 )
3815                 {
3816                     for( int i = 0; i < 4; i++ )
3817                     {
3818                         if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3819                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3820                         else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3821                             x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3822                         else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3823                         {
3824                             i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3825                             x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3826                         }
3827                     }
3828                 }
3829             }
3830         }
3831     }
3832
3833     x264_analyse_update_cache( h, &analysis );
3834
3835     /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3836      * without realizing it.  Check for this and account for it if necessary. */
3837     if( analysis.i_mbrd >= 2 )
3838     {
3839         /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3840         static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3841         int list = check_mv_lists[h->mb.i_type] - 1;
3842         if( list >= 0 && h->mb.i_partition != D_16x16 &&
3843             M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3844             h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3845                 h->mb.i_partition = D_16x16;
3846     }
3847
3848     if( !analysis.i_mbrd )
3849         x264_mb_analyse_transform( h );
3850
3851     if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3852         x264_mb_analyse_qp_rd( h, &analysis );
3853
3854     h->mb.b_trellis = h->param.analyse.i_trellis;
3855     h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));
3856
3857     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3858         x264_psy_trellis_init( h, 0 );
3859     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3860         h->mb.i_skip_intra = 0;
3861 }
3862
3863 /*-------------------- Update MB from the analysis ----------------------*/
     /* Write the decision stored in the analysis context back into the
      * macroblock state of the encoder.
      *
      * Dispatches on h->mb.i_type (and, for inter types, h->mb.i_partition):
      *  - intra types: store the chosen prediction mode(s) from `a` and run
      *    x264_mb_analyse_intra_chroma();
      *  - P/B inter types: store reference indices and motion vectors taken
      *    from the x264_me_t entries of `a` through the x264_macroblock_cache_*
      *    and x264_mb_cache_mv_* helpers;
      *  - skip/direct types: load the predicted skip/direct motion.
      *
      * h: encoder context; h->mb.i_type / h->mb.i_partition select the branch
      *    and h->mb.cache (plus a few h->mb fields) are written.
      * a: analysis results for the current macroblock.
      *
      * When NDEBUG is not defined, a trailing sanity check on the cached
      * vertical MV may override the macroblock type with I_16x16 (see below). */
3864 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
3865 {
3866     switch( h->mb.i_type )
3867     {
3868         case I_4x4:
             /* One prediction mode per 4x4 block, stored at its x264_scan8 cache slot. */
3869             for( int i = 0; i < 16; i++ )
3870                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3871
3872             x264_mb_analyse_intra_chroma( h, a );
3873             break;
3874         case I_8x8:
             /* One prediction mode per 8x8 block; (2*(i&1), 2*(i>>1)) is the
              * block position passed to the cache helper. */
3875             for( int i = 0; i < 4; i++ )
3876                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3877
3878             x264_mb_analyse_intra_chroma( h, a );
3879             break;
3880         case I_16x16:
3881             h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3882             x264_mb_analyse_intra_chroma( h, a );
3883             break;
3884
3885         case I_PCM:
             /* Nothing is written to the cache for I_PCM. */
3886             break;
3887
3888         case P_L0:
             /* List-0 ref and MV for each partition of the macroblock. */
3889             switch( h->mb.i_partition )
3890             {
3891                 case D_16x16:
3892                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3893                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3894                     break;
3895
3896                 case D_16x8:
3897                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3898                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3899                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3900                     x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3901                     break;
3902
3903                 case D_8x16:
3904                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3905                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3906                     x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3907                     x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3908                     break;
3909
3910                 default:
                     /* Any other partition is reported; the cache is left untouched. */
3911                     x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3912                     break;
3913             }
3914             break;
3915
3916         case P_8x8:
             /* Refs are written per 8x8 quadrant; the MVs are delegated to
              * x264_mb_cache_mv_p8x8() for each quadrant. */
3917             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3918             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3919             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3920             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3921             for( int i = 0; i < 4; i++ )
3922                 x264_mb_cache_mv_p8x8( h, a, i );
3923             break;
3924
3925         case P_SKIP:
3926         {
             /* Skip is forced to 16x16 with list-0 ref 0 and the cached pskip_mv. */
3927             h->mb.i_partition = D_16x16;
3928             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3929             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3930             break;
3931         }
3932
3933         case B_SKIP:
3934         case B_DIRECT:
             /* Partition comes from the cached direct prediction; the direct
              * motion is loaded for each of the four 8x8 blocks. */
3935             h->mb.i_partition = h->mb.cache.direct_partition;
3936             x264_mb_load_mv_direct8x8( h, 0 );
3937             x264_mb_load_mv_direct8x8( h, 1 );
3938             x264_mb_load_mv_direct8x8( h, 2 );
3939             x264_mb_load_mv_direct8x8( h, 3 );
3940             break;
3941
3942         case B_8x8:
3943             /* optimize: cache might not need to be rewritten */
             /* The trailing argument 1 is passed through to the helper; its
              * meaning is defined there (not visible in this block). */
3944             for( int i = 0; i < 4; i++ )
3945                 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3946             break;
3947
3948         default: /* the rest of the B types */
3949             switch( h->mb.i_partition )
3950             {
3951             case D_16x16:
3952                 switch( h->mb.i_type )
3953                 {
3954                 case B_L0_L0:
                     /* List 0 gets the 16x16 result; list 1 is cleared
                      * (ref -1, mv 0, mvd 0). */
3955                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3956                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3957
3958                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3959                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3960                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3961                     break;
3962                 case B_L1_L1:
                     /* Mirror of B_L0_L0: list 0 is cleared, list 1 gets the
                      * 16x16 result. */
3963                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3964                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3965                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3966
3967                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3968                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3969                     break;
3970                 case B_BI_BI:
                     /* Both lists use the dedicated bi16x16 results rather than
                      * the per-list me16x16 ones. */
3971                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3972                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3973
3974                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3975                     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3976                     break;
                     /* Other types with a 16x16 partition fall through the
                      * switch without writing anything. */
3977                 }
3978                 break;
3979             case D_16x8:
3980                 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3981                 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3982                 break;
3983             case D_8x16:
3984                 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3985                 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3986                 break;
3987             default:
3988                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3989                 break;
3990             }
3991     }
3992
     /* Debug-only sanity check, run with more than one frame thread and a
      * non-intra macroblock: compare the cached vertical MV against the number
      * of lines the reference frame reports as completed. */
3993 #ifndef NDEBUG
3994     if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3995     {
         /* List 0 always; list 1 as well when the slice type is B. */
3996         for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3997         {
3998             int completed;
3999             int ref = h->mb.cache.ref[l][x264_scan8[0]];
             /* A negative ref means this list is not used: nothing to check. */
4000             if( ref < 0 )
4001                 continue;
4002             completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
             /* Uses the MV cached at x264_scan8[15]; the shift presumably turns
              * the vertical MV into full-pel lines - confirm against the MV
              * units used by the cache. */
4003             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
4004             {
4005                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
4006                 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
4007                 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
4008                                 h->mb.cache.mv[l][x264_scan8[15]][0],
4009                                 h->mb.cache.mv[l][x264_scan8[15]][1] );
4010                 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
4011                 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
4012                 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
4013                 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                 /* Recovery: rerun intra analysis with COST_MAX as the cost
                  * argument and force the macroblock to I_16x16.
                  * NOTE(review): the loop is not exited here, so for B slices
                  * list 1 is still checked after the type has been switched. */
4014                 x264_mb_analyse_intra( h, a, COST_MAX );
4015                 h->mb.i_type = I_16x16;
4016                 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
4017                 x264_mb_analyse_intra_chroma( h, a );
4018             }
4019         }
4020     }
4021 #endif
4022 }
4023
4024 #include "slicetype.c"
4025