1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74     /* conduct the analysis using this lambda and QP */
79 uint16_t *p_cost_ref0;
80 uint16_t *p_cost_ref1;
85 /* Take some shortcuts in intra search if intra is deemed unlikely */
91 int i_satd_i16x16_dir[7];
96 int i_satd_i8x8_dir[12][4];
100 int i_predict4x4[16];
105 int i_satd_i8x8chroma;
106 int i_satd_i8x8chroma_dir[4];
107 int i_predict8x8chroma;
109 /* II: Inter part P/B frame */
110 x264_mb_analysis_list_t l0;
111 x264_mb_analysis_list_t l1;
113     int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
114 int i_cost16x16direct;
116 int i_cost8x8direct[4];
125 int i_mb_partition16x8[2]; /* mb_partition_e */
126 int i_mb_partition8x16[2];
127 int i_mb_type16x8; /* mb_class_e */
130 int b_direct_available;
132 } x264_mb_analysis_t;
134 /* lambda = pow(2,qp/6-2) */
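/* e.g. qp=30: pow(2, 30/6 - 2) = 2^3 = 8, matching x264_lambda_tab[30] below. */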
135 const int x264_lambda_tab[52] = {
136 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
137 1, 1, 1, 1, /* 8-11 */
138 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
139 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
140 6, 7, 8, 9,10,11,13,14, /* 28-35 */
141 16,18,20,23,25,29,32,36, /* 36-43 */
142 40,45,51,57,64,72,81,91 /* 44-51 */
145 /* lambda2 = pow(lambda,2) * .9 * 256 */
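/* Using the unrounded lambda: e.g. qp=30, lambda = 8 -> 8*8 * .9 * 256 = 14745.6, which truncates to the qp=30 entry below. */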
146 const int x264_lambda2_tab[52] = {
147 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
148 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
149 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
150 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
151 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
152 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
153 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
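/* Fractional exp2 table: x264_exp2_lut[i] is approximately (2^(i/64) - 1) * 256, i.e. the fractional part of a base-2 exponential in Q8, for fixed-point use. */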
156 const uint8_t x264_exp2_lut[64] = {
157 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
158 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
159 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
160 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
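/* Fractional log2 table: x264_log2_lut[i] = log2(1 + i/128). */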
163 const float x264_log2_lut[128] = {
164 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
165 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
166 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
167 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
168 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
169 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
170 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
171 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
172 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
173 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
174 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
175 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
176 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
177 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
178 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
179 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
182 /* Avoid an int/float conversion. */
183 const float x264_log2_lz_lut[32] = {
184 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
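/* i.e. x264_log2_lz_lut[lz] = 31 - lz, so a clz() result maps straight to the integer part of log2;
 * presumably combined with x264_log2_lut above to build a float log2 without an int->float conversion. */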
187 // should the intra and inter lambdas be different?
188 // I'm just matching the behaviour of deadzone quant.
189 static const int x264_trellis_lambda2_tab[2][52] = {
190 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
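// e.g. qp=12, assuming LAMBDA_BITS == 4 (as the table values imply): .85*.85 * 2^(12/3 + 6) = 0.7225 * 1024 ~= 740, the qp=12 entry below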
191 { 46, 58, 73, 92, 117, 147,
192 185, 233, 294, 370, 466, 587,
193 740, 932, 1174, 1480, 1864, 2349,
194 2959, 3728, 4697, 5918, 7457, 9395,
195 11837, 14914, 18790, 23674, 29828, 37581,
196 47349, 59656, 75163, 94699, 119313, 150326,
197 189399, 238627, 300652, 378798, 477255, 601304,
198 757596, 954511, 1202608, 1515192, 1909022, 2405217,
199 3030384, 3818045, 4810435, 6060769 },
200 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
201 { 27, 34, 43, 54, 68, 86,
202 108, 136, 172, 216, 273, 343,
203 433, 545, 687, 865, 1090, 1374,
204 1731, 2180, 2747, 3461, 4361, 5494,
205 6922, 8721, 10988, 13844, 17442, 21976,
206 27688, 34885, 43953, 55377, 69771, 87906,
207 110755, 139543, 175813, 221511, 279087, 351627,
208 443023, 558174, 703255, 886046, 1116348, 1406511,
209 1772093, 2232697, 2813022, 3544186 }
212 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
213 16, 20, 25, 32, 40, 50,
214 64, 80, 101, 128, 161, 203,
215 256, 322, 406, 512, 645, 812,
216 1024, 1290, 1625, 2048, 2580, 3250,
217 4096, 5160, 6501, 8192, 10321, 13003,
218 16384, 20642, 26007, 32768, 41285, 52015,
222 /* TODO: calculate CABAC costs */
223 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
224 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
226 static const int i_mb_b16x8_cost_table[17] = {
227 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
229 static const int i_sub_mb_b_cost_table[13] = {
230 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
232 static const int i_sub_mb_p_cost_table[4] = {
236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
238 static uint16_t x264_cost_ref[92][3][33];
239 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
241 int x264_analyse_init_costs( x264_t *h, int qp )
244 int lambda = x264_lambda_tab[qp];
245 if( h->cost_mv[lambda] )
247 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
248 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
249 h->cost_mv[lambda] += 2*4*2048;
250 for( i = 0; i <= 2*4*2048; i++ )
252 h->cost_mv[lambda][-i] =
253 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
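/* i.e. lambda times a rough rate estimate for an mvd of magnitude i: ~2*log2(i+1) bits plus a small
 * constant (approximating an Exp-Golomb cost curve), rounded to the nearest integer. */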
255 x264_pthread_mutex_lock( &cost_ref_mutex );
256 for( i = 0; i < 3; i++ )
257 for( j = 0; j < 33; j++ )
258 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
259 x264_pthread_mutex_unlock( &cost_ref_mutex );
260 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
264 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
265 h->cost_mv_fpel[lambda][j] += 2*2048;
266 for( i = -2*2048; i < 2*2048; i++ )
267 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
275 void x264_analyse_free_costs( x264_t *h )
278 for( i = 0; i < 92; i++ )
281 x264_free( h->cost_mv[i] - 2*4*2048 );
282 if( h->cost_mv_fpel[i][0] )
283 for( j = 0; j < 4; j++ )
284 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
288 /* initialize an array of lambda*nbits for all possible mvs */
289 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
291 a->p_cost_mv = h->cost_mv[a->i_lambda];
292 a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
293 a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
296 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
298 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
300 /* mbrd == 1 -> RD mode decision */
301 /* mbrd == 2 -> RD refinement */
302 /* mbrd == 3 -> QPRD */
303 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
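/* i.e. with i = subme (minus 1 in B-slices): subme 6-7 -> mbrd 1, subme 8-9 -> mbrd 2, subme 10 -> mbrd 3. */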
305     /* conduct the analysis using this lambda and QP */
306 a->i_qp = h->mb.i_qp = i_qp;
307 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
309 a->i_lambda = x264_lambda_tab[i_qp];
310 a->i_lambda2 = x264_lambda2_tab[i_qp];
312 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
313 if( h->param.analyse.i_trellis )
315 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
316 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
317 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
318 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
320 h->mb.i_psy_rd_lambda = a->i_lambda;
321 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
322 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
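/* x264_chroma_lambda2_offset_tab[n] ~= 256 * 2^((n-12)/3): the lambda2 ratio implied by a luma/chroma QP gap of (n-12); 256 means no adjustment. */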
324 h->mb.i_me_method = h->param.analyse.i_me_method;
325 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
326 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
327 && h->mb.i_subpel_refine >= 5;
329 h->mb.b_transform_8x8 = 0;
330 h->mb.b_noise_reduction = 0;
336 a->i_satd_i8x8chroma = COST_MAX;
338 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
339 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
343 h->mb.b_lossless ? 0 :
345 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
347 /* II: Inter part P/B frame */
348 if( h->sh.i_type != SLICE_TYPE_I )
351 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
352 // limit motion search to a slightly smaller range than the theoretical limit,
353 // since the search may go a few iterations past its given range
354 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
356 /* Calculate max allowed MV range */
357 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
358 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
359 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
360 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
361 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
362 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
363 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
364 if( h->mb.i_mb_x == 0)
366 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
367 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
368 int thread_mvy_range = i_fmv_range;
370 if( h->param.i_threads > 1 )
372 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
373 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
374 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
376 x264_frame_t **fref = i ? h->fref1 : h->fref0;
377 int i_ref = i ? h->i_ref1 : h->i_ref0;
378 for( j=0; j<i_ref; j++ )
380 x264_frame_cond_wait( fref[j]->orig, thresh );
381 fref[j]->i_lines_completed = fref[j]->orig->i_lines_completed;
382 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
386 if( h->param.b_deterministic )
387 thread_mvy_range = h->param.analyse.i_mv_range_thread;
388 if( h->mb.b_interlaced )
389 thread_mvy_range >>= 1;
391 for( j=0; j<h->i_ref0; j++ )
393 if( h->sh.weight[j][0].weightfn )
395 x264_frame_t *frame = h->fref0[j];
396 int width = frame->i_width[0] + 2*PADH;
397 int i_padv = PADV << h->param.b_interlaced;
399 uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
401 height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
402 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
403 h->fenc->i_lines_weighted += height;
406 for( k = j; k < h->i_ref0; k++ )
407 if( h->sh.weight[k][0].weightfn )
409 uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
410 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
411 src + offset, frame->i_stride[0],
412 width, height, &h->sh.weight[k][0] );
420 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
421 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
422 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
423 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
424 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
425 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
426 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
432 a->l0.i_cost8x8 = COST_MAX;
434 for( i = 0; i < 4; i++ )
438 a->l0.i_cost4x8[i] = COST_MAX;
442 a->l0.i_cost8x16 = COST_MAX;
443 if( h->sh.i_type == SLICE_TYPE_B )
447 a->l1.i_cost8x8 = COST_MAX;
449 for( i = 0; i < 4; i++ )
454 a->i_cost8x8direct[i] = COST_MAX;
465 a->i_cost16x16direct =
468 a->i_cost8x16bi = COST_MAX;
471 /* Fast intra decision */
472 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
474 if( IS_INTRA( h->mb.i_mb_type_left )
475 || IS_INTRA( h->mb.i_mb_type_top )
476 || IS_INTRA( h->mb.i_mb_type_topleft )
477 || IS_INTRA( h->mb.i_mb_type_topright )
478 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
479 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
480 { /* intra is likely */ }
496 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
498 int b_top = i_neighbour & MB_TOP;
499 int b_left = i_neighbour & MB_LEFT;
500 if( b_top && b_left )
502 /* top and left available */
503 *mode++ = I_PRED_16x16_V;
504 *mode++ = I_PRED_16x16_H;
505 *mode++ = I_PRED_16x16_DC;
507 if( i_neighbour & MB_TOPLEFT )
509             /* top left available */
510 *mode++ = I_PRED_16x16_P;
517 *mode++ = I_PRED_16x16_DC_LEFT;
518 *mode++ = I_PRED_16x16_H;
524 *mode++ = I_PRED_16x16_DC_TOP;
525 *mode++ = I_PRED_16x16_V;
531 *mode = I_PRED_16x16_DC_128;
537 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
539 int b_top = i_neighbour & MB_TOP;
540 int b_left = i_neighbour & MB_LEFT;
541 if( b_top && b_left )
543 /* top and left available */
544 *mode++ = I_PRED_CHROMA_V;
545 *mode++ = I_PRED_CHROMA_H;
546 *mode++ = I_PRED_CHROMA_DC;
548 if( i_neighbour & MB_TOPLEFT )
550 /* top left available */
551 *mode++ = I_PRED_CHROMA_P;
558 *mode++ = I_PRED_CHROMA_DC_LEFT;
559 *mode++ = I_PRED_CHROMA_H;
565 *mode++ = I_PRED_CHROMA_DC_TOP;
566 *mode++ = I_PRED_CHROMA_V;
572 *mode = I_PRED_CHROMA_DC_128;
578 static void predict_4x4_mode_available( unsigned int i_neighbour,
579 int *mode, int *pi_count )
581 int b_top = i_neighbour & MB_TOP;
582 int b_left = i_neighbour & MB_LEFT;
583 if( b_top && b_left )
586 *mode++ = I_PRED_4x4_DC;
587 *mode++ = I_PRED_4x4_H;
588 *mode++ = I_PRED_4x4_V;
589 *mode++ = I_PRED_4x4_DDL;
590 if( i_neighbour & MB_TOPLEFT )
592 *mode++ = I_PRED_4x4_DDR;
593 *mode++ = I_PRED_4x4_VR;
594 *mode++ = I_PRED_4x4_HD;
597 *mode++ = I_PRED_4x4_VL;
598 *mode++ = I_PRED_4x4_HU;
602 *mode++ = I_PRED_4x4_DC_LEFT;
603 *mode++ = I_PRED_4x4_H;
604 *mode++ = I_PRED_4x4_HU;
609 *mode++ = I_PRED_4x4_DC_TOP;
610 *mode++ = I_PRED_4x4_V;
611 *mode++ = I_PRED_4x4_DDL;
612 *mode++ = I_PRED_4x4_VL;
617 *mode++ = I_PRED_4x4_DC_128;
622 /* For trellis=2 we need to do this for both DCT sizes; for trellis=1 we only need it for the chosen mode. */
623 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
625 ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
626 ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
627 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
630 if( do_both_dct || h->mb.b_transform_8x8 )
632 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
633 for( i = 0; i < 4; i++ )
634 h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
636 if( do_both_dct || !h->mb.b_transform_8x8 )
638 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
639 for( i = 0; i < 16; i++ )
640 h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
644 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
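/* Against an all-zero reference the DC Hadamard coefficient equals the plain SAD; after satd's >>1
 * (sa8d's >>2) normalization that is sad>>1 (sad>>2), which is what gets subtracted out below. */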
645 static inline void x264_mb_cache_fenc_satd( x264_t *h )
647 ALIGNED_16( static uint8_t zero[16] ) = {0};
649 int x, y, satd_sum = 0, sa8d_sum = 0;
650 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
651 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
652 if( !h->mb.i_psy_rd )
654 for( y = 0; y < 4; y++ )
655 for( x = 0; x < 4; x++ )
657 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
658 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
659 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
660 satd_sum += h->mb.pic.fenc_satd[y][x];
662 for( y = 0; y < 2; y++ )
663 for( x = 0; x < 2; x++ )
665 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
666 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
667 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
668 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
670 h->mb.pic.fenc_satd_sum = satd_sum;
671 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
674 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
680 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
682 uint8_t *p_dstc[2], *p_srcc[2];
684 if( a->i_satd_i8x8chroma < COST_MAX )
687 /* 8x8 prediction selection for chroma */
688 p_dstc[0] = h->mb.pic.p_fdec[1];
689 p_dstc[1] = h->mb.pic.p_fdec[2];
690 p_srcc[0] = h->mb.pic.p_fenc[1];
691 p_srcc[1] = h->mb.pic.p_fenc[2];
693 predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
694 a->i_satd_i8x8chroma = COST_MAX;
695 if( i_max == 4 && b_merged_satd )
697 int satdu[4], satdv[4];
698 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
699 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
700 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
701 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
702 satdu[I_PRED_CHROMA_P] =
703 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
704 satdv[I_PRED_CHROMA_P] =
705 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
707 for( i=0; i<i_max; i++ )
709 int i_mode = predict_mode[i];
710 int i_satd = satdu[i_mode] + satdv[i_mode]
711 + a->i_lambda * bs_size_ue(i_mode);
713 a->i_satd_i8x8chroma_dir[i] = i_satd;
714 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
719 for( i=0; i<i_max; i++ )
722 int i_mode = predict_mode[i];
724 /* we do the prediction */
725 if( h->mb.b_lossless )
726 x264_predict_lossless_8x8_chroma( h, i_mode );
729 h->predict_8x8c[i_mode]( p_dstc[0] );
730 h->predict_8x8c[i_mode]( p_dstc[1] );
733 /* we calculate the cost */
734 i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
735 p_srcc[0], FENC_STRIDE ) +
736 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
737 p_srcc[1], FENC_STRIDE ) +
738 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
740 a->i_satd_i8x8chroma_dir[i] = i_satd;
741 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
745 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
748 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
750 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
751 uint8_t *p_src = h->mb.pic.p_fenc[0];
752 uint8_t *p_dst = h->mb.pic.p_fdec[0];
757 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
759     /*---------------- Try all modes and calculate their scores ---------------*/
761 /* 16x16 prediction selection */
762 predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
764 if( b_merged_satd && i_max == 4 )
766 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
767 h->predict_16x16[I_PRED_16x16_P]( p_dst );
768 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
769 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
772 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
773 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
778 for( i = 0; i < i_max; i++ )
781 int i_mode = predict_mode[i];
783 if( h->mb.b_lossless )
784 x264_predict_lossless_16x16( h, i_mode );
786 h->predict_16x16[i_mode]( p_dst );
788 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
789 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
790 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
791 a->i_satd_i16x16_dir[i_mode] = i_satd;
795 if( h->sh.i_type == SLICE_TYPE_B )
796 /* cavlc mb type prefix */
797 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
798 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
801 /* 8x8 prediction selection */
802 if( flags & X264_ANALYSE_I8x8 )
804 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
805 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
806 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
808 h->mb.i_cbp_luma = 0;
809 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
811 // FIXME some bias like in i4x4?
812 if( h->sh.i_type == SLICE_TYPE_B )
813 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
815 for( idx = 0;; idx++ )
819 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
820 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
821 int i_best = COST_MAX;
822 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
824 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
825 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
827 if( b_merged_satd && i_max == 9 )
830 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
831 satd[i_pred_mode] -= 3 * a->i_lambda;
832 for( i=2; i>=0; i-- )
834 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
835 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
842 for( ; i<i_max; i++ )
845 int i_mode = predict_mode[i];
847 if( h->mb.b_lossless )
848 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
850 h->predict_8x8[i_mode]( p_dst_by, edge );
852 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
853 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
854 i_satd -= a->i_lambda * 3;
856 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
857 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
861 if( idx == 3 || i_cost > i_satd_thresh )
864 /* we need to encode this block now (for next ones) */
865 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
866 x264_mb_encode_i8x8( h, idx, a->i_qp );
868 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
873 a->i_satd_i8x8 = i_cost;
874 if( h->mb.i_skip_intra )
876 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
877 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
878 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
879 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
880 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
881 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
882 if( h->mb.i_skip_intra == 2 )
883 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
888 static const uint16_t cost_div_fix8[3] = {1024,512,341};
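/* 256*4/(idx+1): extrapolates the cost of the idx+1 blocks analysed before the early exit to an estimate for all four 8x8 blocks (Q8 fixed point). */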
889 a->i_satd_i8x8 = COST_MAX;
890 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
892 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
896 /* 4x4 prediction selection */
897 if( flags & X264_ANALYSE_I4x4 )
900 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
901 h->mb.i_cbp_luma = 0;
902 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
904 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
906 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
907 if( h->sh.i_type == SLICE_TYPE_B )
908 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
910 for( idx = 0;; idx++ )
912 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
913 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
914 int i_best = COST_MAX;
915 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
917 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
919 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
920 /* emulate missing topright samples */
921 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
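/* (multiplying the byte by 0x01010101 broadcasts the last available top sample into the four missing top-right positions) */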
923 if( b_merged_satd && i_max >= 6 )
926 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
927 satd[i_pred_mode] -= 3 * a->i_lambda;
928 for( i=2; i>=0; i-- )
929 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
935 for( ; i<i_max; i++ )
938 int i_mode = predict_mode[i];
939 if( h->mb.b_lossless )
940 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
942 h->predict_4x4[i_mode]( p_dst_by );
944 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
945 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
946 i_satd -= a->i_lambda * 3;
948 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
950 i_cost += i_best + 4 * a->i_lambda;
952 if( i_cost > i_satd_thresh || idx == 15 )
955 /* we need to encode this block now (for next ones) */
956 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
957 x264_mb_encode_i4x4( h, idx, a->i_qp );
959 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
963 a->i_satd_i4x4 = i_cost;
964 if( h->mb.i_skip_intra )
966 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
967 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
968 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
969 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
970 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
971 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
972 if( h->mb.i_skip_intra == 2 )
973 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
977 a->i_satd_i4x4 = COST_MAX;
981 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
983 if( a->i_satd_i16x16 <= i_satd_thresh )
985 h->mb.i_type = I_16x16;
986 x264_analyse_update_cache( h, a );
987 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
990 a->i_satd_i16x16 = COST_MAX;
992 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
994 h->mb.i_type = I_4x4;
995 x264_analyse_update_cache( h, a );
996 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
999 a->i_satd_i4x4 = COST_MAX;
1001 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
1003 h->mb.i_type = I_8x8;
1004 x264_analyse_update_cache( h, a );
1005 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1006 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1009 a->i_satd_i8x8 = COST_MAX;
1012 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1014 uint8_t *p_dst = h->mb.pic.p_fdec[0];
1016 int i, j, idx, x, y;
1017 int i_max, i_mode, i_thresh;
1018 uint64_t i_satd, i_best;
1019 int predict_mode[9];
1020 h->mb.i_skip_intra = 0;
1022 if( h->mb.i_type == I_16x16 )
1024 int old_pred_mode = a->i_predict16x16;
1025 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
1026 i_best = a->i_satd_i16x16;
1027 predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
1028 for( i = 0; i < i_max; i++ )
1030 int i_mode = predict_mode[i];
1031 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1033 h->mb.i_intra16x16_pred_mode = i_mode;
1034 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1035 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1039 /* RD selection for chroma prediction */
1040 predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
1043 i_thresh = a->i_satd_i8x8chroma * 5/4;
1045 for( i = j = 0; i < i_max; i++ )
1046 if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
1047 predict_mode[i] != a->i_predict8x8chroma )
1049 predict_mode[j++] = predict_mode[i];
1055 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1056 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1057         /* x264_intra_rd() was the last thing to encode, so the pixels and
1058          * coefs for the current chroma mode are still around; we only
1059          * have to recount the bits. */
1060 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1061 for( i = 0; i < i_max; i++ )
1063 i_mode = predict_mode[i];
1064 if( h->mb.b_lossless )
1065 x264_predict_lossless_8x8_chroma( h, i_mode );
1068 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1069 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1071 /* if we've already found a mode that needs no residual, then
1072 * probably any mode with a residual will be worse.
1073 * so avoid dct on the remaining modes to improve speed. */
1074 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1075 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1077 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1078 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1082 if( h->mb.i_type == I_4x4 )
1084         uint32_t pels[4] = {0}; // doesn't need initializing, just silences a gcc warning
1086 for( idx = 0; idx < 16; idx++ )
1088 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1089 i_best = COST_MAX64;
1091 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1093 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1094 /* emulate missing topright samples */
1095 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1097 for( i = 0; i < i_max; i++ )
1099 i_mode = predict_mode[i];
1100 if( h->mb.b_lossless )
1101 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1103 h->predict_4x4[i_mode]( p_dst_by );
1104 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1106 if( i_best > i_satd )
1108 a->i_predict4x4[idx] = i_mode;
1110 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1111 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1112 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1113 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1114 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1118 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1119 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1120 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1121 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1122 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1124 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1127 else if( h->mb.i_type == I_8x8 )
1129 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1130 for( idx = 0; idx < 4; idx++ )
1132 uint64_t pels_h = 0;
1134 uint16_t i_nnz[2] = {0}; //shut up gcc
1137 int cbp_luma_new = 0;
1138 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1140 i_best = COST_MAX64;
1144 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1145 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1146 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1148 for( i = 0; i < i_max; i++ )
1150 i_mode = predict_mode[i];
1151 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1153 if( h->mb.b_lossless )
1154 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1156 h->predict_8x8[i_mode]( p_dst_by, edge );
1157 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1158 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1160 if( i_best > i_satd )
1162 a->i_predict8x8[idx] = i_mode;
1163 cbp_luma_new = h->mb.i_cbp_luma;
1166 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1168 for( j=0; j<7; j++ )
1169 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1170 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1171 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1174 a->i_cbp_i8x8_luma = cbp_luma_new;
1175 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1177 for( j=0; j<7; j++ )
1178 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1179 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1180 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1182 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1187 #define LOAD_FENC( m, src, xoff, yoff) \
1188 (m)->p_cost_mv = a->p_cost_mv; \
1189 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1190 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1191 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1192 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1193 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1195 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1196 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1197 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1198 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1199 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1200 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1201 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1202 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1203 (m)->weight = weight_none; \
1206 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1207 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1208 (m)->weight = h->sh.weight[i_ref];
1210 #define REF_COST(list, ref) \
1211 (a->p_cost_ref##list[ref])
1213 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1217 ALIGNED_4( int16_t mvc[8][2] );
1218 int i_halfpel_thresh = INT_MAX;
1219 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1221 /* 16x16 Search on all ref frame */
1222 m.i_pixel = PIXEL_16x16;
1223 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1225 a->l0.me16x16.cost = INT_MAX;
1226 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1228 const int i_ref_cost = REF_COST( 0, i_ref );
1229 i_halfpel_thresh -= i_ref_cost;
1230 m.i_ref_cost = i_ref_cost;
1232 /* search with ref */
1233 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1234 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1236 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1237 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1238 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1240 /* early termination
1241 * SSD threshold would probably be better than SATD */
1244 && m.cost-m.cost_mv < 300*a->i_lambda
1245 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1246 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1247 && x264_macroblock_probe_pskip( h ) )
1249 h->mb.i_type = P_SKIP;
1250 x264_analyse_update_cache( h, a );
1251 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1255 m.cost += i_ref_cost;
1256 i_halfpel_thresh += i_ref_cost;
1258 if( m.cost < a->l0.me16x16.cost )
1259 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1261 /* save mv for predicting neighbors */
1262 CP32( a->l0.mvc[i_ref][0], m.mv );
1263 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1266 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1267 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1269 h->mb.i_type = P_L0;
1272 x264_mb_cache_fenc_satd( h );
1273 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) )
1275 h->mb.i_partition = D_16x16;
1276 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1277 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1278 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1279 h->mb.i_type = P_SKIP;
1284 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1288 uint8_t **p_fenc = h->mb.pic.p_fenc;
1289 int i_halfpel_thresh = INT_MAX;
1290 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1292 int i_maxref = h->mb.pic.i_fref[0]-1;
1294 h->mb.i_partition = D_8x8;
1296     /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1297 * than those used by the neighbors */
1298 if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1299 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1302 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1303 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1304 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1305 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1306 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1307 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
1310 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1311 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1313 for( i = 0; i < 4; i++ )
1315 x264_me_t *l0m = &a->l0.me8x8[i];
1319 m.i_pixel = PIXEL_8x8;
1321 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1322 l0m->cost = INT_MAX;
1323 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1325 const int i_ref_cost = REF_COST( 0, i_ref );
1326 i_halfpel_thresh -= i_ref_cost;
1327 m.i_ref_cost = i_ref_cost;
1329 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1330 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1332 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1333 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1334 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1336 m.cost += i_ref_cost;
1337 i_halfpel_thresh += i_ref_cost;
1338 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1340 if( m.cost < l0m->cost )
1341 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1343 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1344 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1346 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1347 are effectively zero. */
1348 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1349 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1352 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1353 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1354 /* P_8x8 ref0 has no ref cost */
1355 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1356 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1357 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1358 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1359 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1362 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1364 const int i_ref = a->l0.me16x16.i_ref;
1365 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1366 uint8_t **p_fenc = h->mb.pic.p_fenc;
1368 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1371 /* XXX Needed for x264_mb_predict_mv */
1372 h->mb.i_partition = D_8x8;
1375 CP32( mvc[0], a->l0.me16x16.mv );
1377 for( i = 0; i < 4; i++ )
1379 x264_me_t *m = &a->l0.me8x8[i];
1383 m->i_pixel = PIXEL_8x8;
1384 m->i_ref_cost = i_ref_cost;
1386 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1387 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1388 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1390 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1391 x264_me_search( h, m, mvc, i_mvc );
1393 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1395 CP32( mvc[i_mvc], m->mv );
1399 m->cost += i_ref_cost;
1400 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1401 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1404 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1405 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1406 /* theoretically this should include 4*ref_cost,
1407 * but 3 seems a better approximation of cabac. */
1408 if( h->param.b_cabac )
1409 a->l0.i_cost8x8 -= i_ref_cost;
1410 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1411 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1414 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1417 uint8_t **p_fenc = h->mb.pic.p_fenc;
1418 ALIGNED_4( int16_t mvc[3][2] );
1421 /* XXX Needed for x264_mb_predict_mv */
1422 h->mb.i_partition = D_16x8;
1424 for( i = 0; i < 2; i++ )
1426 x264_me_t *l0m = &a->l0.me16x8[i];
1427 const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1428 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1430 m.i_pixel = PIXEL_16x8;
1432 LOAD_FENC( &m, p_fenc, 0, 8*i );
1433 l0m->cost = INT_MAX;
1434 for( j = 0; j < i_ref8s; j++ )
1436 const int i_ref = ref8[j];
1437 const int i_ref_cost = REF_COST( 0, i_ref );
1438 m.i_ref_cost = i_ref_cost;
1440 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1441 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1442 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1443 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1445 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1446 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1448 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1449 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1450 x264_me_search( h, &m, mvc, 3 );
1452 m.cost += i_ref_cost;
1454 if( m.cost < l0m->cost )
1455 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1457 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1458 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1461 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1464 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1467 uint8_t **p_fenc = h->mb.pic.p_fenc;
1468 ALIGNED_4( int16_t mvc[3][2] );
1471 /* XXX Needed for x264_mb_predict_mv */
1472 h->mb.i_partition = D_8x16;
1474 for( i = 0; i < 2; i++ )
1476 x264_me_t *l0m = &a->l0.me8x16[i];
1477 const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1478 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1480 m.i_pixel = PIXEL_8x16;
1482 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1483 l0m->cost = INT_MAX;
1484 for( j = 0; j < i_ref8s; j++ )
1486 const int i_ref = ref8[j];
1487 const int i_ref_cost = REF_COST( 0, i_ref );
1488 m.i_ref_cost = i_ref_cost;
1490 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1491 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1492 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1494 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1495 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1497 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1498 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1499 x264_me_search( h, &m, mvc, 3 );
1501 m.cost += i_ref_cost;
1503 if( m.cost < l0m->cost )
1504 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1506 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1507 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1510 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1513 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1515 ALIGNED_8( uint8_t pix1[16*8] );
1516 uint8_t *pix2 = pix1+8;
1517 const int i_stride = h->mb.pic.i_stride[1];
1518 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1519 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1520 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1521 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1522 x264_weight_t *weight = h->sh.weight[i_ref];
1524 #define CHROMA4x4MC( width, height, me, x, y ) \
1525 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1526 if( weight[1].weightfn ) \
1527 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1528 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1529 if( weight[2].weightfn ) \
1530         weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1533 if( pixel == PIXEL_4x4 )
1535 x264_me_t *m = a->l0.me4x4[i8x8];
1536 CHROMA4x4MC( 2,2, m[0], 0,0 );
1537 CHROMA4x4MC( 2,2, m[1], 2,0 );
1538 CHROMA4x4MC( 2,2, m[2], 0,2 );
1539 CHROMA4x4MC( 2,2, m[3], 2,2 );
1541 else if( pixel == PIXEL_8x4 )
1543 x264_me_t *m = a->l0.me8x4[i8x8];
1544 CHROMA4x4MC( 4,2, m[0], 0,0 );
1545 CHROMA4x4MC( 4,2, m[1], 0,2 );
1549 x264_me_t *m = a->l0.me4x8[i8x8];
1550 CHROMA4x4MC( 2,4, m[0], 0,0 );
1551 CHROMA4x4MC( 2,4, m[1], 2,0 );
1554 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1555 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1558 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1560 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1561 uint8_t **p_fenc = h->mb.pic.p_fenc;
1562 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1565 /* XXX Needed for x264_mb_predict_mv */
1566 h->mb.i_partition = D_8x8;
1568 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1570 const int idx = 4*i8x8 + i4x4;
1571 const int x4 = block_idx_x[idx];
1572 const int y4 = block_idx_y[idx];
1573 const int i_mvc = (i4x4 == 0);
1575 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1577 m->i_pixel = PIXEL_4x4;
1579 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1580 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1581 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1583 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1584 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1586 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1588 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1589 a->l0.me4x4[i8x8][1].cost +
1590 a->l0.me4x4[i8x8][2].cost +
1591 a->l0.me4x4[i8x8][3].cost +
1592 REF_COST( 0, i_ref ) +
1593 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1594 if( h->mb.b_chroma_me )
1595 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1598 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1600 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1601 uint8_t **p_fenc = h->mb.pic.p_fenc;
1602 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1605 /* XXX Needed for x264_mb_predict_mv */
1606 h->mb.i_partition = D_8x8;
1608 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1610 const int idx = 4*i8x8 + 2*i8x4;
1611 const int x4 = block_idx_x[idx];
1612 const int y4 = block_idx_y[idx];
1613 const int i_mvc = (i8x4 == 0);
1615 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1617 m->i_pixel = PIXEL_8x4;
1619 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1620 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1621 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1623 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1624 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1626 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1628 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1629 REF_COST( 0, i_ref ) +
1630 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1631 if( h->mb.b_chroma_me )
1632 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1635 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1637 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1638 uint8_t **p_fenc = h->mb.pic.p_fenc;
1639 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1642 /* XXX Needed for x264_mb_predict_mv */
1643 h->mb.i_partition = D_8x8;
1645 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1647 const int idx = 4*i8x8 + i4x8;
1648 const int x4 = block_idx_x[idx];
1649 const int y4 = block_idx_y[idx];
1650 const int i_mvc = (i4x8 == 0);
1652 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1654 m->i_pixel = PIXEL_4x8;
1656 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1657 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1658 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1660 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1661 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1663 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1665 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1666 REF_COST( 0, i_ref ) +
1667 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1668 if( h->mb.b_chroma_me )
1669 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1672 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1674 /* Assumes that fdec still contains the results of
1675 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1677 uint8_t **p_fenc = h->mb.pic.p_fenc;
1678 uint8_t **p_fdec = h->mb.pic.p_fdec;
1681 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1682 for( i = 0; i < 4; i++ )
1684 const int x = (i&1)*8;
1685 const int y = (i>>1)*8;
1686 a->i_cost16x16direct +=
1687 a->i_cost8x8direct[i] =
1688 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1691 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1695 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1697 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1698 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1699 uint8_t *src0, *src1;
1700 int stride0 = 16, stride1 = 16;
1704 ALIGNED_4( int16_t mvc[9][2] );
1705 int i_halfpel_thresh = INT_MAX;
1706 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1708 /* 16x16 Search on all ref frame */
1709 m.i_pixel = PIXEL_16x16;
1710 m.weight = weight_none;
1712 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1715 a->l0.me16x16.cost = INT_MAX;
1716 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1718 /* search with ref */
1719 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1720 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1721 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1722 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1725 m.cost += REF_COST( 0, i_ref );
1727 if( m.cost < a->l0.me16x16.cost )
1729 a->l0.i_ref = i_ref;
1730 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1733 /* save mv for predicting neighbors */
1734 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1736 a->l0.me16x16.i_ref = a->l0.i_ref;
1738 /* subtract ref cost, so we don't have to add it for the other MB types */
1739 a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
1742 i_halfpel_thresh = INT_MAX;
1743 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1744 a->l1.me16x16.cost = INT_MAX;
1745 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1747 /* search with ref */
1748 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1749 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1750 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1751 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1754 m.cost += REF_COST( 1, i_ref );
1756 if( m.cost < a->l1.me16x16.cost )
1758 a->l1.i_ref = i_ref;
1759 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1762 /* save mv for predicting neighbors */
1763 CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1765 a->l1.me16x16.i_ref = a->l1.i_ref;
1767 /* subtract ref cost, so we don't have to add it for the other MB types */
1768 a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1770 /* Set global ref, needed for other modes? */
1771 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1772 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1774 /* get cost of BI mode */
1775 src0 = h->mc.get_ref( pix0, &stride0,
1776 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1777 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
1778 src1 = h->mc.get_ref( pix1, &stride1,
1779 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1780 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
1782 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1784 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1785 + REF_COST( 0, a->l0.i_ref )
1786 + REF_COST( 1, a->l1.i_ref )
1787 + a->l0.me16x16.cost_mv
1788 + a->l1.me16x16.cost_mv;
1791 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1792 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1793 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1796 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1798 const int x = 2*(i%2);
1799 const int y = 2*(i/2);
1801 switch( h->mb.i_sub_partition[i] )
1804 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1807 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1808 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1811 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1812 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1815 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1816 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1817 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1818 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1821 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1826 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1827 if( x264_mb_partition_listX_table[0][part] ) \
1829 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1830 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1834 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1835 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1837 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1839 if( x264_mb_partition_listX_table[1][part] ) \
1841 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1842 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1846 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1847 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1849 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1852 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1856 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1858 x264_mb_load_mv_direct8x8( h, i );
1861 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1862 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1863 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1868 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1871 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1873 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1875 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1877 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_8( uint8_t pix[2][8*8] );
    int i, l;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( i = 0; i < 4; i++ )
    {
        const int x8 = i%2;
        const int y8 = i/2;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];

            m->i_pixel = PIXEL_8x8;

            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );

            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, weight_none );
            i_part_cost_bi += m->cost_mv;
            /* FIXME: ref cost */
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}
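/* For each 8x8 block the cheapest of L0, L1, BI and (when available) DIRECT is
 * recorded in i_sub_partition[] and accumulated into i_cost8x8bi; caching the
 * winning MVs immediately matters because the next block's MV prediction
 * depends on them. */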
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[2][2] );
    int i, l;

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        uint8_t *src[2];

        /* TODO: check only the list(s) that were used in b8x8? */
        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me16x8[i];

            m->i_pixel = PIXEL_16x8;

            LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );

            CP32( mvc[0], lX->me8x8[2*i].mv );
            CP32( mvc[1], lX->me8x8[2*i+1].mv );

            x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 16, 8, weight_none );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}
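/* The i_mb_type16x8 computation above assumes the mb_partition_e layout from the
 * common headers, where (D_L0_8x8>>2)==0, (D_L1_8x8>>2)==1 and (D_BI_8x8>>2)==2;
 * B_L0_L0 + 3*top + bottom then enumerates the nine {L0,L1,BI}x{L0,L1,BI}
 * 16x8 macroblock types. The 8x16 variant below uses the same trick. */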
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_8( uint8_t pix[2][8*16] );
    ALIGNED_4( int16_t mvc[2][2] );
    int i, l;

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x16[i];

            m->i_pixel = PIXEL_8x16;

            LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );

            CP32( mvc[0], lX->me8x8[i].mv );
            CP32( mvc[1], lX->me8x8[i+2].mv );

            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 16, weight_none );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;
        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = i_satd * 5/4;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 <= thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 <= thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 <= thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            int i;
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                h->mb.i_sub_partition[i] = btype;
                x264_mb_cache_mv_p8x8( h, a, i );
            }
        }

        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}
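/* Only partitions whose SATD-based cost came within 5/4 of the best inter/intra
 * score are re-evaluated with full rate-distortion (x264_rd_cost_mb); anything
 * worse is excluded by forcing its cost to COST_MAX. */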
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;

    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}
static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
    int i;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
                x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
            break;
        case D_16x8:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
            break;
        case D_8x16:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
            break;
        case D_8x8:
            for( i=0; i<4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
            break;
    }
}
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        int i_cost4, i_cost8;
        /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );
        i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}
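/* The 8x8 transform is chosen when the SA8D score (an 8x8 Hadamard measure) of
 * the motion-compensated residual beats its SATD score (4x4 Hadamard), which is
 * a rough prediction of which transform size will code the residual cheaper. */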
static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
    {
        int i_rd8;
        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed, but the score for comparison already includes chroma */
        i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
            h->mb.b_transform_8x8 ^= 1;
    }
}
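/* When flipping the transform size wins, *i_satd is rescaled by the RD ratio so
 * that later SATD-based thresholds remain consistent with the improved RD score. */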
/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, direction, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;
        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
            cost = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );

            /* We can't assume that the costs are monotonic over QPs.
             * Tie case-as-failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}
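/* The QP search above walks away from the current QP in both directions, stops
 * after 'threshold' consecutive non-improving steps (and as soon as raising QP
 * has emptied the CBP), always samples the previous MB's QP since an unchanged
 * QP is the cheapest qp_delta to code, and finally re-checks the 4x4/8x8
 * transform decision at the winning QP. */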
/*****************************************************************************
 * x264_macroblock_analyse:
 *****************************************************************************/
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;
    int i;

    h->mb.i_qp = x264_ratecontrol_qp( h );
    if( h->param.rc.i_aq_mode )
    {
        x264_adaptive_quant( h );
        /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
         * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
        if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
            h->mb.i_qp = h->mb.i_last_qp;
    }

    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
        if( analysis.i_mbrd )
            x264_mb_cache_fenc_satd( h );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;

        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        /* Fast P_SKIP detection */
        analysis.b_try_pskip = 0;
        if( h->param.analyse.b_fast_pskip )
        {
            if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
                // FIXME don't need to check this if the reference frame is done
                {}
            else if( h->param.analyse.i_subpel_refine >= 3 )
                analysis.b_try_pskip = 1;
            else if( h->mb.i_mb_type_left == P_SKIP ||
                     h->mb.i_mb_type_top == P_SKIP ||
                     h->mb.i_mb_type_topleft == P_SKIP ||
                     h->mb.i_mb_type_topright == P_SKIP )
                b_skip = x264_macroblock_probe_pskip( h );
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_thresh16x8;
            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

            x264_mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )
                return;

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

            /* Now do 16x8/8x16 */
            i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
            {
                x264_mb_analyse_inter_p16x8( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                x264_mb_analyse_inter_p8x16( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }
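            /* Heuristic: 16x8/8x16 are only searched when the 8x8 analysis came
             * within i_thresh16x8 (the MV bit cost of two of the 8x8 sub-blocks)
             * of the 16x16 cost; otherwise rectangular partitions are assumed
             * unlikely to win. */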
            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                int i8x8;
                i_cost = 0;
                for( i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;

                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }
            if( h->mb.b_chroma_me )
            {
                x264_mb_analyse_intra_chroma( h, &analysis );
                x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    int i8x8;
                    x264_analyse_update_cache( h, &analysis );
                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_cache_fenc_satd( h );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                b_skip = x264_macroblock_probe_bskip( h );
            }
        }
        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                x264_mb_analyse_inter_b8x8( h, &analysis );
                if( analysis.i_cost8x8bi < i_cost )
                {
                    i_type = B_8x8;
                    i_partition = D_8x8;
                    i_cost = analysis.i_cost8x8bi;

                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
                        h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b16x8( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
                                     i_type, analysis.i_mb_type16x8,
                                     i_partition, D_16x8 );
                    }
                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
                        h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b8x16( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
                                     i_type, analysis.i_mb_type8x16,
                                     i_partition, D_8x16 );
                    }
                }
            }
            if( analysis.i_mbrd )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( i=0; i<4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }
            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );
            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                        x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
                }
                else if( i_partition == D_16x8 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                    }
                }
            }
        }
    }
    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}
/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    int i;

    switch( h->mb.i_type )
    {
        case I_4x4:
            for( i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;
        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;
        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }
#ifndef NDEBUG
    if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
    {
        int l;
        for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                fprintf(stderr, "mb type: %d \n", h->mb.i_type);
                fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
                        h->mb.cache.mv[l][x264_scan8[15]][0],
                        h->mb.cache.mv[l][x264_scan8[15]][1] );
                fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
                fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                fprintf(stderr, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"