]> git.sesse.net Git - x264/blobdiff - encoder/encoder.c
Clip log2_max_frame_num
[x264] / encoder / encoder.c
index 03faab47b525151f0d3e30d522b1e3eb7dda4abe..080a259e28fe4ed052d8a83477b2ab6a93dff929 100644 (file)
@@ -88,7 +88,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
     sh->pps = pps;
 
     sh->i_first_mb  = 0;
-    sh->i_last_mb   = h->sps->i_mb_width * h->sps->i_mb_height;
+    sh->i_last_mb   = h->mb.i_mb_count - 1;
     sh->i_pps_id    = pps->i_id;
 
     sh->i_frame_num = i_frame;
@@ -175,12 +175,12 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
     bs_write_ue( s, sh->i_type + 5 );   /* same type things */
     bs_write_ue( s, sh->i_pps_id );
-    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num );
+    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num & ((1<<sh->sps->i_log2_max_frame_num)-1) );
 
     if( !sh->sps->b_frame_mbs_only )
     {
         bs_write1( s, sh->b_field_pic );
-        if ( sh->b_field_pic )
+        if( sh->b_field_pic )
             bs_write1( s, sh->b_bottom_field );
     }
 
@@ -191,7 +191,7 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
     if( sh->sps->i_poc_type == 0 )
     {
-        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb );
+        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb & ((1<<sh->sps->i_log2_max_poc_lsb)-1) );
         if( sh->pps->b_pic_order && !sh->b_field_pic )
         {
             bs_write_se( s, sh->i_delta_poc_bottom );
@@ -338,9 +338,9 @@ fail:
 static int x264_validate_parameters( x264_t *h )
 {
 #ifdef HAVE_MMX
-    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    if( !(x264_cpu_detect() & X264_CPU_SSE) )
     {
-        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
         x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
         return -1;
     }
@@ -364,7 +364,7 @@ static int x264_validate_parameters( x264_t *h )
         return -1;
     }
 
-    if( h->param.i_threads == 0 )
+    if( h->param.i_threads == X264_THREADS_AUTO )
         h->param.i_threads = x264_cpu_num_processors() * 3/2;
     h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
     if( h->param.i_threads > 1 )
@@ -466,6 +466,23 @@ static int x264_validate_parameters( x264_t *h )
                   h->param.i_width, h->param.i_height );
     }
 
+    int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
+    h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
+    h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
+    h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
+    if( h->param.b_interlaced && h->param.i_slice_max_size )
+    {
+        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+        h->param.i_slice_max_size = 0;
+    }
+    if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+    {
+        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+        h->param.i_slice_max_mbs = 0;
+    }
+    if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
+        h->param.i_slice_count = 0;
+
     h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
     if( h->param.i_keyint_max <= 0 )
         h->param.i_keyint_max = 1;
@@ -478,6 +495,8 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     }
     h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_BFRAME_MAX );
+    if( h->param.i_keyint_max == 1 )
+        h->param.i_bframe = 0;
     h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
     h->param.b_bframe_pyramid = h->param.b_bframe_pyramid && h->param.i_bframe > 1;
     if( !h->param.i_bframe )
@@ -494,12 +513,19 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.i_lookahead = X264_MIN( h->param.rc.i_lookahead, X264_MAX( h->param.i_keyint_max, bufsize*fps ) );
     }
 
+    h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 );
+    if( !h->param.rc.i_lookahead || h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 )
+        h->param.rc.b_mb_tree = 0;
     if( h->param.rc.b_stat_read )
         h->param.rc.i_lookahead = 0;
-    else if( !h->param.rc.i_lookahead )
-        h->param.rc.b_mb_tree = 0;
-    if( h->param.rc.f_qcompress == 1 )
-        h->param.rc.b_mb_tree = 0;
+#ifdef HAVE_PTHREAD
+    if( h->param.i_sync_lookahead )
+        h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
+    if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
+        h->param.i_sync_lookahead = 0;
+#else
+    h->param.i_sync_lookahead = 0;
+#endif
 
     h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
                                 && h->param.i_bframe
@@ -717,11 +743,11 @@ static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
 /****************************************************************************
  * x264_encoder_open:
  ****************************************************************************/
-x264_t *x264_encoder_open   ( x264_param_t *param )
+x264_t *x264_encoder_open( x264_param_t *param )
 {
     x264_t *h;
     char buf[1000], *p;
-    int i;
+    int i, qp, i_slicetype_length;
 
     CHECKED_MALLOCZERO( h, sizeof(x264_t) );
 
@@ -748,7 +774,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
 
     /* Init x264_t */
-    h->i_frame = 0;
+    h->i_frame = -1;
     h->i_frame_num = 0;
     h->i_idr_pic_id = 0;
 
@@ -756,10 +782,12 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
 
     h->pps = &h->pps_array[0];
-    x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps);
+    x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps );
 
     x264_validate_levels( h, 1 );
 
+    h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
+
     if( x264_cqm_init( h ) < 0 )
         goto fail;
 
@@ -772,8 +800,10 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         h->frames.i_delay = h->param.i_bframe;
     if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
         h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
+    i_slicetype_length = h->frames.i_delay;
     h->frames.i_delay += h->param.i_threads - 1;
     h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
+    h->frames.i_delay += h->param.i_sync_lookahead;
 
     h->frames.i_max_ref0 = h->param.i_frame_reference;
     h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
@@ -784,19 +814,22 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
           || h->param.i_bframe_adaptive
           || h->param.i_scenecut_threshold
           || h->param.rc.b_mb_tree );
-    h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+    h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
     h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 
     h->frames.i_last_idr = - h->param.i_keyint_max;
     h->frames.i_input    = 0;
-    h->frames.last_nonb  = NULL;
+
+    CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
+    /* Allocate room for max refs plus a few extra just in case. */
+    CHECKED_MALLOCZERO( h->frames.unused[1], (h->param.i_threads + 20) * sizeof(x264_frame_t *) );
+    CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+                        + h->param.i_threads + 3) * sizeof(x264_frame_t *) );
 
     h->i_ref0 = 0;
     h->i_ref1 = 0;
 
-    h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
-
-    x264_rdo_init( );
+    x264_rdo_init();
 
     /* init CPU functions */
     x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
@@ -835,32 +868,49 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );
 
+    for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+        if( x264_analyse_init_costs( h, qp ) )
+            goto fail;
+    if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+        goto fail;
+    if( h->cost_mv[1][2013] != 24 )
+    {
+        x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
+        goto fail;
+    }
+
     h->out.i_nal = 0;
     h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
         * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
           : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
 
+    CHECKED_MALLOC( h->nal_buffer, h->out.i_bitstream * 3/2 + 4 );
+    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+
     h->thread[0] = h;
     h->i_thread_num = 0;
-    for( i = 1; i < h->param.i_threads; i++ )
+    for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
 
     for( i = 0; i < h->param.i_threads; i++ )
     {
         if( i > 0 )
             *h->thread[i] = *h;
-        h->thread[i]->fdec = x264_frame_pop_unused( h );
+        h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
         if( !h->thread[i]->fdec )
             goto fail;
         CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
+        /* Start each thread with room for 8 NAL units; it'll realloc later if needed. */
+        CHECKED_MALLOC( h->thread[i]->out.nal, 8*sizeof(x264_nal_t) );
+        h->thread[i]->out.i_nals_allocated = 8;
         if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
             goto fail;
     }
 
-    if( x264_ratecontrol_new( h ) < 0 )
+    if( x264_lookahead_init( h, i_slicetype_length ) )
         goto fail;
 
-    if( x264_lowres_context_alloc( h ) )
+    if( x264_ratecontrol_new( h ) < 0 )
         goto fail;
 
     if( h->param.psz_dump_yuv )
@@ -929,6 +979,9 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
         COPY( analyse.b_transform_8x8 );
     if( h->frames.i_max_ref1 > 1 )
         COPY( b_bframe_pyramid );
+    COPY( i_slice_max_size );
+    COPY( i_slice_max_mbs );
+    COPY( i_slice_count );
 #undef COPY
 
     mbcmp_init( h );
@@ -947,11 +1000,53 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
     nal->i_payload= 0;
     nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
 }
-static void x264_nal_end( x264_t *h )
+static int x264_nal_end( x264_t *h )
 {
     x264_nal_t *nal = &h->out.nal[h->out.i_nal];
     nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
     h->out.i_nal++;
+
+    /* if number of allocated nals is not enough, re-allocate a larger one. */
+    if( h->out.i_nal >= h->out.i_nals_allocated )
+    {
+        x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
+        if( !new_out )
+            return -1;
+        memcpy( new_out, h->out.nal, sizeof(x264_nal_t) * (h->out.i_nals_allocated) );
+        x264_free( h->out.nal );
+        h->out.nal = new_out;
+        h->out.i_nals_allocated *= 2;
+    }
+    return 0;
+}
+
+static int x264_encoder_encapsulate_nals( x264_t *h )
+{
+    int nal_size = 0, i;
+    for( i = 0; i < h->out.i_nal; i++ )
+        nal_size += h->out.nal[i].i_payload;
+
+    /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
+    if( h->nal_buffer_size < nal_size * 3/2 + h->out.i_nal * 4 )
+    {
+        uint8_t *buf = x264_malloc( nal_size * 2 + h->out.i_nal * 4 );
+        if( !buf )
+            return -1;
+        x264_free( h->nal_buffer );
+        h->nal_buffer = buf;
+    }
+
+    uint8_t *nal_buffer = h->nal_buffer;
+
+    for( i = 0; i < h->out.i_nal; i++ )
+    {
+        int size = x264_nal_encode( nal_buffer, h->param.b_annexb, &h->out.nal[i] );
+        h->out.nal[i].i_payload = size;
+        h->out.nal[i].p_payload = nal_buffer;
+        nal_buffer += size;
+    }
+
+    return nal_buffer - h->nal_buffer;
 }
 
 /****************************************************************************
@@ -959,35 +1054,39 @@ static void x264_nal_end( x264_t *h )
  ****************************************************************************/
 int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
 {
+    int frame_size = 0;
     /* init bitstream context */
     h->out.i_nal = 0;
     bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
 
-    /* Put SPS and PPS */
-    if( h->i_frame == 0 )
-    {
-        /* identify ourself */
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        if( x264_sei_version_write( h, &h->out.bs ) )
-            return -1;
-        x264_nal_end( h );
+    /* Write SEI, SPS and PPS. */
+    x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+    if( x264_sei_version_write( h, &h->out.bs ) )
+        return -1;
+    if( x264_nal_end( h ) )
+        return -1;
 
-        /* generate sequence parameters */
-        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
-        x264_sps_write( &h->out.bs, h->sps );
-        x264_nal_end( h );
+    /* generate sequence parameters */
+    x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+    x264_sps_write( &h->out.bs, h->sps );
+    if( x264_nal_end( h ) )
+        return -1;
+
+    /* generate picture parameters */
+    x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+    x264_pps_write( &h->out.bs, h->pps );
+    if( x264_nal_end( h ) )
+        return -1;
+    bs_flush( &h->out.bs );
+
+    frame_size = x264_encoder_encapsulate_nals( h );
 
-        /* generate picture parameters */
-        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-        x264_pps_write( &h->out.bs, h->pps );
-        x264_nal_end( h );
-    }
     /* now set output*/
     *pi_nal = h->out.i_nal;
     *pp_nal = &h->out.nal[0];
     h->out.i_nal = 0;
 
-    return 0;
+    return frame_size;
 }
 
 static inline void x264_reference_build_list( x264_t *h, int i_poc )
@@ -1139,39 +1238,23 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
 
 static inline int x264_reference_update( x264_t *h )
 {
-    int i;
-
-    if( h->fdec->i_frame >= 0 )
-        h->i_frame++;
-
     if( !h->fdec->b_kept_as_ref )
     {
         if( h->param.i_threads > 1 )
         {
             x264_frame_push_unused( h, h->fdec );
-            h->fdec = x264_frame_pop_unused( h );
+            h->fdec = x264_frame_pop_unused( h, 1 );
             if( !h->fdec )
                 return -1;
         }
         return 0;
     }
 
-    /* move lowres copy of the image to the ref frame */
-    for( i = 0; i < 4; i++)
-    {
-        XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] );
-        XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] );
-    }
-
-    /* adaptive B decision needs a pointer, since it can't use the ref lists */
-    if( h->sh.i_type != SLICE_TYPE_B )
-        h->frames.last_nonb = h->fdec;
-
     /* move frame in the buffer */
     x264_frame_push( h->frames.reference, h->fdec );
     if( h->frames.reference[h->frames.i_max_dpb] )
         x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
-    h->fdec = x264_frame_pop_unused( h );
+    h->fdec = x264_frame_pop_unused( h, 1 );
     if( !h->fdec )
         return -1;
     return 0;
@@ -1228,10 +1311,13 @@ static int x264_slice_write( x264_t *h )
 {
     int i_skip;
     int mb_xy, i_mb_x, i_mb_y;
-    int i, i_list, i_ref;
-
-    /* init stats */
-    memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
+    int i, i_list, i_ref, i_skip_bak = 0; /* Shut up GCC. */
+    bs_t bs_bak;
+    x264_cabac_t cabac_bak;
+    uint8_t cabac_prevbyte_bak = 0; /* Shut up GCC. */
+    /* Assume no more than 3 bytes of NALU escaping. */
+    int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-3-NALU_OVERHEAD)*8 : INT_MAX;
+    int starting_bits = bs_pos(&h->out.bs);
 
     /* Slice */
     x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
@@ -1254,22 +1340,35 @@ static int x264_slice_write( x264_t *h )
     i_mb_x = h->sh.i_first_mb % h->sps->i_mb_width;
     i_skip = 0;
 
-    while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) < h->sh.i_last_mb )
+    while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) <= h->sh.i_last_mb )
     {
         int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
+        if( h->param.i_slice_max_size > 0 )
+        {
+            /* We don't need the contexts because flushing the CABAC encoder has no context
+             * dependency and macroblocks are only re-encoded in the case where a slice is
+             * ended (and thus the content of all contexts are thrown away). */
+            if( h->param.b_cabac )
+            {
+                memcpy( &cabac_bak, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
+                /* x264's CABAC writer modifies the previous byte during carry, so it has to be
+                 * backed up. */
+                cabac_prevbyte_bak = h->cabac.p[-1];
+            }
+            else
+            {
+                bs_bak = h->out.bs;
+                i_skip_bak = i_skip;
+            }
+        }
 
-        if( i_mb_x == 0 )
+        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
             x264_fdec_filter_row( h, i_mb_y );
 
         /* load cache */
         x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
 
-        /* analyse parameters
-         * Slice I: choose I_4x4 or I_16x16 mode
-         * Slice P: choose between using P mode or intra (4x4 or 16x16)
-         * */
-        if( x264_macroblock_analyse( h ) )
-            return -1;
+        x264_macroblock_analyse( h );
 
         /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
         x264_macroblock_encode( h );
@@ -1306,6 +1405,37 @@ static int x264_slice_write( x264_t *h )
             }
         }
 
+        int total_bits = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
+        int mb_size = total_bits - mb_spos;
+
+        /* We'll just re-encode this last macroblock if we go over the max slice size. */
+        if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb )
+        {
+            if( mb_xy != h->sh.i_first_mb )
+            {
+                if( h->param.b_cabac )
+                {
+                    memcpy( &h->cabac, &cabac_bak, offsetof(x264_cabac_t, f8_bits_encoded) );
+                    h->cabac.p[-1] = cabac_prevbyte_bak;
+                }
+                else
+                {
+                    h->out.bs = bs_bak;
+                    i_skip = i_skip_bak;
+                }
+                h->mb.b_reencode_mb = 1;
+                h->sh.i_last_mb = mb_xy-1;
+                break;
+            }
+            else
+            {
+                h->sh.i_last_mb = mb_xy;
+                h->mb.b_reencode_mb = 0;
+            }
+        }
+        else
+            h->mb.b_reencode_mb = 0;
+
 #if VISUALIZE
         if( h->param.b_visualize )
             x264_visualize_mb( h );
@@ -1316,38 +1446,52 @@ static int x264_slice_write( x264_t *h )
 
         /* accumulate mb stats */
         h->stat.frame.i_mb_count[h->mb.i_type]++;
-        if( !IS_SKIP(h->mb.i_type) && !IS_INTRA(h->mb.i_type) && !IS_DIRECT(h->mb.i_type) )
+        if( h->param.i_log_level >= X264_LOG_INFO )
         {
-            if( h->mb.i_partition != D_8x8 )
-                h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
-            else
-                for( i = 0; i < 4; i++ )
-                    h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
-            if( h->param.i_frame_reference > 1 )
-                for( i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
+            if( !IS_SKIP(h->mb.i_type) && !IS_INTRA(h->mb.i_type) && !IS_DIRECT(h->mb.i_type) )
+            {
+                if( h->mb.i_partition != D_8x8 )
+                    h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
+                else
                     for( i = 0; i < 4; i++ )
-                    {
-                        i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ];
-                        if( i_ref >= 0 )
-                            h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
-                    }
-        }
-        if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma )
-        {
-            int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
-                       + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
-            int b_intra = IS_INTRA(h->mb.i_type);
-            h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
-            h->stat.frame.i_mb_cbp[!b_intra + 2] += h->mb.i_cbp_chroma >= 1;
-            h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma == 2;
-        }
-        if( h->mb.i_cbp_luma && !IS_INTRA(h->mb.i_type) )
-        {
-            h->stat.frame.i_mb_count_8x8dct[0] ++;
-            h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
+                        h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
+                if( h->param.i_frame_reference > 1 )
+                    for( i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
+                        for( i = 0; i < 4; i++ )
+                        {
+                            i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ];
+                            if( i_ref >= 0 )
+                                h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
+                        }
+            }
+            if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma )
+            {
+                int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
+                           + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
+                int b_intra = IS_INTRA(h->mb.i_type);
+                h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
+                h->stat.frame.i_mb_cbp[!b_intra + 2] += h->mb.i_cbp_chroma >= 1;
+                h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma == 2;
+            }
+            if( h->mb.i_cbp_luma && !IS_INTRA(h->mb.i_type) )
+            {
+                h->stat.frame.i_mb_count_8x8dct[0] ++;
+                h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
+            }
+            if( IS_INTRA(h->mb.i_type) && h->mb.i_type != I_PCM )
+            {
+                if( h->mb.i_type == I_16x16 )
+                    h->stat.frame.i_mb_pred_mode[0][h->mb.i_intra16x16_pred_mode]++;
+                else if( h->mb.i_type == I_8x8 )
+                    for( i = 0; i < 16; i += 4 )
+                        h->stat.frame.i_mb_pred_mode[1][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
+                else //if( h->mb.i_type == I_4x4 )
+                    for( i = 0; i < 16; i++ )
+                        h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
+            }
         }
 
-        x264_ratecontrol_mb( h, bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac) - mb_spos );
+        x264_ratecontrol_mb( h, mb_size );
 
         if( h->sh.b_mbaff )
         {
@@ -1356,7 +1500,7 @@ static int x264_slice_write( x264_t *h )
         }
         else
             i_mb_x++;
-        if(i_mb_x == h->sps->i_mb_width)
+        if( i_mb_x == h->sps->i_mb_width )
         {
             i_mb_y++;
             i_mb_x = 0;
@@ -1374,17 +1518,20 @@ static int x264_slice_write( x264_t *h )
             bs_write_ue( &h->out.bs, i_skip );  /* last skip run */
         /* rbsp_slice_trailing_bits */
         bs_rbsp_trailing( &h->out.bs );
+        bs_flush( &h->out.bs );
     }
+    if( x264_nal_end( h ) )
+        return -1;
 
-    x264_nal_end( h );
-
-    x264_fdec_filter_row( h, h->sps->i_mb_height );
+    if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
+    {
+        h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
+                                  + (h->out.i_nal*NALU_OVERHEAD * 8)
+                                  - h->stat.frame.i_tex_bits
+                                  - h->stat.frame.i_mv_bits;
+        x264_fdec_filter_row( h, h->sps->i_mb_height );
+    }
 
-    /* Compute misc bits */
-    h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
-                              + NALU_OVERHEAD * 8
-                              - h->stat.frame.i_tex_bits
-                              - h->stat.frame.i_mv_bits;
     return 0;
 }
 
@@ -1412,12 +1559,14 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
 {
     if( dst == src )
         return;
-    memcpy( &dst->stat.i_slice_count, &src->stat.i_slice_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
+    memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
 }
 
 static void *x264_slices_write( x264_t *h )
 {
-    int i_frame_size;
+    int i_slice_num = 0;
+    if( h->param.i_sync_lookahead )
+        x264_lower_thread_priority( 10 );
 
 #ifdef HAVE_MMX
     /* Misalign mask has to be set separately for each thread. */
@@ -1431,9 +1580,27 @@ static void *x264_slices_write( x264_t *h )
             return (void *)-1;
 #endif
 
-    if( x264_stack_align( x264_slice_write, h ) )
-        return (void *)-1;
-    i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
+    /* init stats */
+    memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
+    h->mb.b_reencode_mb = 0;
+    while( h->sh.i_first_mb < h->mb.i_mb_count )
+    {
+        h->sh.i_last_mb = h->mb.i_mb_count - 1;
+        if( h->param.i_slice_max_mbs )
+            h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
+        else if( h->param.i_slice_count )
+        {
+            x264_emms();
+            i_slice_num++;
+            double height = h->sps->i_mb_height >> h->param.b_interlaced;
+            int width = h->sps->i_mb_width << h->param.b_interlaced;
+            h->sh.i_last_mb = (int)(height * i_slice_num / h->param.i_slice_count + 0.5) * width - 1;
+        }
+        h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, h->mb.i_mb_count - 1 );
+        if( x264_stack_align( x264_slice_write, h ) )
+            return (void *)-1;
+        h->sh.i_first_mb = h->sh.i_last_mb + 1;
+    }
 
 #if VISUALIZE
     if( h->param.b_visualize )
@@ -1443,7 +1610,6 @@ static void *x264_slices_write( x264_t *h )
     }
 #endif
 
-    h->out.i_frame_size = i_frame_size;
     return (void *)0;
 }
 
@@ -1502,7 +1668,7 @@ int     x264_encoder_encode( x264_t *h,
     if( pic_in != NULL )
     {
         /* 1: Copy the picture to a frame and move it to a buffer */
-        x264_frame_t *fenc = x264_frame_pop_unused( h );
+        x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
         if( !fenc )
             return -1;
 
@@ -1515,8 +1681,6 @@ int     x264_encoder_encode( x264_t *h,
 
         fenc->i_frame = h->frames.i_input++;
 
-        x264_frame_push( h->frames.next, fenc );
-
         if( h->frames.b_have_lowres )
             x264_frame_init_lowres( h, fenc );
 
@@ -1528,55 +1692,34 @@ int     x264_encoder_encode( x264_t *h,
         else if( h->param.rc.i_aq_mode )
             x264_adaptive_quant_frame( h, fenc );
 
+        /* 2: Place the frame into the queue for its slice type decision */
+        x264_lookahead_put_frame( h, fenc );
+
         if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
         {
-            /* Nothing yet to encode */
-            /* waiting for filling bframe buffer */
+            /* Nothing yet to encode, waiting for filling of buffers */
             pic_out->i_type = X264_TYPE_AUTO;
             return 0;
         }
     }
-
-    if( h->frames.current[0] == NULL )
+    else
     {
-        int bframes = 0;
-        /* 2: Select frame types */
-        if( h->frames.next[0] == NULL )
-        {
-            if( x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ) < 0 )
-                return -1;
-            return 0;
-        }
+        /* signal kills for lookahead thread */
+        h->lookahead->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+    }
 
-        x264_stack_align( x264_slicetype_decide, h );
+    h->i_frame++;
+    /* 3: The picture is analyzed in the lookahead */
+    if( !h->frames.current[0] )
+        x264_lookahead_get_frames( h );
 
-        /* 3: move some B-frames and 1 non-B to encode queue */
-        while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
-            bframes++;
-        x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) );
-        /* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */
-        if( h->param.b_bframe_pyramid && bframes > 1 )
-        {
-            x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] );
-            mid->i_type = X264_TYPE_BREF;
-            x264_frame_push( h->frames.current, mid );
-            bframes--;
-        }
-        while( bframes-- )
-            x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
-    }
+    if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
+        return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
 
     /* ------------------- Get frame to be encoded ------------------------- */
     /* 4: get picture to encode */
     h->fenc = x264_frame_shift( h->frames.current );
-    if( h->fenc == NULL )
-    {
-        /* Nothing yet to encode (ex: waiting for I/P with B frames) */
-        /* waiting for filling bframe buffer */
-        pic_out->i_type = X264_TYPE_AUTO;
-        return 0;
-    }
-
     if( h->fenc->param )
     {
         x264_encoder_reconfig( h, h->fenc->param );
@@ -1587,6 +1730,7 @@ int     x264_encoder_encode( x264_t *h,
     if( h->fenc->i_type == X264_TYPE_IDR )
     {
         h->frames.i_last_idr = h->fenc->i_frame;
+        h->i_frame_num = 0;
     }
 
     /* ------------------- Setup frame context ----------------------------- */
@@ -1638,22 +1782,9 @@ int     x264_encoder_encode( x264_t *h,
     /* build ref list 0/1 */
     x264_reference_build_list( h, h->fdec->i_poc );
 
-    /* Init the rate control */
-    x264_ratecontrol_start( h, h->fenc->i_qpplus1 );
-    i_global_qp = x264_ratecontrol_qp( h );
-
-    pic_out->i_qpplus1 =
-    h->fdec->i_qpplus1 = i_global_qp + 1;
-
     if( h->sh.i_type == SLICE_TYPE_B )
         x264_macroblock_bipred_init( h );
 
-    /* ------------------------ Create slice header  ----------------------- */
-    x264_slice_init( h, i_nal_type, i_global_qp );
-
-    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
-        h->i_frame_num++;
-
     /* ---------------------- Write the bitstream -------------------------- */
     /* Init bitstream context */
     h->out.i_nal = 0;
@@ -1674,12 +1805,15 @@ int     x264_encoder_encode( x264_t *h,
         x264_nal_start(h, NAL_AUD, NAL_PRIORITY_DISPOSABLE);
         bs_write(&h->out.bs, 3, pic_type);
         bs_rbsp_trailing(&h->out.bs);
-        x264_nal_end(h);
+        if( x264_nal_end( h ) )
+            return -1;
     }
 
     h->i_nal_type = i_nal_type;
     h->i_nal_ref_idc = i_nal_ref_idc;
 
+    int overhead = NALU_OVERHEAD;
+
     /* Write SPS and PPS */
     if( i_nal_type == NAL_SLICE_IDR && h->param.b_repeat_headers )
     {
@@ -1689,20 +1823,40 @@ int     x264_encoder_encode( x264_t *h,
             x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
             if( x264_sei_version_write( h, &h->out.bs ) )
                 return -1;
-            x264_nal_end( h );
+            if( x264_nal_end( h ) )
+                return -1;
+            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
         }
 
         /* generate sequence parameters */
         x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
         x264_sps_write( &h->out.bs, h->sps );
-        x264_nal_end( h );
+        if( x264_nal_end( h ) )
+            return -1;
+        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
 
         /* generate picture parameters */
         x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
         x264_pps_write( &h->out.bs, h->pps );
-        x264_nal_end( h );
+        if( x264_nal_end( h ) )
+            return -1;
+        overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
     }
 
+    /* Init the rate control */
+    /* FIXME: Include slice header bit cost. */
+    x264_ratecontrol_start( h, h->fenc->i_qpplus1, overhead*8 );
+    i_global_qp = x264_ratecontrol_qp( h );
+
+    pic_out->i_qpplus1 =
+    h->fdec->i_qpplus1 = i_global_qp + 1;
+
+    /* ------------------------ Create slice header  ----------------------- */
+    x264_slice_init( h, i_nal_type, i_global_qp );
+
+    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
+        h->i_frame_num++;
+
     /* Write frame */
     if( h->param.i_threads > 1 )
     {
@@ -1721,7 +1875,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
                                    x264_nal_t **pp_nal, int *pi_nal,
                                    x264_picture_t *pic_out )
 {
-    int i, i_list;
+    int i, j, i_list, frame_size;
     char psz_message[80];
 
     if( h->b_thread_active )
@@ -1743,6 +1897,9 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     /* End bitstream, set output  */
     *pi_nal = h->out.i_nal;
     *pp_nal = h->out.nal;
+
+    frame_size = x264_encoder_encapsulate_nals( h );
+
     h->out.i_nal = 0;
 
     /* Set output picture properties */
@@ -1765,21 +1922,18 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 
     /* update rc */
     x264_emms();
-    if( x264_ratecontrol_end( h, h->out.i_frame_size * 8 ) < 0 )
+    if( x264_ratecontrol_end( h, frame_size * 8 ) < 0 )
         return -1;
 
-    /* restore CPU state (before using float again) */
-    x264_emms();
-
     x264_noise_reduction_update( thread_current );
 
     /* ---------------------- Compute/Print statistics --------------------- */
     x264_thread_sync_stat( h, h->thread[0] );
 
     /* Slice stat */
-    h->stat.i_slice_count[h->sh.i_type]++;
-    h->stat.i_slice_size[h->sh.i_type] += h->out.i_frame_size + NALU_OVERHEAD;
-    h->stat.f_slice_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
+    h->stat.i_frame_count[h->sh.i_type]++;
+    h->stat.i_frame_size[h->sh.i_type] += frame_size;
+    h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
 
     for( i = 0; i < X264_MBTYPE_MAX; i++ )
         h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
@@ -1789,6 +1943,9 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
         h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i];
     for( i = 0; i < 6; i++ )
         h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i];
+    for( i = 0; i < 3; i++ )
+        for( j = 0; j < 13; j++ )
+            h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];
     if( h->sh.i_type != SLICE_TYPE_I )
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 32; i++ )
@@ -1852,7 +2009,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
               h->stat.frame.i_mb_count_i,
               h->stat.frame.i_mb_count_p,
               h->stat.frame.i_mb_count_skip,
-              h->out.i_frame_size,
+              frame_size,
               psz_message );
 
     // keep stats all in one place
@@ -1881,7 +2038,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     if( h->param.psz_dump_yuv )
         x264_frame_dump( h );
 
-    return 0;
+    return frame_size;
 }
 
 static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_pcm, char *intra )
@@ -1908,6 +2065,8 @@ void    x264_encoder_close  ( x264_t *h )
                    || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
                    || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
 
+    x264_lookahead_delete( h );
+
     for( i=0; i<h->param.i_threads; i++ )
     {
         // don't strictly have to wait for the other threads, but it's simpler than canceling them
@@ -1919,6 +2078,17 @@ void    x264_encoder_close  ( x264_t *h )
         }
     }
 
+    if( h->param.i_threads > 1 )
+    {
+        x264_t *thread_prev;
+
+        thread_prev = h->thread[ h->i_thread_phase % h->param.i_threads ];
+        x264_thread_sync_ratecontrol( h, thread_prev, h );
+        x264_thread_sync_ratecontrol( thread_prev, thread_prev, h );
+        h->i_frame = thread_prev->i_frame + 1 - h->param.i_threads;
+    }
+    h->i_frame++;
+
     /* Slices used and PSNR */
     for( i=0; i<5; i++ )
     {
@@ -1926,17 +2096,17 @@ void    x264_encoder_close  ( x264_t *h )
         static const char *slice_name[] = { "P", "B", "I", "SP", "SI" };
         int i_slice = slice_order[i];
 
-        if( h->stat.i_slice_count[i_slice] > 0 )
+        if( h->stat.i_frame_count[i_slice] > 0 )
         {
-            const int i_count = h->stat.i_slice_count[i_slice];
+            const int i_count = h->stat.i_frame_count[i_slice];
             if( h->param.analyse.b_psnr )
             {
                 x264_log( h, X264_LOG_INFO,
-                          "slice %s:%-5d Avg QP:%5.2f  size:%6.0f  PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
+                          "frame %s:%-5d Avg QP:%5.2f  size:%6.0f  PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
                           slice_name[i_slice],
                           i_count,
-                          h->stat.f_slice_qp[i_slice] / i_count,
-                          (double)h->stat.i_slice_size[i_slice] / i_count,
+                          h->stat.f_frame_qp[i_slice] / i_count,
+                          (double)h->stat.i_frame_size[i_slice] / i_count,
                           h->stat.f_psnr_mean_y[i_slice] / i_count, h->stat.f_psnr_mean_u[i_slice] / i_count, h->stat.f_psnr_mean_v[i_slice] / i_count,
                           h->stat.f_psnr_average[i_slice] / i_count,
                           x264_psnr( h->stat.i_ssd_global[i_slice], i_count * i_yuv_size ) );
@@ -1944,15 +2114,15 @@ void    x264_encoder_close  ( x264_t *h )
             else
             {
                 x264_log( h, X264_LOG_INFO,
-                          "slice %s:%-5d Avg QP:%5.2f  size:%6.0f\n",
+                          "frame %s:%-5d Avg QP:%5.2f  size:%6.0f\n",
                           slice_name[i_slice],
                           i_count,
-                          h->stat.f_slice_qp[i_slice] / i_count,
-                          (double)h->stat.i_slice_size[i_slice] / i_count );
+                          h->stat.f_frame_qp[i_slice] / i_count,
+                          (double)h->stat.i_frame_size[i_slice] / i_count );
             }
         }
     }
-    if( h->param.i_bframe && h->stat.i_slice_count[SLICE_TYPE_P] )
+    if( h->param.i_bframe && h->stat.i_frame_count[SLICE_TYPE_P] )
     {
         char *p = buf;
         int den = 0;
@@ -1972,17 +2142,17 @@ void    x264_encoder_close  ( x264_t *h )
         }
 
     /* MB types used */
-    if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 )
     {
         int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I];
-        double i_count = h->stat.i_slice_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
+        double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
         x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
         x264_log( h, X264_LOG_INFO, "mb I  %s\n", buf );
     }
-    if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
     {
         int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P];
-        double i_count = h->stat.i_slice_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
+        double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
         int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P];
         x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
         x264_log( h, X264_LOG_INFO,
@@ -1995,10 +2165,10 @@ void    x264_encoder_close  ( x264_t *h )
                   i_mb_size[PIXEL_4x4] / (i_count*4),
                   i_mb_count[P_SKIP] / i_count );
     }
-    if( h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
     {
         int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B];
-        double i_count = h->stat.i_slice_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
+        double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
         double i_mb_list_count;
         int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B];
         int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
@@ -2031,7 +2201,7 @@ void    x264_encoder_close  ( x264_t *h )
 
     x264_ratecontrol_summary( h );
 
-    if( h->stat.i_slice_count[SLICE_TYPE_I] + h->stat.i_slice_count[SLICE_TYPE_P] + h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
     {
 #define SUM3(p) (p[SLICE_TYPE_I] + p[SLICE_TYPE_P] + p[SLICE_TYPE_B])
 #define SUM3b(p,o) (p[SLICE_TYPE_I][o] + p[SLICE_TYPE_P][o] + p[SLICE_TYPE_B][o])
@@ -2039,35 +2209,72 @@ void    x264_encoder_close  ( x264_t *h )
         int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 )
                                  + SUM3b( h->stat.i_mb_count, I_16x16 );
         int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM);
-        const int i_count = h->stat.i_slice_count[SLICE_TYPE_I] +
-                            h->stat.i_slice_count[SLICE_TYPE_P] +
-                            h->stat.i_slice_count[SLICE_TYPE_B];
+        const int i_count = h->stat.i_frame_count[SLICE_TYPE_I] +
+                            h->stat.i_frame_count[SLICE_TYPE_P] +
+                            h->stat.i_frame_count[SLICE_TYPE_B];
         int64_t i_mb_count = i_count * h->mb.i_mb_count;
         float fps = (float) h->param.i_fps_num / h->param.i_fps_den;
-        float f_bitrate = fps * SUM3(h->stat.i_slice_size) / i_count / 125;
+        float f_bitrate = fps * SUM3(h->stat.i_frame_size) / i_count / 125;
 
         if( h->pps->b_transform_8x8_mode )
         {
-            x264_log( h, X264_LOG_INFO, "8x8 transform  intra:%.1f%%  inter:%.1f%%\n",
-                      100. * i_i8x8 / i_intra,
-                      100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
+            buf[0] = 0;
+            if( h->stat.i_mb_count_8x8dct[0] )
+                sprintf( buf, " inter:%.1f%%", 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
+            x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
         }
 
         if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
-            && h->stat.i_slice_count[SLICE_TYPE_B] )
+            && h->stat.i_frame_count[SLICE_TYPE_B] )
         {
-            x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%%  temporal:%.1f%%\n",
-                      h->stat.i_direct_frames[1] * 100. / h->stat.i_slice_count[SLICE_TYPE_B],
-                      h->stat.i_direct_frames[0] * 100. / h->stat.i_slice_count[SLICE_TYPE_B] );
+            x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%% temporal:%.1f%%\n",
+                      h->stat.i_direct_frames[1] * 100. / h->stat.i_frame_count[SLICE_TYPE_B],
+                      h->stat.i_direct_frames[0] * 100. / h->stat.i_frame_count[SLICE_TYPE_B] );
         }
 
-        x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra:%.1f%% %.1f%% %.1f%% inter:%.1f%% %.1f%% %.1f%%\n",
+        buf[0] = 0;
+        if( i_mb_count != i_all_intra )
+            sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%",
+                     h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
+                     h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)  ),
+                     h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
+        x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra: %.1f%% %.1f%% %.1f%%%s\n",
                   h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4),
                   h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra  ),
-                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra  ),
-                  h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
-                  h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)  ),
-                  h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
+                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra  ), buf );
+
+        int64_t fixed_pred_modes[3][9] = {{0}};
+        int64_t sum_pred_modes[3] = {0};
+        for( i = 0; i <= I_PRED_16x16_DC_128; i++ )
+        {
+            fixed_pred_modes[0][x264_mb_pred_mode16x16_fix[i]] += h->stat.i_mb_pred_mode[0][i];
+            sum_pred_modes[0] += h->stat.i_mb_pred_mode[0][i];
+        }
+        if( sum_pred_modes[0] )
+            x264_log( h, X264_LOG_INFO, "i16 v,h,dc,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
+                      fixed_pred_modes[0][0] * 100.0 / sum_pred_modes[0],
+                      fixed_pred_modes[0][1] * 100.0 / sum_pred_modes[0],
+                      fixed_pred_modes[0][2] * 100.0 / sum_pred_modes[0],
+                      fixed_pred_modes[0][3] * 100.0 / sum_pred_modes[0] );
+        for( i = 1; i <= 2; i++ )
+        {
+            for( j = 0; j <= I_PRED_8x8_DC_128; j++ )
+            {
+                fixed_pred_modes[i][x264_mb_pred_mode4x4_fix(j)] += h->stat.i_mb_pred_mode[i][j];
+                sum_pred_modes[i] += h->stat.i_mb_pred_mode[i][j];
+            }
+            if( sum_pred_modes[i] )
+                x264_log( h, X264_LOG_INFO, "i%d v,h,dc,ddl,ddr,vr,hd,vl,hu: %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", (3-i)*4,
+                          fixed_pred_modes[i][0] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][1] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][2] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][3] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][4] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][5] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][6] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][7] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] );
+        }
 
         for( i_list = 0; i_list < 2; i_list++ )
         {
@@ -2087,7 +2294,7 @@ void    x264_encoder_close  ( x264_t *h )
                     continue;
                 for( i = 0; i <= i_max; i++ )
                     p += sprintf( p, " %4.1f%%", 100. * h->stat.i_mb_count_ref[i_slice][i_list][i] / i_den );
-                x264_log( h, X264_LOG_INFO, "ref %c L%d %s\n", "PB"[i_slice], i_list, buf );
+                x264_log( h, X264_LOG_INFO, "ref %c L%d:%s\n", "PB"[i_slice], i_list, buf );
             }
         }
 
@@ -2109,7 +2316,7 @@ void    x264_encoder_close  ( x264_t *h )
                       f_bitrate );
         }
         else
-            x264_log( h, X264_LOG_INFO, "kb/s:%.1f\n", f_bitrate );
+            x264_log( h, X264_LOG_INFO, "kb/s:%.2f\n", f_bitrate );
     }
 
     /* rc */
@@ -2123,25 +2330,15 @@ void    x264_encoder_close  ( x264_t *h )
 
     x264_cqm_delete( h );
 
+    x264_analyse_free_costs( h );
+
     if( h->param.i_threads > 1)
         h = h->thread[ h->i_thread_phase % h->param.i_threads ];
 
     /* frames */
-    for( i = 0; h->frames.current[i]; i++ )
-    {
-        assert( h->frames.current[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.current[i] );
-    }
-    for( i = 0; h->frames.next[i]; i++ )
-    {
-        assert( h->frames.next[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.next[i] );
-    }
-    for( i = 0; h->frames.unused[i]; i++ )
-    {
-        assert( h->frames.unused[i]->i_reference_count == 0 );
-        x264_frame_delete( h->frames.unused[i] );
-    }
+    x264_frame_delete_list( h->frames.unused[0] );
+    x264_frame_delete_list( h->frames.unused[1] );
+    x264_frame_delete_list( h->frames.current );
 
     h = h->thread[0];
 
@@ -2164,6 +2361,7 @@ void    x264_encoder_close  ( x264_t *h )
 
         x264_macroblock_cache_end( h->thread[i] );
         x264_free( h->thread[i]->out.p_bitstream );
+        x264_free( h->thread[i]->out.nal);
         x264_free( h->thread[i] );
     }
 }
@@ -2180,7 +2378,12 @@ int x264_encoder_delayed_frames( x264_t *h )
     h = h->thread[ h->i_thread_phase % h->param.i_threads ];
     for( i=0; h->frames.current[i]; i++ )
         delayed_frames++;
-    for( i=0; h->frames.next[i]; i++ )
-        delayed_frames++;
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    delayed_frames += h->lookahead->ifbuf.i_size + h->lookahead->next.i_size + h->lookahead->ofbuf.i_size;
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
     return delayed_frames;
 }