else
p->i_threads = atoi(value);
}
+ OPT("sliced-threads")
+ p->b_sliced_threads = atobool(value);
OPT("sync-lookahead")
{
if( !strcmp(value, "auto") )
s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
s += sprintf( s, " threads=%d", p->i_threads );
+ s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
if( p->i_slice_count )
s += sprintf( s, " slices=%d", p->i_slice_count );
if( p->i_slice_max_size )
x264_pthread_t thread_handle;
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
+ int i_threadslice_start; /* first row in this thread slice */
+ int i_threadslice_end; /* row after the end of this thread slice */
/* bitstream output */
struct
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
- /* fdec: fenc:
- * yyyyyyy
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * uuu vvv UUVV
- * uUU vVV UUVV
- * uUU vVV
- */
- h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
- h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
- h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-
- h->mb.i_neighbour4[6] =
- h->mb.i_neighbour4[9] =
- h->mb.i_neighbour4[12] =
- h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
- h->mb.i_neighbour4[3] =
- h->mb.i_neighbour4[7] =
- h->mb.i_neighbour4[11] =
- h->mb.i_neighbour4[13] =
- h->mb.i_neighbour4[15] =
- h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
-
- int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
- int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
- int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
- int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
- ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
- int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
- CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
-
return 0;
fail: return -1;
}
x264_free( h->mb.skipbp );
x264_free( h->mb.cbp );
x264_free( h->mb.qp );
- x264_free( h->scratch_buffer );
}
void x264_macroblock_slice_init( x264_t *h )
{
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
setup_inverse_delta_pocs( h );
+
+ /* fdec: fenc:
+ * yyyyyyy
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * yYYYY YYYY
+ * uuu vvv UUVV
+ * uUU vVV UUVV
+ * uUU vVV
+ */
+ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+ h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+ h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+ h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+
+ h->mb.i_neighbour4[6] =
+ h->mb.i_neighbour4[9] =
+ h->mb.i_neighbour4[12] =
+ h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
+ h->mb.i_neighbour4[3] =
+ h->mb.i_neighbour4[7] =
+ h->mb.i_neighbour4[11] =
+ h->mb.i_neighbour4[13] =
+ h->mb.i_neighbour4[15] =
+ h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
}
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
const int i_pix_offset = h->mb.b_interlaced
? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
: w * (i_mb_x + i_mb_y * i_stride);
+ const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
+ &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
- const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k;
if( h->mb.b_interlaced )
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
h->mb.pic.p_fenc_plane[i], i_stride2, w );
- memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+ if( i_mb_y > 0 )
+ memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+ else
+ memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
if( h->mb.b_interlaced || h->mb.b_reencode_mb )
- {
- const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
- }
for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
}
}
+/* Extend the weighted copies of the list-0 reference plane down to line `end`.
+ *
+ * Looks for the first list-0 reference that has a luma weight function.  For
+ * that reference it works out how many lines beyond h->fenc->i_lines_weighted
+ * are needed to cover 16 + end + vertical padding (capped at the padded plane
+ * height), advances i_lines_weighted by that amount, and runs
+ * x264_weight_scale_plane() over those lines for every reference from that
+ * index onwards that has a weight function.  The source pixels always come
+ * from the first weighted reference found.  Only one reference is ever
+ * processed as the "source"; the function returns right after it. */
+void x264_analyse_weight_frame( x264_t *h, int end )
+{
+    int i_ref;
+    for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+    {
+        /* Skip references without a luma weight function. */
+        if( !h->sh.weight[i_ref][0].weightfn )
+            continue;
+
+        x264_frame_t *ref = h->fref0[i_ref];
+        int i_padv = PADV << h->param.b_interlaced;
+        int padded_width = ref->i_width[0] + 2*PADH;
+        /* Top-left corner of the padded source plane. */
+        uint8_t *src_base = ref->filtered[0] - ref->i_stride[0]*i_padv - PADH;
+        /* Lines still to be weighted, and where they start in the plane. */
+        int lines = X264_MIN( 16 + end + i_padv, ref->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
+        int start = h->fenc->i_lines_weighted*ref->i_stride[0];
+        int i_dup;
+
+        h->fenc->i_lines_weighted += lines;
+        if( lines )
+        {
+            for( i_dup = i_ref; i_dup < h->i_ref0; i_dup++ )
+            {
+                if( !h->sh.weight[i_dup][0].weightfn )
+                    continue;
+                uint8_t *dst_base = h->fenc->weighted[i_dup] - h->fenc->i_stride[0]*i_padv - PADH;
+                x264_weight_scale_plane( h, dst_base + start, ref->i_stride[0],
+                                         src_base + start, ref->i_stride[0],
+                                         padded_width, lines, &h->sh.weight[i_dup][0] );
+            }
+        }
+        return;
+    }
+}
+
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
- if( h->mb.i_mb_x == 0)
+ if( h->mb.i_mb_x == 0 )
{
int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
int thread_mvy_range = i_fmv_range;
- if( h->param.i_threads > 1 )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
int thresh = pix_y + h->param.analyse.i_mv_range_thread;
if( h->mb.b_interlaced )
thread_mvy_range >>= 1;
- for( j=0; j<h->i_ref0; j++ )
- {
- if( h->sh.weight[j][0].weightfn )
- {
- x264_frame_t *frame = h->fref0[j];
- int width = frame->i_width[0] + 2*PADH;
- int i_padv = PADV << h->param.b_interlaced;
- int offset, height;
- uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
- int k;
- height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
- offset = h->fenc->i_lines_weighted*frame->i_stride[0];
- h->fenc->i_lines_weighted += height;
- if( height )
- {
- for( k = j; k < h->i_ref0; k++ )
- if( h->sh.weight[k][0].weightfn )
- {
- uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
- x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
- src + offset, frame->i_stride[0],
- width, height, &h->sh.weight[k][0] );
- }
- }
- break;
- }
- }
+ x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
}
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
{
h->mb.i_type = P_SKIP;
x264_analyse_update_cache( h, a );
- assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+ assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
return;
}
}
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
- assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+ assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
h->mb.i_type = P_L0;
if( a->i_mbrd )
analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip )
{
- if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
// FIXME don't need to check this if the reference frame is done
{}
else if( h->param.analyse.i_subpel_refine >= 3 )
{
h->mb.i_type = P_SKIP;
h->mb.i_partition = D_16x16;
- assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+ assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
}
else
{
}
#ifndef NDEBUG
- if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads && !IS_INTRA(h->mb.i_type) )
{
int l;
for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
int x264_analyse_init_costs( x264_t *h, int qp );
void x264_analyse_free_costs( x264_t *h );
+void x264_analyse_weight_frame( x264_t *h, int end );
void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
#include "analyse.h"
#include "ratecontrol.h"
#include "macroblock.h"
+#include "me.h"
#if VISUALIZE
#include "common/visualize.h"
x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
h->param.i_threads = 1;
#endif
+        /* Avoid absurdly small thread slices as they can reduce performance
+         * and VBV compliance. Capped at an arbitrary 4 macroblock rows per thread.
+         * The cap is floored at 1: for frames shorter than 64 pixels the
+         * division below yields 0, which would set i_threads to 0 and later
+         * be used as a divisor when the frame is split into thread slices. */
+        if( h->param.b_sliced_threads )
+        {
+            int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
+            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
+        }
}
+ else
+ h->param.b_sliced_threads = 0;
if( h->param.b_interlaced )
{
h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
- h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
- h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
- h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
- if( h->param.b_interlaced && h->param.i_slice_max_size )
- {
- x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
- h->param.i_slice_max_size = 0;
- }
- if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+ if( h->param.b_sliced_threads )
+ h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
+ else
{
- x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
- h->param.i_slice_max_mbs = 0;
+ h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
+ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
+ h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
+ if( h->param.b_interlaced && h->param.i_slice_max_size )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+ h->param.i_slice_max_size = 0;
+ }
+ if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+ h->param.i_slice_max_mbs = 0;
+ }
+ if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
+ h->param.i_slice_count = 0;
}
- if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
- h->param.i_slice_count = 0;
h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
if( h->param.i_keyint_max <= 0 )
#ifdef HAVE_PTHREAD
if( h->param.i_sync_lookahead )
h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
- if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
+ if( h->param.rc.b_stat_read || h->param.i_threads == 1 || h->param.b_sliced_threads )
h->param.i_sync_lookahead = 0;
#else
h->param.i_sync_lookahead = 0;
if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
- if( h->param.i_threads > 1 )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
int r = h->param.analyse.i_mv_range_thread;
int r2;
if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
i_slicetype_length = h->frames.i_delay;
- h->frames.i_delay += h->param.i_threads - 1;
+ if( !h->param.b_sliced_threads )
+ h->frames.i_delay += h->param.i_threads - 1;
h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
h->frames.i_delay += h->param.i_sync_lookahead;
for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+ if( x264_lookahead_init( h, i_slicetype_length ) )
+ goto fail;
+
for( i = 0; i < h->param.i_threads; i++ )
{
+ int init_nal_count = h->param.i_slice_count + 3;
+ int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
if( i > 0 )
*h->thread[i] = *h;
- h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
- if( !h->thread[i]->fdec )
- goto fail;
+
+ if( allocate_threadlocal_data )
+ {
+ h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
+ if( !h->thread[i]->fdec )
+ goto fail;
+ }
+ else
+ h->thread[i]->fdec = h->thread[0]->fdec;
+
CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
- /* Start each thread with room for 8 NAL units; it'll realloc later if needed. */
- CHECKED_MALLOC( h->thread[i]->out.nal, 8*sizeof(x264_nal_t) );
- h->thread[i]->out.i_nals_allocated = 8;
- if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
+ /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
+ CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
+ h->thread[i]->out.i_nals_allocated = init_nal_count;
+
+ if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
goto fail;
}
- if( x264_lookahead_init( h, i_slicetype_length ) )
- goto fail;
+ /* Allocate scratch buffer */
+ for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+ {
+ int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
+ int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+ int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+ int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+ ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
+ CHECKED_MALLOC( h->thread[i]->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
+ }
if( x264_ratecontrol_new( h ) < 0 )
goto fail;
COPY( b_deblocking_filter );
COPY( i_deblocking_filter_alphac0 );
COPY( i_deblocking_filter_beta );
- COPY( analyse.intra );
COPY( analyse.inter );
+ COPY( analyse.intra );
COPY( analyse.i_direct_mv_pred );
/* Scratch buffer prevents me_range from being increased for esa/tesa */
if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
nal->i_payload= 0;
nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
}
-static int x264_nal_end( x264_t *h )
+/* if number of allocated nals is not enough, re-allocate a larger one. */
+static int x264_nal_check_buffer( x264_t *h )
{
- x264_nal_t *nal = &h->out.nal[h->out.i_nal];
- nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
- h->out.i_nal++;
-
- /* if number of allocated nals is not enough, re-allocate a larger one. */
if( h->out.i_nal >= h->out.i_nals_allocated )
{
x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
}
return 0;
}
+/* Close the NAL unit currently being written: record its payload size as the
+ * distance from its payload start to the current byte position of the output
+ * bitstream, bump the NAL counter, and make sure the NAL array still has room
+ * for the next one.  Returns whatever x264_nal_check_buffer() returns. */
+static int x264_nal_end( x264_t *h )
+{
+    x264_nal_t *cur = h->out.nal + h->out.i_nal;
+    cur->i_payload = (h->out.p_bitstream + bs_pos( &h->out.bs ) / 8) - cur->p_payload;
+    h->out.i_nal += 1;
+
+    return x264_nal_check_buffer( h );
+}
static int x264_encoder_encapsulate_nals( x264_t *h )
{
if( min_y < 0 )
return;
- if( !b_end )
+ if( !b_end && !h->param.b_sliced_threads )
{
int i, j;
for( j=0; j<=h->sh.b_mbaff; j++ )
}
}
- if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref )
- {
+ if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref && !h->param.b_sliced_threads )
x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
- }
min_y = X264_MAX( min_y*16-8, 0 );
max_y = b_end ? h->param.i_height : mb_y*16-8;
int i, j;
if( !h->fdec->b_kept_as_ref )
{
- if( h->param.i_threads > 1 )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
x264_frame_push_unused( h, h->fdec );
h->fdec = x264_frame_pop_unused( h, 1 );
{
/* Nothing to do ? */
}
-
- x264_macroblock_slice_init( h );
}
static int x264_slice_write( x264_t *h )
x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
/* Slice header */
+ x264_macroblock_slice_init( h );
x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
if( h->param.b_cabac )
{
}
}
- if( i_mb_x == 0 && !h->mb.b_reencode_mb )
+ if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
x264_fdec_filter_row( h, i_mb_y );
/* load cache */
+ (h->out.i_nal*NALU_OVERHEAD * 8)
- h->stat.frame.i_tex_bits
- h->stat.frame.i_mv_bits;
- x264_fdec_filter_row( h, h->sps->i_mb_height );
+ if( !h->param.b_sliced_threads )
+ x264_fdec_filter_row( h, h->sps->i_mb_height );
}
return 0;
static void x264_thread_sync_context( x264_t *dst, x264_t *src )
{
- x264_frame_t **f;
if( dst == src )
return;
// reference counting
+ x264_frame_t **f;
for( f = src->frames.reference; *f; f++ )
(*f)->i_reference_count++;
for( f = dst->frames.reference; *f; f++ )
static void *x264_slices_write( x264_t *h )
{
int i_slice_num = 0;
+ int last_thread_mb = h->sh.i_last_mb;
if( h->param.i_sync_lookahead )
x264_lower_thread_priority( 10 );
/* init stats */
memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
h->mb.b_reencode_mb = 0;
- while( h->sh.i_first_mb < h->mb.i_mb_count )
+ while( h->sh.i_first_mb <= last_thread_mb )
{
- h->sh.i_last_mb = h->mb.i_mb_count - 1;
+ h->sh.i_last_mb = last_thread_mb;
if( h->param.i_slice_max_mbs )
h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
- else if( h->param.i_slice_count )
+ else if( h->param.i_slice_count && !h->param.b_sliced_threads )
{
- x264_emms();
- i_slice_num++;
- double height = h->sps->i_mb_height >> h->param.b_interlaced;
+ int height = h->sps->i_mb_height >> h->param.b_interlaced;
int width = h->sps->i_mb_width << h->param.b_interlaced;
- h->sh.i_last_mb = (int)(height * i_slice_num / h->param.i_slice_count + 0.5) * width - 1;
+ i_slice_num++;
+ h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
}
- h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, h->mb.i_mb_count - 1 );
+ h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
if( x264_stack_align( x264_slice_write, h ) )
return (void *)-1;
h->sh.i_first_mb = h->sh.i_last_mb + 1;
return (void *)0;
}
+/* Encode the current frame with slice-based threading.
+ *
+ * Splits the frame's macroblock rows into h->param.i_threads contiguous
+ * thread slices, runs x264_slices_write() on each slice in its own thread,
+ * then filters the rows of the finished frame and folds the NAL units,
+ * integer frame statistics and rate-control state of threads 1..n-1 into h.
+ *
+ * Returns 0 on success.  Returns -1 if a thread could not be created or the
+ * NAL array could not be grown, or the first nonzero value returned by a
+ * worker thread. */
+static int x264_threaded_slices_write( x264_t *h )
+{
+    int i, j;
+    int i_started;
+    int err = 0;
+    void *ret = NULL;
+    /* set first/last mb and sync contexts */
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        if( i )
+        {
+            t->param = h->param;
+            /* Copy the block of per-frame state lying between i_frame and rc. */
+            memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) );
+        }
+        int height = h->sps->i_mb_height >> h->param.b_interlaced;
+        t->i_threadslice_start = ((height *  i    + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
+        t->i_threadslice_end   = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
+        t->sh.i_first_mb = t->i_threadslice_start * h->sps->i_mb_width;
+        t->sh.i_last_mb  = t->i_threadslice_end * h->sps->i_mb_width - 1;
+    }
+
+    /* Weight the whole reference up front; workers do not do it incrementally. */
+    x264_analyse_weight_frame( h, h->sps->i_mb_height*16 + 16 );
+
+    x264_threads_distribute_ratecontrol( h );
+
+    /* dispatch */
+    for( i_started = 0; i_started < h->param.i_threads; i_started++ )
+        if( x264_pthread_create( &h->thread[i_started]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i_started] ) )
+        {
+            err = -1;
+            break;
+        }
+    /* Join every worker that was actually started, even after a failure, so
+     * that no thread is left running on this frame's data when we return. */
+    for( i = 0; i < i_started; i++ )
+    {
+        ret = NULL;
+        x264_pthread_join( h->thread[i]->thread_handle, &ret );
+        if( !err && (intptr_t)ret )
+            err = (intptr_t)ret;
+    }
+    if( err )
+        return err;
+
+    /* deblocking and hpel filtering */
+    for( i = 0; i <= h->sps->i_mb_height; i++ )
+        x264_fdec_filter_row( h, i );
+
+    for( i = 1; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        for( j = 0; j < t->out.i_nal; j++ )
+        {
+            h->out.nal[h->out.i_nal] = t->out.nal[j];
+            h->out.i_nal++;
+            /* A nonzero result is treated as failure to grow the NAL array;
+             * continuing would write past the end of h->out.nal. */
+            if( x264_nal_check_buffer( h ) )
+                return -1;
+        }
+        /* All entries in stat.frame are ints except for ssd/ssim,
+         * which are only calculated in the main thread. */
+        for( j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
+            ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
+    }
+
+    x264_threads_merge_ratecontrol( h );
+
+    return 0;
+}
+
/****************************************************************************
* x264_encoder_encode:
* XXX: i_poc : is the poc of the current given picture
x264_picture_t *pic_out )
{
x264_t *thread_current, *thread_prev, *thread_oldest;
- int i_nal_type;
- int i_nal_ref_idc;
+ int i_nal_type, i_nal_ref_idc, i_global_qp, i;
- int i_global_qp;
-
- if( h->param.i_threads > 1)
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
thread_prev = h->thread[ h->i_thread_phase ];
h->i_thread_phase = (h->i_thread_phase + 1) % h->param.i_threads;
/* 2: Place the frame into the queue for its slice type decision */
x264_lookahead_put_frame( h, fenc );
- if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
+ if( h->frames.i_input <= h->frames.i_delay + (h->param.b_sliced_threads ? 0 : 1 - h->param.i_threads) )
{
/* Nothing yet to encode, waiting for filling of buffers */
pic_out->i_type = X264_TYPE_AUTO;
/* ---------------------- Write the bitstream -------------------------- */
/* Init bitstream context */
- h->out.i_nal = 0;
- bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+ if( h->param.b_sliced_threads )
+ {
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
+ h->thread[i]->out.i_nal = 0;
+ }
+ }
+ else
+ {
+ bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+ h->out.i_nal = 0;
+ }
if( h->param.b_aud )
{
h->i_frame_num++;
/* Write frame */
- if( h->param.i_threads > 1 )
+ h->i_threadslice_start = 0;
+ h->i_threadslice_end = h->sps->i_mb_height;
+ if( !h->param.b_sliced_threads && h->param.i_threads > 1 )
{
if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
return -1;
h->b_thread_active = 1;
}
+ else if( h->param.b_sliced_threads )
+ {
+ if( x264_threaded_slices_write( h ) )
+ return -1;
+ }
else
if( (intptr_t)x264_slices_write( h ) )
return -1;
x264_lookahead_delete( h );
- for( i=0; i<h->param.i_threads; i++ )
+ for( i = 0; i < h->param.i_threads; i++ )
{
// don't strictly have to wait for the other threads, but it's simpler than canceling them
if( h->thread[i]->b_thread_active )
}
}
- if( h->param.i_threads > 1 )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
x264_t *thread_prev;
{
x264_frame_t **frame;
- for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+ if( !h->param.b_sliced_threads || i == 0 )
{
+ for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+ {
+ assert( (*frame)->i_reference_count > 0 );
+ (*frame)->i_reference_count--;
+ if( (*frame)->i_reference_count == 0 )
+ x264_frame_delete( *frame );
+ }
+ frame = &h->thread[i]->fdec;
assert( (*frame)->i_reference_count > 0 );
(*frame)->i_reference_count--;
if( (*frame)->i_reference_count == 0 )
x264_frame_delete( *frame );
+ x264_macroblock_cache_end( h->thread[i] );
}
- frame = &h->thread[i]->fdec;
- assert( (*frame)->i_reference_count > 0 );
- (*frame)->i_reference_count--;
- if( (*frame)->i_reference_count == 0 )
- x264_frame_delete( *frame );
-
- x264_macroblock_cache_end( h->thread[i] );
+ x264_free( h->thread[i]->scratch_buffer );
x264_free( h->thread[i]->out.p_bitstream );
x264_free( h->thread[i]->out.nal);
x264_free( h->thread[i] );
x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
x264_macroblock_cache_end( h->thread[h->param.i_threads] );
+ x264_free( h->thread[h->param.i_threads]->scratch_buffer );
x264_free( h->thread[h->param.i_threads] );
}
x264_synch_frame_list_delete( &h->lookahead->ifbuf );
double lmin[5]; /* min qscale by frame type */
double lmax[5];
double lstep; /* max change (multiply) in qscale per frame */
- uint16_t *qp_buffer; /* Global buffer for converting MB-tree quantizer data. */
+ uint16_t *qp_buffer; /* Global buffer for converting MB-tree quantizer data. */
/* MBRC stuff */
double frame_size_estimated;
double frame_size_planned;
+ double slice_size_planned;
predictor_t (*row_pred)[2];
predictor_t row_preds[5][2];
predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
{
int i;
double bits = 0;
- for( i = 0; i <= y; i++ )
+ for( i = h->i_threadslice_start; i <= y; i++ )
bits += h->fdec->i_row_bits[i];
return bits;
}
{
int i;
double bits = row_bits_so_far(h, y);
- for( i = y+1; i < h->sps->i_mb_height; i++ )
+ for( i = y+1; i < h->i_threadslice_end; i++ )
bits += predict_row_size( h, i, qp );
return bits;
}
}
/* tweak quality based on difference from predicted size */
- if( y < h->sps->i_mb_height-1 )
+ if( y < h->i_threadslice_end-1 )
{
int prev_row_qp = h->fdec->i_row_qp[y];
int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
rc->qpm = X264_MAX( rc->qpm, i_qp_min );
}
- int b0 = predict_row_size_sum( h, y, rc->qpm );
- int b1 = b0;
float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
-
+ float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
+ float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
/* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
+ int b1 = predict_row_size_sum( h, y, rc->qpm );
+
+ /* Assume that if this slice has become larger than expected,
+ * the other slices will have gotten equally larger. */
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
/* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
/* area at the top of the frame was measured inaccurately. */
- if( row_bits_so_far(h,y) < 0.05 * rc->frame_size_planned )
+ if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
return;
if( h->sh.i_type != SLICE_TYPE_I )
{
rc->qpm ++;
b1 = predict_row_size_sum( h, y, rc->qpm );
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
}
while( rc->qpm > i_qp_min
{
rc->qpm --;
b1 = predict_row_size_sum( h, y, rc->qpm );
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
}
/* avoid VBV underflow */
{
rc->qpm ++;
b1 = predict_row_size_sum( h, y, rc->qpm );
+ b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
}
x264_ratecontrol_set_estimated_size(h, b1);
{
x264_ratecontrol_t *rcc = h->rc;
rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
- if( h->param.i_threads > 1 )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
int j = h->rc - h->thread[0]->rc;
int i;
{
double frame_q[3];
double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
- double buffer_fill_cur = rcc->buffer_fill - cur_bits + rcc->buffer_rate;
+ double buffer_fill_cur = rcc->buffer_fill - cur_bits;
double target_fill;
frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
/* Loop over the planned future frames. */
for( j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
{
+ buffer_fill_cur += rcc->buffer_rate;
int i_type = h->fenc->i_planned_type[j];
int i_satd = h->fenc->i_planned_satd[j];
if( i_type == X264_TYPE_AUTO )
break;
i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
- buffer_fill_cur = buffer_fill_cur - cur_bits + rcc->buffer_rate;
+ buffer_fill_cur -= cur_bits;
}
/* Try to get to get the buffer at least 50% filled, but don't set an impossible goal. */
target_fill = X264_MIN( rcc->buffer_fill + j * rcc->buffer_rate * 0.5, rcc->buffer_size * 0.5 );
if( rcc->b_vbv )
{
- if( h->param.i_threads > 1 )
+ if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
int j = h->rc - h->thread[0]->rc;
int i;
}
}
+/* Hand the frame-level rate-control state to every slice thread.
+ *
+ * Each thread's rate-control struct is overwritten with a copy of h->rc.
+ * When VBV is active and a frame size has been planned, the thread also gets
+ * slice_size_planned: the planned frame size scaled by the share of the
+ * frame's summed i_row_satd that falls inside that thread's own row range
+ * [i_threadslice_start, i_threadslice_end).  Otherwise slice_size_planned
+ * is set to 0. */
+void x264_threads_distribute_ratecontrol( x264_t *h )
+{
+    int i, row, totalsize = 0;
+    x264_ratecontrol_t *rc = h->rc;
+    if( rc->b_vbv )
+        for( row = 0; row < h->sps->i_mb_height; row++ )
+            totalsize += h->fdec->i_row_satd[row];
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        x264_ratecontrol_t *trc = t->rc;
+        /* memcpy on identical source and destination is undefined, so skip
+         * the copy for a thread that shares the main rate-control struct. */
+        if( trc != rc )
+            memcpy( trc, rc, sizeof( x264_ratecontrol_t ) );
+        /* Calculate the planned slice size. */
+        if( rc->b_vbv && rc->frame_size_planned )
+        {
+            int size = 0;
+            /* Use this thread's row range, not the caller's: each slice must
+             * be sized from the rows it will actually encode. */
+            for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
+                size += h->fdec->i_row_satd[row];
+            trc->slice_size_planned = size * rc->frame_size_planned / totalsize;
+        }
+        else
+            trc->slice_size_planned = 0;
+    }
+}
+
+/* Fold the rate-control results of slice threads 1..n-1 back into h->rc.
+ *
+ * qpa_rc and qpa_aq are summed into the main state.  Each of the 5x2 row
+ * predictors has its coeff, offset and count summed across the threads and
+ * then divided by the thread count, i.e. the main state ends up holding the
+ * per-thread average of every predictor field. */
+void x264_threads_merge_ratecontrol( x264_t *h )
+{
+    x264_ratecontrol_t *main_rc = h->rc;
+    int i_thread, i_type, i_pred;
+    x264_emms();
+
+    /* Accumulate the QP sums. */
+    for( i_thread = 1; i_thread < h->param.i_threads; i_thread++ )
+    {
+        x264_ratecontrol_t *src = h->thread[i_thread]->rc;
+        main_rc->qpa_rc += src->qpa_rc;
+        main_rc->qpa_aq += src->qpa_aq;
+    }
+
+    /* Sum, then average, each row predictor.  Threads are visited in
+     * ascending order for every field, as in a thread-major traversal. */
+    for( i_type = 0; i_type < 5; i_type++ )
+        for( i_pred = 0; i_pred < 2; i_pred++ )
+        {
+            predictor_t *dst = &main_rc->row_preds[i_type][i_pred];
+            for( i_thread = 1; i_thread < h->param.i_threads; i_thread++ )
+            {
+                predictor_t *src = &h->thread[i_thread]->rc->row_preds[i_type][i_pred];
+                dst->coeff += src->coeff;
+                dst->offset += src->offset;
+                dst->count += src->count;
+            }
+            dst->coeff /= h->param.i_threads;
+            dst->offset /= h->param.i_threads;
+            dst->count /= h->param.i_threads;
+        }
+}
+
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
{
if( cur != prev )
int x264_ratecontrol_get_estimated_size( x264_t const *);
int x264_rc_analyse_slice( x264_t *h );
int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_threads_distribute_ratecontrol( x264_t *h );
+void x264_threads_merge_ratecontrol( x264_t *h );
#endif
H0( "Example usage:\n" );
H0( "\n" );
H0( " Constant quality mode:\n" );
- H0( " x264 --crf 24 -o output input\n" );
+ H0( " x264 --crf 24 -o <output> <input>\n" );
H0( "\n" );
H0( " Two-pass with a bitrate of 1000kbps:\n" );
- H0( " x264 --pass 1 --bitrate 1000 -o output input\n" );
- H0( " x264 --pass 2 --bitrate 1000 -o output input\n" );
+ H0( " x264 --pass 1 --bitrate 1000 -o <output> <input>\n" );
+ H0( " x264 --pass 2 --bitrate 1000 -o <output> <input>\n" );
H0( "\n" );
H0( " Lossless:\n" );
- H0( " x264 --crf 0 -o output input\n" );
+ H0( " x264 --crf 0 -o <output> <input>\n" );
H0( "\n" );
H0( " Maximum PSNR at the cost of speed and visual quality:\n" );
- H0( " x264 --preset placebo --tune psnr -o output input\n" );
+ H0( " x264 --preset placebo --tune psnr -o <output> <input>\n" );
H0( "\n" );
H0( " Constant bitrate at 1000kbps with a 2 second-buffer:\n");
- H0( " x264 --vbv-bufsize 2000 --bitrate 1000 -o output input\n" );
+ H0( " x264 --vbv-bufsize 2000 --bitrate 1000 -o <output> <input>\n" );
H0( "\n" );
H0( "Presets:\n" );
H0( "\n" );
" - fastdecode:\n"
" --no-cabac --no-deblock --no-weightb\n"
" --weightp 0\n"
+ " - zerolatency:\n"
+ " --bframes 0 --rc-lookahead 0\n"
+ " --sync-lookahead 0 --sliced-threads\n"
" - touhou:\n"
" --aq-strength 1.3 --deblock -1:-1\n"
" --partitions {p4x4 if p8x8 set}\n"
" --psy-rd <unset>:0.2\n"
" --ref {Double if >1 else 1}\n" );
- else H0( " - film,animation,grain,psnr,ssim,fastdecode\n" );
+ else H0( " - film,animation,grain,psnr,ssim\n"
+ " - fastdecode,zerolatency\n" );
H1( " --slow-firstpass Don't use faster settings with --pass 1\n" );
H0( "\n" );
H0( "Frame-type options:\n" );
H1( " --psnr Enable PSNR computation\n" );
H1( " --ssim Enable SSIM computation\n" );
H1( " --threads <integer> Force a specific number of threads\n" );
+ H2( " --sliced-threads Low-latency but lower-efficiency threading\n" );
H2( " --thread-input Run Avisynth in its own thread\n" );
H2( " --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
H2( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" );
{ "zones", required_argument, NULL, 0 },
{ "qpfile", required_argument, NULL, OPT_QPFILE },
{ "threads", required_argument, NULL, 0 },
+ { "sliced-threads", no_argument, NULL, 0 },
+ { "no-sliced-threads", no_argument, NULL, 0 },
{ "slice-max-size", required_argument, NULL, 0 },
{ "slice-max-mbs", required_argument, NULL, 0 },
{ "slices", required_argument, NULL, 0 },
param->analyse.b_weighted_bipred = 0;
param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
}
+ else if( !strcasecmp( optarg, "zerolatency" ) )
+ {
+ param->rc.i_lookahead = 0;
+ param->i_sync_lookahead = 0;
+ param->i_bframe = 0;
+ param->b_sliced_threads = 1;
+ }
else if( !strcasecmp( optarg, "touhou" ) )
{
param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
#include <stdarg.h>
-#define X264_BUILD 79
+#define X264_BUILD 80
/* x264_t:
* opaque handler for encoder */
/* CPU flags */
unsigned int cpu;
int i_threads; /* encode multiple frames in parallel */
+ int b_sliced_threads; /* Whether to use slice-based threading. */
int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
int i_sync_lookahead; /* threaded lookahead buffer */