ALIGN 16
.vertical_filter:
- prefetchnta [src + stride5 + 32]
+ prefetcht0 [src + stride5 + 32]
LOAD_ADD mm1, [src ], [src + stride5 ] ; a0
LOAD_ADD mm2, [src + stride ], [src + stride*4 ] ; b0
/* CPU autodetect */
param->cpu = x264_cpu_detect();
param->i_threads = 1;
+ param->b_deterministic = 1;
/* Video properties */
param->i_csp = X264_CSP_I420;
param->analyse.i_me_range = 16;
param->analyse.i_subpel_refine = 5;
param->analyse.b_chroma_me = 1;
+ param->analyse.i_mv_range_thread = -1;
param->analyse.i_mv_range = -1; // set from level_idc
param->analyse.i_direct_8x8_inference = -1; // set from level_idc
param->analyse.i_chroma_qp_offset = 0;
else
p->i_threads = atoi(value);
}
+ OPT2("deterministic", "n-deterministic")
+ p->b_deterministic = atobool(value);
OPT2("level", "level-idc")
{
if( atof(value) < 6 )
}
OPT("scenecut")
p->i_scenecut_threshold = atoi(value);
+ OPT("pre-scenecut")
+ p->b_pre_scenecut = atobool(value);
OPT("bframes")
p->i_bframe = atoi(value);
OPT("b-adapt")
b_error |= parse_enum( value, x264_motion_est_names, &p->analyse.i_me_method );
OPT2("merange", "me-range")
p->analyse.i_me_range = atoi(value);
- OPT("mvrange")
+ OPT2("mvrange", "mv-range")
p->analyse.i_mv_range = atoi(value);
+ OPT2("mvrange-thread", "mv-range-thread")
+ p->analyse.i_mv_range_thread = atoi(value);
OPT2("subme", "subq")
p->analyse.i_subpel_refine = atoi(value);
OPT("bime")
s += sprintf( s, " cqm=%d", p->i_cqm_preset );
s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
- s += sprintf( s, " slices=%d", p->i_threads );
+ s += sprintf( s, " threads=%d", p->i_threads );
s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
s += sprintf( s, " mbaff=%d", p->b_interlaced );
p->analyse.b_bidir_me );
}
- s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
- p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
+ s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d%s",
+ p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold,
+ p->b_pre_scenecut ? "(pre)" : "" );
s += sprintf( s, " rc=%s", p->rc.i_rc_method == X264_RC_ABR ?
( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size ? "cbr" : "abr" )
#define pthread_create(t,u,f,d) *(t)=CreateThread(NULL,0,f,d,0,NULL)
#define pthread_join(t,s) { WaitForSingleObject(t,INFINITE); \
CloseHandle(t); }
+#define usleep(t) Sleep(((t)+999)/1000) /* microseconds -> milliseconds, rounded up; no trailing ';' so it behaves like a function call */
#define HAVE_PTHREAD 1
#elif defined(SYS_BEOS)
resume_thread(*(t)); }
#define pthread_join(t,s) { long tmp; \
wait_for_thread(t,(s)?(long*)(s):&tmp); }
+#ifndef usleep
+#define usleep(t) snooze(t)
+#endif
#define HAVE_PTHREAD 1
#elif defined(HAVE_PTHREAD)
#include <pthread.h>
+#else
+#define pthread_t int
+#define pthread_create(t,u,f,d)
+#define pthread_join(t,s)
#endif
/****************************************************************************
#define XCHG(type,a,b) { type t = a; a = b; b = t; }
#define FIX8(f) ((int)(f*(1<<8)+.5))
+#ifndef offsetof
+#define offsetof(T,F) ((unsigned int)((char *)&((T *)0)->F))
+#endif
+
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNUSED __attribute__((unused))
#else
}
#define X264_BFRAME_MAX 16
+#define X264_THREAD_MAX 16
#define X264_SLICE_MAX 4
#define X264_NAL_MAX (4 + X264_SLICE_MAX)
+#define X264_THREAD_HEIGHT 24 // number of pixels (per thread) in progress at any given time. could theoretically be as low as 22
/****************************************************************************
* Includes
/* encoder parameters */
x264_param_t param;
- x264_t *thread[X264_SLICE_MAX];
+ x264_t *thread[X264_THREAD_MAX];
+ pthread_t thread_handle;
+ int b_thread_active;
+ int i_thread_phase; /* which thread to use for the next frame */
/* bitstream output */
struct
int i_bitstream; /* size of p_bitstream */
uint8_t *p_bitstream; /* will hold data for all nal */
bs_t bs;
+ int i_frame_size;
} out;
/* frame number/poc */
/* Temporary buffer (frames types not yet decided) */
x264_frame_t *next[X264_BFRAME_MAX+3];
/* Unused frames */
- x264_frame_t *unused[X264_BFRAME_MAX+3];
+ x264_frame_t *unused[X264_BFRAME_MAX + X264_THREAD_MAX*2 + 16+4];
/* For adaptive B decision */
x264_frame_t *last_nonb;
int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
+ uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
/* current value */
int i_type;
/* XXX: both omit the cost of MBs coded as P_SKIP */
int i_intra_cost;
int i_inter_cost;
+ int i_mbs_analysed;
/* Adaptive direct mv pred */
int i_direct_score[2];
} frame;
frame->i_pts = -1;
frame->i_frame = -1;
frame->i_frame_num = -1;
+ frame->i_lines_completed = -1;
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv )
+static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
int y;
memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
}
/* upper band */
+ if( b_pad_top )
for( y = 0; y < i_padv; y++ )
memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* lower band */
+ if( b_pad_bottom )
for( y = 0; y < i_padv; y++ )
memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
#undef PPIXEL
}
-void x264_frame_expand_border( x264_t *h, x264_frame_t *frame )
+void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
int i;
+ int b_start = !mb_y;
+ if( mb_y & h->sh.b_mbaff )
+ return;
for( i = 0; i < frame->i_plane; i++ )
{
int stride = frame->i_stride[i];
int width = 16*h->sps->i_mb_width >> !!i;
- int height = 16*h->sps->i_mb_height >> !!i;
+ int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
int padh = PADH >> !!i;
int padv = PADV >> !!i;
- if( h->param.b_interlaced )
+ // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
+ uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
+ if( b_end && !b_start )
+ height += 4 >> (!!i + h->sh.b_mbaff);
+ if( h->sh.b_mbaff )
{
- plane_expand_border( frame->plane[i], stride*2, width, height>>1, padh, padv );
- plane_expand_border( frame->plane[i]+stride, stride*2, width, height>>1, padh, padv );
+ plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
+ plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
}
else
{
- plane_expand_border( frame->plane[i], stride, width, height, padh, padv );
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
}
}
}
-void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame )
+void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
/* during filtering, 8 extra pixels were filtered on each edge.
we want to expand border from the last filtered pixel */
+ int b_start = !mb_y;
int stride = frame->i_stride[0];
- int width = 16*h->sps->i_mb_width;
- int height = 16*h->sps->i_mb_height;
+ int width = 16*h->sps->i_mb_width + 16;
+ int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
int padh = PADH - 8;
int padv = PADV - 8;
int i;
for( i = 1; i < 4; i++ )
{
- if( h->param.b_interlaced )
+ // buffer: 8 luma, to match the hpel filter
+ uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 8;
+ if( h->sh.b_mbaff )
{
- plane_expand_border( frame->filtered[i] - 16*stride - 8, stride*2, width+16, (height>>1)+16, padh, padv );
- plane_expand_border( frame->filtered[i] - 15*stride - 8, stride*2, width+16, (height>>1)+16, padh, padv );
+ plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
+ plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
}
else
{
- plane_expand_border( frame->filtered[i] - 8*stride - 8, stride, width+16, height+16, padh, padv );
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
}
}
}
{
int i;
for( i = 0; i < 4; i++ )
- plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV );
+ plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
}
}
-void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
+void x264_frame_deblock_row( x264_t *h, int mb_y )
{
const int s8x8 = 2 * h->mb.i_mb_stride;
const int s4x4 = 4 * h->mb.i_mb_stride;
- const int b_interlaced = h->param.b_interlaced;
+ const int b_interlaced = h->sh.b_mbaff;
const int mvy_limit = 4 >> b_interlaced;
- int mb_y, mb_x;
+ int mb_x;
int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
h->fdec->i_stride[1] << b_interlaced,
h->fdec->i_stride[2] << b_interlaced };
- for( mb_y = 0, mb_x = 0; mb_y < h->sps->i_mb_height; )
+ for( mb_x = 0; mb_x < h->sps->i_mb_width; )
{
const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
bS[i] = 0;
- for( l = 0; l < 1 + (i_slice_type == SLICE_TYPE_B); l++ )
+ for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )
{
if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||
abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||
/* next mb */
if( !b_interlaced || (mb_y&1) )
mb_x++;
- if( mb_x >= h->sps->i_mb_width )
- {
- mb_x = 0;
- mb_y++;
- }
- else
- mb_y ^= b_interlaced;
+ mb_y ^= b_interlaced;
}
}
+void x264_frame_deblock( x264_t *h )
+{
+    /* Deblock the whole frame by running the per-row filter on each
+     * macroblock row in turn. With mbaff the rows are advanced two at a
+     * time, since x264_frame_deblock_row() is called once per row pair. */
+    int mb_y = 0;
+    while( mb_y < h->sps->i_mb_height )
+    {
+        x264_frame_deblock_row( h, mb_y );
+        mb_y += 1 + h->sh.b_mbaff;
+    }
+}
+
#ifdef HAVE_MMXEXT
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
int *i_row_bits;
int *i_row_qp;
+ /* threading */
+ int i_lines_completed; /* in pixels */
+ int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
+
} x264_frame_t;
typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
-void x264_frame_expand_border( x264_t *h, x264_frame_t *frame );
-void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame );
+void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
+void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_expand_border_lowres( x264_frame_t *frame );
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
-void x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
+void x264_frame_deblock( x264_t *h );
+void x264_frame_deblock_row( x264_t *h, int mb_y );
-void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced );
+void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced, int mb_y, int b_end );
void x264_frame_init_lowres( int cpu, x264_frame_t *frame );
void x264_deblock_init( int cpu, x264_deblock_function_t *pf );
ALIGN 16
.vertical_filter:
- prefetchnta [src3 + stride*2 + 32]
+ prefetcht0 [src3 + stride*2 + 32]
LOAD_ADD mm1, [src ], [src3 + stride*2 ] ; a0
LOAD_ADD mm2, [src + stride ], [src3 + stride ] ; b0
}
}
+ if( h->param.i_threads > 1 )
+ {
+ int di = b8x8 ? 4 : 1;
+ for( i4=0; i4<16; i4+=di )
+ {
+ if( h->mb.cache.mv[0][x264_scan8[i4]][1] > h->mb.mv_max_spel[1]
+ || h->mb.cache.mv[1][x264_scan8[i4]][1] > h->mb.mv_max_spel[1] )
+ {
+#if 0
+ fprintf(stderr, "direct_temporal: (%d,%d) (%d,%d) > %d \n",
+ h->mb.cache.mv[0][x264_scan8[i4]][0],
+ h->mb.cache.mv[0][x264_scan8[i4]][1],
+ h->mb.cache.mv[1][x264_scan8[i4]][0],
+ h->mb.cache.mv[1][x264_scan8[i4]][1],
+ h->mb.mv_max_spel[1]);
+#endif
+ return 0;
+ }
+ }
+ }
+
return 1;
}
if( IS_INTRA( type_col ) )
return 1;
+
+ if( h->param.i_threads > 1
+ && ( mv[0][1] > h->mb.mv_max_spel[1]
+ || mv[1][1] > h->mb.mv_max_spel[1] ) )
+ {
+#if 0
+ fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
+ mv[0][0], mv[0][1], mv[1][0], mv[1][1],
+ h->mb.mv_max_spel[1]);
+#endif
+ return 0;
+ }
+
b8x8 = h->sps->b_direct8x8_inference ||
(type_col != P_8x8 && type_col != B_SKIP && type_col != B_DIRECT && type_col != B_8x8);
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
}
+ for( i=0; i<=h->param.b_interlaced; i++ )
+ for( j=0; j<3; j++ )
+ {
+ CHECKED_MALLOC( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
+ h->mb.intra_border_backup[i][j] += 8;
+ }
+
/* init with not available (for top right idx=7,15) */
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
void x264_macroblock_cache_end( x264_t *h )
{
int i, j;
+ for( i=0; i<=h->param.b_interlaced; i++ )
+ for( j=0; j<3; j++ )
+ x264_free( h->mb.intra_border_backup[i][j] - 8 );
for( i=0; i<2; i++ )
{
int i_refs = i ? 1 + h->param.b_bframe_pyramid : h->param.i_frame_reference;
: w * (i_mb_x + i_mb_y * i_stride);
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k, l;
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
&h->fenc->plane[i][i_pix_offset], i_stride2, w );
- memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], &plane_fdec[-1-i_stride2], w*3/2+1 );
+ memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
-void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced )
+void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced, int mb_y, int b_end )
{
const int x_inc = 16, y_inc = 16;
const int stride = frame->i_stride[0] << b_interlaced;
- const int height = frame->i_lines[0] >> b_interlaced;
+ const int start = (mb_y*16 >> b_interlaced) - 8;
+ const int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
int x, y;
- pf_mc_t int_h = mc_hh;
- pf_mc_t int_v = mc_hv;
- pf_mc_t int_hv = mc_hc;
+ if( mb_y & b_interlaced )
+ return;
+ mb_y >>= b_interlaced;
#ifdef HAVE_MMXEXT
if ( cpu & X264_CPU_MMXEXT )
{
- int offs = -8*stride - 8;
+ // buffer = 4 for deblock + 3 for 6tap, rounded to 8
+ int offs = start*stride - 8;
x264_hpel_filter_mmxext(
frame->filtered[1] + offs,
frame->filtered[2] + offs,
frame->filtered[3] + offs,
frame->plane[0] + offs,
- stride, stride - 48, height + 16);
+ stride, stride - 48, height - start );
}
else
#endif
{
- for( y = -8; y < height + 8; y += y_inc )
+ for( y = start; y < height; y += y_inc )
{
uint8_t *p_in = frame->plane[0] + y * stride - 8;
uint8_t *p_h = frame->filtered[1] + y * stride - 8;
uint8_t *p_v = frame->filtered[2] + y * stride - 8;
- uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
+ uint8_t *p_c = frame->filtered[3] + y * stride - 8;
for( x = -8; x < stride - 64 + 8; x += x_inc )
{
- int_h( p_in, stride, p_h, stride, x_inc, y_inc );
- int_v( p_in, stride, p_v, stride, x_inc, y_inc );
- int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
+ mc_hh( p_in, stride, p_h, stride, x_inc, y_inc );
+ mc_hv( p_in, stride, p_v, stride, x_inc, y_inc );
+ mc_hc( p_in, stride, p_c, stride, x_inc, y_inc );
p_h += x_inc;
p_v += x_inc;
- p_hv += x_inc;
+ p_c += x_inc;
p_in += x_inc;
}
}
* the sum of an 8x8 pixel region with top-left corner on that point.
* in the lower plane, 4x4 sums (needed only with --analyse p4x4). */
- if( frame->integral )
+ if( frame->integral && b_end )
{
+ //FIXME slice
memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
for( y = -32; y < frame->i_lines[0] + 31; y++ )
{
--- /dev/null
+Old threading method: slice-based
+application calls x264
+x264 runs B-adapt and ratecontrol (serial)
+split frame into several slices, and spawn a thread for each slice
+wait until all threads are done
+deblock and hpel filter (serial)
+return to application
+In x264cli, there is one additional thread to decode the input.
+
+New threading method: frame-based
+application calls x264
+x264 runs B-adapt and ratecontrol (serial to the application, but parallel to the other x264 threads)
+spawn a thread for this frame
+thread runs encode in 1 slice, deblock, hpel filter
+meanwhile x264 waits for the oldest thread to finish
+return to application, but the rest of the threads continue running in the background
+No additional threads are needed to decode the input, unless decoding+B-adapt is slower than slice+deblock+hpel, in which case an additional input thread would allow decoding in parallel to B-adapt.
+
+
+Penalties for slice-based threading:
+Each slice adds some bitrate (or equivalently reduces quality), for a variety of reasons: the slice header costs some bits, CABAC contexts are reset, and motion vectors and intra samples can't be predicted across the slice boundary.
+In CBR mode, we have to allocate bits between slices before encoding them, which may lead to uneven quality.
+Some parts of the encoder are serial, so it doesn't scale well with lots of cpus.
+
+Penalties for frame-based threading:
+To allow encoding of multiple frames in parallel, we have to ensure that the motion vectors of any given macroblock point only into pieces of the reference frames that have already been encoded. This is usually not noticeable, but can matter for very fast upward motion.
+We have to commit to one frame type before starting on the frame. Thus scenecut detection must run during the lowres pre-motion-estimation along with B-adapt, which makes it faster but less accurate than re-encoding the whole frame.
+Ratecontrol gets delayed feedback, since it has to plan frame N before frame N-1 finishes.
+
+
+Benchmarks:
+cpu: 4x woodcrest 3GHz
+content: 480p
+
+x264 -B1000 -b2 -m1 -Anone
+threads  speed            psnr
+         old     new       old     new
+1:       1.000x  1.000x    0.000   0.000
+2:       1.168x  1.413x   -0.038  -0.007
+3:       1.208x  1.814x   -0.064  -0.005
+4:       1.293x  2.329x   -0.095  -0.006
+5:               2.526x           -0.007
+6:               2.658x           -0.001
+7:               2.723x           -0.018
+8:               2.712x           -0.019
+
+x264 -B1000 -b2 -m5
+threads  speed            psnr
+         old     new       old     new
+1:       1.000x  1.000x    0.000   0.000
+2:       1.319x  1.517x   -0.036  -0.006
+3:       1.466x  2.013x   -0.068  -0.005
+4:       1.578x  2.741x   -0.101  -0.004
+5:               3.022x           -0.015
+6:               3.221x           -0.014
+7:               3.331x           -0.020
+8:               3.425x           -0.025
+
+x264 -B1000 -b2 -m6 -r3 -8 --b-rdo
+threads  speed            psnr
+         old     new       old     new
+1:       1.000x  1.000x    0.000   0.000
+2:       1.531x  1.707x   -0.032  -0.006
+3:       1.866x  2.277x   -0.061  -0.005
+4:       2.097x  3.204x   -0.088  -0.006
+5:               3.468x           -0.013
+6:               3.629x           -0.010
+7:               3.716x           -0.014
+8:               3.745x           -0.018
+
#include <string.h>
#include <math.h>
#include <limits.h>
+#include <unistd.h>
#include "common/common.h"
#include "macroblock.h"
/* II: Inter part P/B frame */
if( h->sh.i_type != SLICE_TYPE_I )
{
- int i;
- int i_fmv_range = h->param.analyse.i_mv_range - 16;
+ int i, j;
+ int i_fmv_range = 4 * h->param.analyse.i_mv_range;
+ int i_fpel_border = 5; // 3 for hex search, 2 for subpel, ignores subme7 & bime
/* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
- h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
- h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 8 );
- h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
- h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
+ h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
+ h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
+ h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
+ h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
if( h->mb.i_mb_x == 0)
{
int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
+ int thread_mvy_range = i_fmv_range;
+
+ if( h->param.i_threads > 1 )
+ {
+ int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
+ int thresh = pix_y + h->param.analyse.i_mv_range_thread;
+ for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
+ {
+ x264_frame_t **fref = i ? h->fref1 : h->fref0;
+ int i_ref = i ? h->i_ref1 : h->i_ref0;
+ for( j=0; j<i_ref; j++ )
+ {
+ // could use a condition variable or the like, but
+ // this way is faster at least on LinuxThreads.
+ while( fref[j]->i_lines_completed < thresh )
+ usleep(100);
+ thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
+ }
+ }
+ if( h->param.b_deterministic )
+ thread_mvy_range = h->param.analyse.i_mv_range_thread;
+ if( h->mb.b_interlaced )
+ thread_mvy_range >>= 1;
+ }
+
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
- h->mb.mv_min_fpel[1] = CLIP_FMV( -16*mb_y - 8 );
- h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( mb_height - mb_y - 1 ) + 8 );
- h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
- h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
+ h->mb.mv_min_spel[1] = CLIP_FMV( h->mb.mv_min[1] );
+ h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
+ h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
+ h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
+ h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
#undef CLIP_FMV
{
h->mb.i_type = P_SKIP;
x264_analyse_update_cache( h, a );
+ assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
return;
}
}
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
+ assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
h->mb.i_type = P_L0;
if( a->b_mbrd && a->l0.i_ref == 0
analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip )
{
- if( h->param.analyse.i_subpel_refine >= 3 )
+ if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+ // FIXME don't need to check this if the reference frame is done
+ {}
+ else if( h->param.analyse.i_subpel_refine >= 3 )
analysis.b_try_pskip = 1;
else if( h->mb.i_mb_type_left == P_SKIP ||
h->mb.i_mb_type_top == P_SKIP ||
{
h->mb.i_type = P_SKIP;
h->mb.i_partition = D_16x16;
+ assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
}
else
{
h->mb.i_type = i_type;
h->stat.frame.i_intra_cost += i_intra_cost;
h->stat.frame.i_inter_cost += i_cost;
+ h->stat.frame.i_mbs_analysed++;
if( h->mb.i_subpel_refine >= 7 )
{
break;
}
}
+
+#ifndef NDEBUG
+ if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
+ {
+ int l;
+ for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
+ {
+ int completed;
+ int ref = h->mb.cache.ref[l][x264_scan8[0]];
+ if( ref < 0 )
+ continue;
+ completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
+ if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
+ {
+ fprintf(stderr, "mb type: %d \n", h->mb.i_type);
+ fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
+ h->mb.cache.mv[l][x264_scan8[15]][0],
+ h->mb.cache.mv[l][x264_scan8[15]][1] );
+ fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
+ fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
+ fprintf(stderr, "completed: %d \n", completed );
+ assert(0);
+ }
+ }
+ }
+#endif
}
#include "slicetype.c"
#define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
+static x264_frame_t *x264_frame_get( x264_frame_t **list ); //FIXME move
+static void x264_frame_put( x264_frame_t **list, x264_frame_t *frame );
+static void x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
+static void x264_frame_put_unused( x264_t *h, x264_frame_t *frame );
+static x264_frame_t *x264_frame_get_unused( x264_t *h );
+
+static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+ x264_nal_t **pp_nal, int *pi_nal,
+ x264_picture_t *pic_out );
+
/****************************************************************************
*
******************************* x264 libs **********************************
/* Fill "default" values */
static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
x264_sps_t *sps, x264_pps_t *pps,
- int i_type, int i_idr_pic_id, int i_frame, int i_qp )
+ int i_idr_pic_id, int i_frame, int i_qp )
{
x264_param_t *param = &h->param;
int i;
sh->sps = sps;
sh->pps = pps;
- sh->i_type = i_type;
sh->i_first_mb = 0;
sh->i_last_mb = h->sps->i_mb_width * h->sps->i_mb_height;
sh->i_pps_id = pps->i_id;
}
if( h->param.i_threads == 0 )
- h->param.i_threads = x264_cpu_num_processors();
- h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_SLICE_MAX );
- h->param.i_threads = X264_MIN( h->param.i_threads, (h->param.i_height + 15) >> (4 + h->param.b_interlaced) );
-#ifndef HAVE_PTHREAD
+ h->param.i_threads = x264_cpu_num_processors() * 3/2;
+ h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+ h->param.i_threads = X264_MIN( h->param.i_threads, 1 + (h->param.i_height >> h->param.b_interlaced) / (X264_THREAD_HEIGHT + 16) ); // FIXME exact limit?
if( h->param.i_threads > 1 )
{
+#ifndef HAVE_PTHREAD
x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
- x264_log( h, X264_LOG_WARNING, "multislicing anyway, but you won't see any speed gain.\n" );
- }
+ h->param.i_threads = 1;
+#else
+ if( h->param.analyse.i_me_method == X264_ME_ESA )
+ {
+ x264_log( h, X264_LOG_WARNING, "threads are not yet compatible with ESA\n");
+ h->param.analyse.i_me_method = X264_ME_UMH;
+ }
+ if( h->param.i_scenecut_threshold >= 0 )
+ h->param.b_pre_scenecut = 1;
#endif
+ }
if( h->param.b_interlaced )
{
h->param.analyse.i_direct_8x8_inference = l->direct8x8;
}
+ if( h->param.i_threads > 1 )
+ {
+ int r = h->param.analyse.i_mv_range_thread;
+ int r2;
+ if( r <= 0 )
+ {
+ // half of the available space is reserved and divided evenly among the threads,
+ // the rest is allocated to whichever thread is far enough ahead to use it.
+ // reserving more space increases quality for some videos, but costs more time
+ // in thread synchronization.
+ int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->param.i_threads - X264_THREAD_HEIGHT;
+ r = max_range / 2;
+ }
+ r = X264_MAX( r, h->param.analyse.i_me_range );
+ r = X264_MIN( r, h->param.analyse.i_mv_range );
+ // round up to use the whole mb row
+ r2 = (r & ~15) + ((-X264_THREAD_HEIGHT) & 15);
+ if( r2 < r )
+ r2 += 16;
+ x264_log( h, X264_LOG_DEBUG, "using mv_range_thread = %d\n", r2 );
+ h->param.analyse.i_mv_range_thread = r2;
+ }
+
if( h->param.rc.f_qblur < 0 )
h->param.rc.f_qblur = 0;
if( h->param.rc.f_complexity_blur < 0 )
x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
/* Init x264_t */
- h->out.i_nal = 0;
- h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 1.7
- * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.5, h->param.rc.i_qp_min )
- : pow( 0.5, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
- h->out.p_bitstream = x264_malloc( h->out.i_bitstream );
-
h->i_frame = 0;
h->i_frame_num = 0;
h->i_idr_pic_id = 0;
h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
/* Init frames. */
- h->frames.i_delay = h->param.i_bframe;
+ h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
h->frames.i_max_ref0 = h->param.i_frame_reference;
h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering + 1;
|| h->param.rc.i_rc_method == X264_RC_CRF
|| h->param.b_bframe_adaptive );
- for( i = 0; i < X264_BFRAME_MAX + 3; i++ )
- {
- h->frames.current[i] = NULL;
- h->frames.next[i] = NULL;
- h->frames.unused[i] = NULL;
- }
- for( i = 0; i < 1 + h->frames.i_delay; i++ )
- {
- h->frames.unused[i] = x264_frame_new( h );
- if( !h->frames.unused[i] )
- return NULL;
- }
- for( i = 0; i < h->frames.i_max_dpb; i++ )
- {
- h->frames.reference[i] = x264_frame_new( h );
- if( !h->frames.reference[i] )
- return NULL;
- }
- h->frames.reference[h->frames.i_max_dpb] = NULL;
h->frames.i_last_idr = - h->param.i_keyint_max;
h->frames.i_input = 0;
h->frames.last_nonb = NULL;
h->i_ref0 = 0;
h->i_ref1 = 0;
- h->fdec = h->frames.reference[0];
-
- if( x264_macroblock_cache_init( h ) < 0 )
- return NULL;
x264_rdo_init( );
/* init CPU functions */
mbcmp_init( h );
- /* rate control */
- if( x264_ratecontrol_new( h ) < 0 )
- return NULL;
-
x264_log( h, X264_LOG_INFO, "using cpu capabilities %s%s%s%s%s%s\n",
param->cpu&X264_CPU_MMX ? "MMX " : "",
param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" );
+ h->out.i_nal = 0;
+ h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 1.7
+ * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.5, h->param.rc.i_qp_min )
+ : pow( 0.5, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
+
h->thread[0] = h;
h->i_thread_num = 0;
for( i = 1; i < h->param.i_threads; i++ )
h->thread[i] = x264_malloc( sizeof(x264_t) );
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ if( i > 0 )
+ *h->thread[i] = *h;
+ h->thread[i]->fdec = x264_frame_get_unused( h );
+ h->thread[i]->out.p_bitstream = x264_malloc( h->out.i_bitstream );
+ if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
+ return NULL;
+ }
+
+ if( x264_ratecontrol_new( h ) < 0 )
+ return NULL;
+
#ifdef DEBUG_DUMP_FRAME
{
/* create or truncate the reconstructed video file */
}
-static void x264_frame_put( x264_frame_t *list[X264_BFRAME_MAX], x264_frame_t *frame )
+static void x264_frame_put( x264_frame_t **list, x264_frame_t *frame )
{
int i = 0;
while( list[i] ) i++;
list[i] = frame;
}
-static void x264_frame_push( x264_frame_t *list[X264_BFRAME_MAX], x264_frame_t *frame )
+static void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
{
int i = 0;
while( list[i] ) i++;
list[0] = frame;
}
-static x264_frame_t *x264_frame_get( x264_frame_t *list[X264_BFRAME_MAX+1] )
+static x264_frame_t *x264_frame_get( x264_frame_t **list )
{
x264_frame_t *frame = list[0];
int i;
for( i = 0; list[i]; i++ )
list[i] = list[i+1];
+ assert(frame);
return frame;
}
-static void x264_frame_sort( x264_frame_t *list[X264_BFRAME_MAX+1], int b_dts )
+static void x264_frame_put_unused( x264_t *h, x264_frame_t *frame )
+{
+    /* Drop one reference to the frame; once nobody holds it any more,
+     * append it to the h->frames.unused list so it can be recycled. */
+    assert( frame->i_reference_count > 0 );
+    if( --frame->i_reference_count == 0 )
+        x264_frame_put( h->frames.unused, frame );
+    /* the last slot must stay NULL: the list helpers rely on a NULL terminator */
+    assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
+}
+
+static x264_frame_t *x264_frame_get_unused( x264_t *h )
+{
+    /* Hand out a frame for use: recycle one from h->frames.unused when the
+     * list is non-empty, otherwise allocate a new one.
+     * Returns the frame with its reference count set to 1, or NULL if a new
+     * frame had to be allocated and x264_frame_new() failed. */
+    x264_frame_t *frame;
+    if( h->frames.unused[0] )
+        frame = x264_frame_get( h->frames.unused );
+    else
+        frame = x264_frame_new( h );
+    /* allocation can fail; do not dereference a NULL frame below */
+    if( !frame )
+        return NULL;
+    assert( frame->i_reference_count == 0 );
+    frame->i_reference_count = 1;
+    return frame;
+}
+
+static void x264_frame_sort( x264_frame_t **list, int b_dts )
{
int i, b_ok;
do {
#define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
#define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
-static inline void x264_reference_build_list( x264_t *h, int i_poc, int i_slice_type )
+static inline void x264_reference_build_list( x264_t *h, int i_poc )
{
int i;
int b_ok;
/* build ref list 0/1 */
h->i_ref0 = 0;
h->i_ref1 = 0;
- for( i = 1; i < h->frames.i_max_dpb; i++ )
+ for( i = 0; h->frames.reference[i]; i++ )
{
- if( h->frames.reference[i]->i_poc >= 0 )
+ if( h->frames.reference[i]->i_poc < i_poc )
{
- if( h->frames.reference[i]->i_poc < i_poc )
- {
- h->fref0[h->i_ref0++] = h->frames.reference[i];
- }
- else if( h->frames.reference[i]->i_poc > i_poc )
- {
- h->fref1[h->i_ref1++] = h->frames.reference[i];
- }
+ h->fref0[h->i_ref0++] = h->frames.reference[i];
+ }
+ else if( h->frames.reference[i]->i_poc > i_poc )
+ {
+ h->fref1[h->i_ref1++] = h->frames.reference[i];
}
}
* We use POC, but check whether explicit reordering is needed */
h->b_ref_reorder[0] =
h->b_ref_reorder[1] = 0;
- if( i_slice_type == SLICE_TYPE_P )
+ if( h->sh.i_type == SLICE_TYPE_P )
{
for( i = 0; i < h->i_ref0 - 1; i++ )
if( h->fref0[i]->i_frame_num < h->fref0[i+1]->i_frame_num )
h->mb.pic.i_fref[1] = h->i_ref1;
}
-static inline void x264_fdec_deblock( x264_t *h )
+static void x264_fdec_filter_row( x264_t *h, int mb_y ) /* row-wise post-processing of h->fdec: saves border lines into h->mb.intra_border_backup, deblocks, expands borders and builds the filtered planes, then records progress in i_lines_completed */
{
- /* apply deblocking filter to the current decoded picture */
- if( !h->sh.i_disable_deblocking_filter_idc )
+ /* mb_y is the mb to be encoded next, not the mb to be filtered here */
+ int b_hpel = h->fdec->b_kept_as_ref; /* border expansion and filtered planes are only produced for frames kept as reference */
+ int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
+ int b_end = mb_y == h->sps->i_mb_height; /* set when mb_y is one past the last mb row */
+ int min_y = mb_y - (1 << h->sh.b_mbaff); /* first row processed here: the previous row, or two rows back when b_mbaff is set */
+#ifndef DEBUG_DUMP_FRAME
+ b_deblock &= b_hpel; /* unless frames are being dumped, frames not kept as reference are not deblocked */
+#endif
+ if( mb_y & h->sh.b_mbaff ) /* with b_mbaff set, odd mb_y values do nothing */
+ return;
+ if( min_y < 0 ) /* no completed row above yet */
+ return;
+
+ if( !b_end )
+ {
+ int i, j;
+ for( j=0; j<=h->sh.b_mbaff; j++ ) /* one line without mbaff, two lines with it */
+ for( i=0; i<3; i++ ) /* the three planes; planes 1 and 2 use halved offsets and widths */
+ {
+ memcpy( h->mb.intra_border_backup[j][i],
+ h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i], /* the last line (last two with mbaff) above mb row mb_y, copied before the deblocking below runs */
+ h->sps->i_mb_width*16 >> !!i );
+ }
+ }
+
+ if( b_deblock )
{
- TIMER_START( i_mtime_filter );
- x264_frame_deblocking_filter( h, h->sh.i_type );
- TIMER_STOP( i_mtime_filter );
+ int max_y = b_end ? h->sps->i_mb_height : mb_y; /* both arms give the same value here, since b_end means mb_y == i_mb_height */
+ int y;
+ for( y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
+ x264_frame_deblock_row( h, y );
+ }
+
+ if( b_hpel )
+ {
+ x264_frame_expand_border( h, h->fdec, min_y, b_end );
+ x264_frame_filter( h->param.cpu, h->fdec, h->sh.b_mbaff, min_y, b_end );
+ x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
+ }
+
+ if( h->param.i_threads > 1 )
+ {
+ /* this must be an atomic store. a 32bit int should be so on sane architectures. */
+ h->fdec->i_lines_completed = mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)); /* at the end a large constant is added; otherwise the value stays X264_THREAD_HEIGHT lines (doubled with mbaff) behind mb_y*16 */
}
}
{
int i;
- x264_fdec_deblock( h );
-
- /* expand border */
- x264_frame_expand_border( h, h->fdec );
+ if( h->fdec->i_frame >= 0 )
+ h->i_frame++;
- /* create filtered images */
- x264_frame_filter( h->param.cpu, h->fdec, h->sh.b_mbaff );
-
- /* expand border of filtered images */
- x264_frame_expand_border_filtered( h, h->fdec );
+ if( !h->fdec->b_kept_as_ref )
+ {
+ if( h->param.i_threads > 1 )
+ {
+ x264_frame_put_unused( h, h->fdec );
+ h->fdec = x264_frame_get_unused( h );
+ }
+ return;
+ }
/* move lowres copy of the image to the ref frame */
for( i = 0; i < 4; i++)
h->frames.last_nonb = h->fdec;
/* move frame in the buffer */
- h->fdec = h->frames.reference[h->frames.i_max_dpb-1];
- for( i = h->frames.i_max_dpb-1; i > 0; i-- )
- {
- h->frames.reference[i] = h->frames.reference[i-1];
- }
- h->frames.reference[0] = h->fdec;
+ x264_frame_put( h->frames.reference, h->fdec );
+ if( h->frames.reference[h->frames.i_max_dpb] )
+ x264_frame_put_unused( h, x264_frame_get( h->frames.reference ) );
+ h->fdec = x264_frame_get_unused( h );
}
static inline void x264_reference_reset( x264_t *h )
{
- int i;
-
- /* reset ref pictures */
- for( i = 1; i < h->frames.i_max_dpb; i++ )
- {
- h->frames.reference[i]->i_poc = -1;
- }
- h->frames.reference[0]->i_poc = 0;
+ while( h->frames.reference[0] ) /* empty the reference list, releasing one reference on each frame taken from it */
+ x264_frame_put_unused( h, x264_frame_get( h->frames.reference ) );
+ h->fdec->i_poc =
+ h->fenc->i_poc = 0; /* both the reconstructed frame and the source frame restart at POC 0 */
}
-static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_slice_type, int i_global_qp )
+static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
{
/* ------------------------ Create slice header ----------------------- */
if( i_nal_type == NAL_SLICE_IDR )
{
- x264_slice_header_init( h, &h->sh, h->sps, h->pps, i_slice_type, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
+ x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
/* increment id */
h->i_idr_pic_id = ( h->i_idr_pic_id + 1 ) % 65536;
}
else
{
- x264_slice_header_init( h, &h->sh, h->sps, h->pps, i_slice_type, -1, h->i_frame_num, i_global_qp );
+ x264_slice_header_init( h, &h->sh, h->sps, h->pps, -1, h->i_frame_num, i_global_qp );
/* always set the real higher num of ref frame used */
h->sh.b_num_ref_idx_override = 1;
{
const int i_mb_y = mb_xy / h->sps->i_mb_width;
const int i_mb_x = mb_xy % h->sps->i_mb_width;
-
int mb_spos = bs_pos(&h->out.bs);
+ if( i_mb_x == 0 )
+ x264_fdec_filter_row( h, i_mb_y );
+
/* load cache */
x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
return 0;
}
-static inline int x264_slices_write( x264_t *h )
+static void x264_thread_sync_context( x264_t *dst, x264_t *src ) /* updates thread context dst from src: adjusts frame reference counts, then copies two member ranges of x264_t plus stat; does nothing when dst == src */
+{
+ x264_frame_t **f;
+ if( dst == src )
+ return;
+
+ // reference counting
+ for( f = src->frames.reference; *f; f++ )
+ (*f)->i_reference_count++; /* one extra reference on every frame in src's reference list */
+ for( f = dst->frames.reference; *f; f++ )
+ x264_frame_put_unused( src, *f ); /* release the frames in dst's current list; any that reach zero go to src's unused list */
+ src->fdec->i_reference_count++; /* same treatment for fdec: add a reference on src's ... */
+ x264_frame_put_unused( src, dst->fdec ); /* ... and release dst's */
+
+ // copy everything except the per-thread pointers and the constants.
+ memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) ); /* bytes from member i_frame up to, not including, mb.type; depends on x264_t member order */
+ memcpy( &dst->mb.i_type, &src->mb.i_type, offsetof(x264_t, rc) - offsetof(x264_t, mb.i_type) ); /* bytes from mb.i_type up to, not including, rc */
+ dst->stat = src->stat; /* stat is assigned as a whole struct */
+}
+
+static void x264_thread_sync_stat( x264_t *dst, x264_t *src ) /* copies part of src->stat into dst->stat, starting at member i_slice_count; does nothing when dst == src */
+{
+ if( dst == src )
+ return;
+ memcpy( &dst->stat.i_slice_count, &src->stat.i_slice_count, sizeof(dst->stat) - sizeof(dst->stat.frame) ); /* length is sizeof(stat) minus sizeof(stat.frame); presumably stat.frame is the first member and i_slice_count directly follows it, so everything except stat.frame is copied -- TODO confirm against the struct layout */
+}
+
+static int x264_slices_write( x264_t *h )
{
int i_frame_size;
x264_visualize_init( h );
#endif
- if( h->param.i_threads == 1 )
- {
- x264_ratecontrol_threads_start( h );
- x264_slice_write( h );
- i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
- }
- else
- {
- int i_nal = h->out.i_nal;
- int i_bs_size = h->out.i_bitstream / h->param.i_threads;
- int i;
- /* duplicate contexts */
- for( i = 0; i < h->param.i_threads; i++ )
- {
- x264_t *t = h->thread[i];
- int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
- int mb_width = h->sps->i_mb_width << h->sh.b_mbaff;
- if( i > 0 )
- {
- memcpy( t, h, sizeof(x264_t) );
- t->out.p_bitstream += i*i_bs_size;
- bs_init( &t->out.bs, t->out.p_bitstream, i_bs_size );
- t->i_thread_num = i;
- }
- t->sh.i_first_mb = (i * mb_height / h->param.i_threads) * mb_width;
- t->sh.i_last_mb = ((i+1) * mb_height / h->param.i_threads) * mb_width;
- t->out.i_nal = i_nal + i;
- }
- x264_ratecontrol_threads_start( h );
-
- /* dispatch */
-#ifdef HAVE_PTHREAD
- {
- pthread_t handles[X264_SLICE_MAX];
- for( i = 0; i < h->param.i_threads; i++ )
- pthread_create( &handles[i], NULL, (void*)x264_slice_write, (void*)h->thread[i] );
- for( i = 0; i < h->param.i_threads; i++ )
- pthread_join( handles[i], NULL );
- }
-#else
- for( i = 0; i < h->param.i_threads; i++ )
- x264_slice_write( h->thread[i] );
-#endif
-
- /* merge contexts */
- i_frame_size = h->out.nal[i_nal].i_payload;
- for( i = 1; i < h->param.i_threads; i++ )
- {
- int j;
- x264_t *t = h->thread[i];
- h->out.nal[i_nal+i] = t->out.nal[i_nal+i];
- i_frame_size += t->out.nal[i_nal+i].i_payload;
- // all entries in stat.frame are ints
- for( j = 0; j < sizeof(h->stat.frame) / sizeof(int); j++ )
- ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
- }
- h->out.i_nal = i_nal + h->param.i_threads;
- }
+ x264_slice_write( h );
+ i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
+ x264_fdec_filter_row( h, h->sps->i_mb_height );
#if VISUALIZE
if( h->param.b_visualize )
}
#endif
- return i_frame_size;
+ h->out.i_frame_size = i_frame_size;
+ return 0;
}
/****************************************************************************
x264_picture_t *pic_in,
x264_picture_t *pic_out )
{
- x264_frame_t *frame_psnr = h->fdec; /* just to keep the current decoded frame for psnr calculation */
+ x264_t *thread_current, *thread_prev, *thread_oldest;
int i_nal_type;
int i_nal_ref_idc;
- int i_slice_type;
- int i_frame_size;
-
- int i;
int i_global_qp;
- char psz_message[80];
+ if( h->param.i_threads > 1)
+ {
+ int i = ++h->i_thread_phase;
+ int t = h->param.i_threads;
+ thread_current = h->thread[ i%t ];
+ thread_prev = h->thread[ (i-1)%t ];
+ thread_oldest = h->thread[ (i+1)%t ];
+ x264_thread_sync_context( thread_current, thread_prev );
+ x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest );
+ h = thread_current;
+// fprintf(stderr, "current: %p prev: %p oldest: %p \n", thread_current, thread_prev, thread_oldest);
+ }
+ else
+ {
+ thread_current =
+ thread_prev =
+ thread_oldest = h;
+ }
+
+ // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
+ x264_reference_update( h );
+ h->fdec->i_lines_completed = -1;
/* no data out */
*pi_nal = 0;
*pp_nal = NULL;
-
/* ------------------- Setup new frame from picture -------------------- */
TIMER_START( i_mtime_encode_frame );
if( pic_in != NULL )
{
/* 1: Copy the picture to a frame and move it to a buffer */
- x264_frame_t *fenc = x264_frame_get( h->frames.unused );
+ x264_frame_t *fenc = x264_frame_get_unused( h );
x264_frame_copy_picture( h, fenc, pic_in );
if( h->frames.b_have_lowres )
x264_frame_init_lowres( h->param.cpu, fenc );
- if( h->frames.i_input <= h->frames.i_delay )
+ if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
{
/* Nothing yet to encode */
/* waiting for filling bframe buffer */
int bframes = 0;
/* 2: Select frame types */
if( h->frames.next[0] == NULL )
+ {
+ x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
return 0;
+ }
x264_slicetype_decide( h );
i_nal_type = NAL_SLICE_IDR;
i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
- i_slice_type = SLICE_TYPE_I;
+ h->sh.i_type = SLICE_TYPE_I;
}
else if( h->fenc->i_type == X264_TYPE_I )
{
i_nal_type = NAL_SLICE;
i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
- i_slice_type = SLICE_TYPE_I;
+ h->sh.i_type = SLICE_TYPE_I;
}
else if( h->fenc->i_type == X264_TYPE_P )
{
i_nal_type = NAL_SLICE;
i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
- i_slice_type = SLICE_TYPE_P;
+ h->sh.i_type = SLICE_TYPE_P;
}
else if( h->fenc->i_type == X264_TYPE_BREF )
{
i_nal_type = NAL_SLICE;
i_nal_ref_idc = NAL_PRIORITY_HIGH; /* maybe add MMCO to forget it? -> low */
- i_slice_type = SLICE_TYPE_B;
+ h->sh.i_type = SLICE_TYPE_B;
}
else /* B frame */
{
i_nal_type = NAL_SLICE;
i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE;
- i_slice_type = SLICE_TYPE_B;
+ h->sh.i_type = SLICE_TYPE_B;
}
h->fdec->i_poc =
h->fdec->i_type = h->fenc->i_type;
h->fdec->i_frame = h->fenc->i_frame;
h->fenc->b_kept_as_ref =
- h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE;
+ h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1;
/* ------------------- Init ----------------------------- */
/* build ref list 0/1 */
- x264_reference_build_list( h, h->fdec->i_poc, i_slice_type );
+ x264_reference_build_list( h, h->fdec->i_poc );
/* Init the rate control */
- x264_ratecontrol_start( h, i_slice_type, h->fenc->i_qpplus1 );
+ x264_ratecontrol_start( h, h->fenc->i_qpplus1 );
i_global_qp = x264_ratecontrol_qp( h );
pic_out->i_qpplus1 =
h->fdec->i_qpplus1 = i_global_qp + 1;
- if( i_slice_type == SLICE_TYPE_B )
+ if( h->sh.i_type == SLICE_TYPE_B )
x264_macroblock_bipred_init( h );
/* ------------------------ Create slice header ----------------------- */
- x264_slice_init( h, i_nal_type, i_slice_type, i_global_qp );
+ x264_slice_init( h, i_nal_type, i_global_qp );
- if( h->fenc->b_kept_as_ref )
+ if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
h->i_frame_num++;
/* ---------------------- Write the bitstream -------------------------- */
if(h->param.b_aud){
int pic_type;
- if(i_slice_type == SLICE_TYPE_I)
+ if(h->sh.i_type == SLICE_TYPE_I)
pic_type = 0;
- else if(i_slice_type == SLICE_TYPE_P)
+ else if(h->sh.i_type == SLICE_TYPE_P)
pic_type = 1;
- else if(i_slice_type == SLICE_TYPE_B)
+ else if(h->sh.i_type == SLICE_TYPE_B)
pic_type = 2;
else
pic_type = 7;
}
/* Write frame */
- i_frame_size = x264_slices_write( h );
+ if( h->param.i_threads > 1 )
+ {
+ pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h );
+ h->b_thread_active = 1;
+ }
+ else
+ x264_slices_write( h );
/* restore CPU state (before using float again) */
x264_cpu_restore( h->param.cpu );
- if( i_slice_type == SLICE_TYPE_P && !h->param.rc.b_stat_read
- && h->param.i_scenecut_threshold >= 0 )
+ if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read
+ && h->param.i_scenecut_threshold >= 0
+ && !h->param.b_pre_scenecut )
{
const int *mbs = h->stat.frame.i_mb_count;
int i_mb_i = mbs[I_16x16] + mbs[I_8x8] + mbs[I_4x4];
/* macroblock_analyse() doesn't further analyse skipped mbs,
* so we have to guess their cost */
- if( i_mb_s < i_mb )
- i_intra_cost = i_intra_cost * i_mb / (i_mb - i_mb_s);
+ if( h->stat.frame.i_mbs_analysed > 0 )
+ i_intra_cost = i_intra_cost * i_mb / h->stat.frame.i_mbs_analysed;
if( i_gop_size < h->param.i_keyint_min / 4 )
f_bias = f_thresh_min / 4;
f_bias = X264_MIN( f_bias, 1.0 );
/* Bad P will be reencoded as I */
- if( i_mb_s < i_mb &&
+ if( h->stat.frame.i_mbs_analysed > 0 &&
i_inter_cost >= (1.0 - f_bias) * i_intra_cost )
{
int b;
- x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%.0f Pcost:%.0f ratio:%.3f bias=%.3f lastIDR:%d (I:%d P:%d S:%d)\n",
+ x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%.0f Pcost:%.0f ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d smb:%d)\n",
h->fenc->i_frame,
(double)i_intra_cost, (double)i_inter_cost,
- (double)i_inter_cost / i_intra_cost,
+ 1. - (double)i_inter_cost / i_intra_cost,
f_bias, i_gop_size,
i_mb_i, i_mb_p, i_mb_s );
/* Do IDR if needed */
else if( i_gop_size >= h->param.i_keyint_min )
{
- x264_frame_t *tmp;
-
/* Reset */
h->i_frame_num = 0;
h->fenc->i_poc = 0;
/* Put enqueued frames back in the pool */
- while( (tmp = x264_frame_get( h->frames.current ) ) != NULL )
- x264_frame_put( h->frames.next, tmp );
+ while( h->frames.current[0] )
+ x264_frame_put( h->frames.next, x264_frame_get( h->frames.current ) );
x264_frame_sort_pts( h->frames.next );
}
else
}
}
+ x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
+ return 0;
+}
+
+static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+ x264_nal_t **pp_nal, int *pi_nal,
+ x264_picture_t *pic_out )
+{
+ int i;
+ char psz_message[80];
+
+ if( h->b_thread_active )
+ {
+ pthread_join( h->thread_handle, NULL );
+ h->b_thread_active = 0;
+ }
+ if( !h->out.i_nal )
+ {
+ pic_out->i_type = X264_TYPE_AUTO;
+ return;
+ }
+
+ x264_frame_put_unused( thread_current, h->fenc );
+
/* End bitstream, set output */
*pi_nal = h->out.i_nal;
*pp_nal = h->out.nal;
+ h->out.i_nal = 0;
/* Set output picture properties */
- if( i_slice_type == SLICE_TYPE_I )
- pic_out->i_type = i_nal_type == NAL_SLICE_IDR ? X264_TYPE_IDR : X264_TYPE_I;
- else if( i_slice_type == SLICE_TYPE_P )
+ if( h->sh.i_type == SLICE_TYPE_I )
+ pic_out->i_type = h->i_nal_type == NAL_SLICE_IDR ? X264_TYPE_IDR : X264_TYPE_I;
+ else if( h->sh.i_type == SLICE_TYPE_P )
pic_out->i_type = X264_TYPE_P;
else
pic_out->i_type = X264_TYPE_B;
/* update rc */
x264_cpu_restore( h->param.cpu );
- x264_ratecontrol_end( h, i_frame_size * 8 );
-
- /* handle references */
- if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1 )
- x264_reference_update( h );
-#ifdef DEBUG_DUMP_FRAME
- else
- x264_fdec_deblock( h );
-#endif
- x264_frame_put( h->frames.unused, h->fenc );
-
- /* increase frame count */
- h->i_frame++;
+ x264_ratecontrol_end( h, h->out.i_frame_size * 8 );
/* restore CPU state (before using float again) */
x264_cpu_restore( h->param.cpu );
TIMER_STOP( i_mtime_encode_frame );
/* ---------------------- Compute/Print statistics --------------------- */
+ x264_thread_sync_stat( h, h->thread[0] );
+
/* Slice stat */
- h->stat.i_slice_count[i_slice_type]++;
- h->stat.i_slice_size[i_slice_type] += i_frame_size + NALU_OVERHEAD;
- h->stat.i_slice_qp[i_slice_type] += i_global_qp;
+ h->stat.i_slice_count[h->sh.i_type]++;
+ h->stat.i_slice_size[h->sh.i_type] += h->out.i_frame_size + NALU_OVERHEAD;
+ h->stat.i_slice_qp[h->sh.i_type] += h->fdec->i_qpplus1 - 1;
for( i = 0; i < X264_MBTYPE_MAX; i++ )
h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
for( i = 0; i < 32; i++ )
h->stat.i_mb_count_ref[h->sh.i_type][i] += h->stat.frame.i_mb_count_ref[i];
}
- if( i_slice_type == SLICE_TYPE_B )
+ if( h->sh.i_type == SLICE_TYPE_B )
{
h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
if( h->mb.b_direct_auto_write )
for( i=0; i<3; i++ )
{
sqe[i] = x264_pixel_ssd_wxh( &h->pixf,
- frame_psnr->plane[i], frame_psnr->i_stride[i],
+ h->fdec->plane[i], h->fdec->i_stride[i],
h->fenc->plane[i], h->fenc->i_stride[i],
h->param.i_width >> !!i, h->param.i_height >> !!i );
}
x264_cpu_restore( h->param.cpu );
- h->stat.i_sqe_global[i_slice_type] += sqe[0] + sqe[1] + sqe[2];
- h->stat.f_psnr_average[i_slice_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
- h->stat.f_psnr_mean_y[i_slice_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
- h->stat.f_psnr_mean_u[i_slice_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
- h->stat.f_psnr_mean_v[i_slice_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
+ h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
+ h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
+ h->stat.f_psnr_mean_y[h->sh.i_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
+ h->stat.f_psnr_mean_u[h->sh.i_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
+ h->stat.f_psnr_mean_v[h->sh.i_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f",
x264_psnr( sqe[0], h->param.i_width * h->param.i_height ),
{
// offset by 2 pixels to avoid alignment of ssim blocks with dct blocks
float ssim_y = x264_pixel_ssim_wxh( &h->pixf,
- frame_psnr->plane[0] + 2+2*frame_psnr->i_stride[0], frame_psnr->i_stride[0],
+ h->fdec->plane[0] + 2+2*h->fdec->i_stride[0], h->fdec->i_stride[0],
h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0],
h->param.i_width-2, h->param.i_height-2 );
- h->stat.f_ssim_mean_y[i_slice_type] += ssim_y;
+ h->stat.f_ssim_mean_y[h->sh.i_type] += ssim_y;
snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
" SSIM Y:%.5f", ssim_y );
}
x264_log( h, X264_LOG_DEBUG,
"frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
- h->i_frame - 1,
- i_global_qp,
- i_nal_ref_idc,
- i_slice_type == SLICE_TYPE_I ? 'I' : (i_slice_type == SLICE_TYPE_P ? 'P' : 'B' ),
- frame_psnr->i_poc,
+ h->i_frame,
+ h->fdec->i_qpplus1 - 1,
+ h->i_nal_ref_idc,
+ h->sh.i_type == SLICE_TYPE_I ? 'I' : (h->sh.i_type == SLICE_TYPE_P ? 'P' : 'B' ),
+ h->fdec->i_poc,
h->stat.frame.i_mb_count_i,
h->stat.frame.i_mb_count_p,
h->stat.frame.i_mb_count_skip,
- i_frame_size,
+ h->out.i_frame_size,
psz_message );
+ // keep stats all in one place
+ x264_thread_sync_stat( h->thread[0], h );
+ // for the use of the next frame
+ x264_thread_sync_stat( thread_current, h );
#ifdef DEBUG_MB_TYPE
{
#ifdef DEBUG_DUMP_FRAME
/* Dump reconstructed frame */
- x264_frame_dump( h, frame_psnr, "fdec.yuv" );
+ x264_frame_dump( h, h->fdec, "fdec.yuv" );
#endif
- return 0;
}
/****************************************************************************
int64_t i_yuv_size = 3 * h->param.i_width * h->param.i_height / 2;
int i;
+ for( i=0; i<h->param.i_threads; i++ )
+ {
+ // don't strictly have to wait for the other threads, but it's simpler than cancelling them
+ if( h->thread[i]->b_thread_active )
+ pthread_join( h->thread[i]->thread_handle, NULL );
+ }
+
#ifdef DEBUG_BENCHMARK
x264_log( h, X264_LOG_INFO,
"analyse=%d(%lldms) encode=%d(%lldms) write=%d(%lldms) filter=%d(%lldms)\n",
}
/* frames */
- for( i = 0; i < X264_BFRAME_MAX + 3; i++ )
- {
- if( h->frames.current[i] ) x264_frame_delete( h->frames.current[i] );
- if( h->frames.next[i] ) x264_frame_delete( h->frames.next[i] );
- if( h->frames.unused[i] ) x264_frame_delete( h->frames.unused[i] );
- }
- /* ref frames */
- for( i = 0; i < h->frames.i_max_dpb; i++ )
- {
+ for( i = 0; h->frames.current[i]; i++ )
+ x264_frame_delete( h->frames.current[i] );
+ for( i = 0; h->frames.next[i]; i++ )
+ x264_frame_delete( h->frames.next[i] );
+ for( i = 0; h->frames.unused[i]; i++ )
+ x264_frame_delete( h->frames.unused[i] );
+ for( i = 0; h->frames.reference[i]; i++ )
x264_frame_delete( h->frames.reference[i] );
- }
/* rc */
x264_ratecontrol_delete( h );
free( h->param.rc.psz_rc_eq );
x264_cqm_delete( h );
- x264_macroblock_cache_end( h );
- x264_free( h->out.p_bitstream );
- for( i = 1; i < h->param.i_threads; i++ )
+ for( i = h->param.i_threads - 1; i >= 0; i-- )
+ {
+ x264_macroblock_cache_end( h->thread[i] );
+ x264_free( h->thread[i]->out.p_bitstream );
x264_free( h->thread[i] );
- x264_free( h );
+ }
}
int mv_x_max = h->mb.mv_max_fpel[0];
int mv_y_max = h->mb.mv_max_fpel[1];
+#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
+
const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
- if( h->mb.i_me_method == X264_ME_UMH )
- {
- /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */
- p_cost_mvx = m->p_cost_mv - x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
- p_cost_mvy = m->p_cost_mv - x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
- }
-
bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
pmx = ( bmx + 2 ) >> 2;
COST_MV( 0, 0 );
- mv_x_max += 8;
- mv_y_max += 8;
- mv_x_min -= 8;
- mv_y_min -= 8;
-
switch( h->mb.i_me_method )
{
case X264_ME_DIA:
DIA1_ITER( bmx, bmy );
if( bmx == omx && bmy == omy )
break;
+ if( !CHECK_MVRANGE(bmx, bmy) )
+ break;
}
break;
COST_MV( omx-1, omy-2 );
if( bmx == omx && bmy == omy )
break;
+ if( !CHECK_MVRANGE(bmx, bmy) )
+ break;
}
#else
/* equivalent to the above, but eliminates duplicate candidates */
bmx += hex2[dir+1][0];
bmy += hex2[dir+1][1];
/* half hexagon, not overlapping the previous iteration */
- for( i = 1; i < i_me_range/2; i++ )
+ for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ )
{
static const int mod6[8] = {5,0,1,2,3,4,5,0};
const int odir = mod6[dir+1];
{
int mx = omx + hex4[j][0]*i;
int my = omy + hex4[j][1]*i;
- if( mx >= mv_x_min && mx <= mv_x_max
- && my >= mv_y_min && my <= mv_y_max )
+ if( CHECK_MVRANGE(mx, my) )
COST_MV( mx, my );
}
}
m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 )
m->cost += m->cost_mv;
-
+
/* subpel refine */
if( h->mb.i_subpel_refine >= 2 )
{
int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
}
+ else if( m->mv[1] > h->mb.mv_max_spel[1] )
+ m->mv[1] = h->mb.mv_max_spel[1];
}
#undef COST_MV
if( !b_refine_qpel )
{
+ /* check for mvrange */
+ if( bmy > h->mb.mv_max_spel[1] )
+ bmy = h->mb.mv_max_spel[1];
bcost = COST_MAX;
COST_MV_SATD( bmx, bmy, -1 );
}
break;
}
+ /* check for mvrange */
+ if( bmy > h->mb.mv_max_spel[1] )
+ {
+ bmy = h->mb.mv_max_spel[1];
+ bcost = COST_MAX;
+ COST_MV_SATD( bmx, bmy, -1 );
+ }
+
m->cost = bcost;
m->mv[0] = bmx;
m->mv[1] = bmy;
BIME_CACHE( 0, 0 );
CHECK_BIDIR( 0, 0, 0, 0 );
+ if( bm0y > h->mb.mv_max_spel[1] - 8 ||
+ bm1y > h->mb.mv_max_spel[1] - 8 )
+ return bcost;
+
for( pass = 0; pass < 8; pass++ )
{
/* check all mv pairs that differ in at most 2 components from the current mvs. */
bdir = -1;
for( i = 0; i < 2; i++ )
{
+ if( bmy > h->mb.mv_max_spel[1] - 2 )
+ break;
omx = bmx;
omy = bmy;
odir = bdir;
bdir = -1;
for( i = 0; i < 2; i++ )
{
+ if( bmy > h->mb.mv_max_spel[1] - 1 )
+ break;
omx = bmx;
omy = bmy;
odir = bdir;
break;
}
+ if( bmy > h->mb.mv_max_spel[1] )
+ bmy = h->mb.mv_max_spel[1];
+
m->cost = bcost;
m->mv[0] = bmx;
m->mv[1] = bmy;
int qp; /* qp for current frame */
int qpm; /* qp for current macroblock */
float qpa; /* average of macroblocks' qp */
- int slice_type;
int qp_force;
/* VBV stuff */
double buffer_size;
- double buffer_fill;
+ double buffer_fill_final; /* real buffer as of the last finished frame */
+ double buffer_fill; /* planned buffer, if all in-progress frames hit their bit budget */
double buffer_rate; /* # of bits added to buffer_fill after each frame */
- predictor_t pred[5]; /* predict frame size from satd */
+ predictor_t *pred; /* predict frame size from satd */
/* ABR stuff */
int last_satd;
/* MBRC stuff */
double frame_size_planned;
- int first_row, last_row; /* region of the frame to be encoded by this thread */
predictor_t *row_pred;
predictor_t row_preds[5];
- predictor_t pred_b_from_p; /* predict B-frame size from P-frame satd */
+ predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
int bframes; /* # consecutive B-frames before this P-frame */
int bframe_bits; /* total cost of those frames */
static int parse_zones( x264_t *h );
static int init_pass2(x264_t *);
-static float rate_estimate_qscale( x264_t *h, int pict_type );
+static float rate_estimate_qscale( x264_t *h );
static void update_vbv( x264_t *h, int bits );
+static void update_vbv_plan( x264_t *h );
static double predict_size( predictor_t *p, double q, double var );
static void update_predictor( predictor_t *p, double q, double var, double bits );
int x264_rc_analyse_slice( x264_t *h );
x264_cpu_restore( h->param.cpu );
- h->rc = rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
+ rc = h->rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
memset( rc, 0, h->param.i_threads * sizeof(x264_ratecontrol_t) );
rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
}
rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000 / rc->fps;
rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000;
- rc->buffer_fill = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
+ rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
* 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
rc->b_vbv = 1;
rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
rc->last_qscale = qp2qscale(26);
+ rc->pred = x264_malloc( 5*sizeof(predictor_t) );
+ rc->pred_b_from_p = x264_malloc( sizeof(predictor_t) );
for( i = 0; i < 5; i++ )
{
rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
rc->row_preds[i].count= 1.0;
rc->row_preds[i].decay= 0.5;
}
- rc->pred_b_from_p = rc->pred[0];
+ *rc->pred_b_from_p = rc->pred[0];
if( parse_zones( h ) < 0 )
return -1;
x264_free( p );
}
+ for( i=1; i<h->param.i_threads; i++ )
+ {
+ h->thread[i]->rc = rc+i;
+ rc[i] = rc[0];
+ }
+
return 0;
}
}
x264_free( rc->psz_stat_file_tmpname );
}
+ x264_free( rc->pred );
+ x264_free( rc->pred_b_from_p );
x264_free( rc->entry );
x264_free( rc->zones );
x264_free( rc );
}
+static void accum_p_qp_update( x264_t *h, float qp ) /* folds qp into the running sums rc->accum_p_qp and rc->accum_p_norm, scaling the previous totals by .95 on every call */
+{
+ x264_ratecontrol_t *rc = h->rc;
+ rc->accum_p_qp *= .95;
+ rc->accum_p_norm *= .95;
+ rc->accum_p_norm += 1; /* each call contributes weight 1 to the normaliser */
+ if( h->sh.i_type == SLICE_TYPE_I )
+ rc->accum_p_qp += qp + rc->ip_offset; /* for I slices the qp is shifted by rc->ip_offset before being accumulated */
+ else
+ rc->accum_p_qp += qp;
+}
+
/* Before encoding a frame, choose a QP for it */
-void x264_ratecontrol_start( x264_t *h, int i_slice_type, int i_force_qp )
+void x264_ratecontrol_start( x264_t *h, int i_force_qp )
{
x264_ratecontrol_t *rc = h->rc;
ratecontrol_entry_t *rce = NULL;
+ float q;
x264_cpu_restore( h->param.cpu );
rc->qp_force = i_force_qp;
- rc->slice_type = i_slice_type;
if( h->param.rc.b_stat_read )
{
assert( frame >= 0 && frame < rc->num_entries );
rce = h->rc->rce = &h->rc->entry[frame];
- if( i_slice_type == SLICE_TYPE_B
+ if( h->sh.i_type == SLICE_TYPE_B
&& h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
{
h->sh.b_direct_spatial_mv_pred = ( rce->direct_mode == 's' );
}
}
- if( h->fdec->i_row_bits )
+ if( rc->b_vbv )
{
memset( h->fdec->i_row_bits, 0, h->sps->i_mb_height * sizeof(int) );
+ rc->row_pred = &rc->row_preds[h->sh.i_type];
+ update_vbv_plan( h );
}
- if( i_slice_type != SLICE_TYPE_B )
+ if( h->sh.i_type != SLICE_TYPE_B )
{
- rc->bframe_bits = 0;
rc->bframes = 0;
while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) )
rc->bframes++;
if( i_force_qp )
{
- rc->qpm = rc->qp = i_force_qp - 1;
+ q = i_force_qp - 1;
}
else if( rc->b_abr )
{
- rc->qpm = rc->qp =
- x264_clip3( (int)(qscale2qp( rate_estimate_qscale( h, i_slice_type ) ) + .5), 0, 51 );
+ q = qscale2qp( rate_estimate_qscale( h ) );
}
else if( rc->b_2pass )
{
- rce->new_qscale = rate_estimate_qscale( h, i_slice_type );
- rc->qpm = rc->qp = rce->new_qp =
- x264_clip3( (int)(qscale2qp(rce->new_qscale) + 0.5), 0, 51 );
+ rce->new_qscale = rate_estimate_qscale( h );
+ q = qscale2qp( rce->new_qscale );
}
else /* CQP */
{
x264_zone_t *zone = get_zone( h, h->fenc->i_frame );
- float q;
- if( i_slice_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref )
+ if( h->sh.i_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref )
q = ( rc->qp_constant[ SLICE_TYPE_B ] + rc->qp_constant[ SLICE_TYPE_P ] ) / 2;
else
- q = rc->qp_constant[ i_slice_type ];
+ q = rc->qp_constant[ h->sh.i_type ];
if( zone )
{
else
q -= 6*log(zone->f_bitrate_factor)/log(2);
}
-
- rc->qpm = rc->qp = (int)(q + 0.5);
}
+
+ h->fdec->f_qp_avg =
+ rc->qpm =
+ rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
+ if( rce )
+ rce->new_qp = rc->qp;
+
+ /* accum_p_qp needs to be here so that future frames can benefit from the
+ * data before this frame is done. but this only works because threading
+ * guarantees to not re-encode any frames. so the non-threaded case does
+ * accum_p_qp later. */
+ if( h->param.i_threads > 1 )
+ accum_p_qp_update( h, rc->qp );
+
+ if( h->sh.i_type != SLICE_TYPE_B )
+ rc->last_non_b_pict_type = h->sh.i_type;
}
double predict_row_size( x264_t *h, int y, int qp )
x264_ratecontrol_t *rc = h->rc;
double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
double pred_t = 0;
- if( rc->slice_type != SLICE_TYPE_I
+ if( h->sh.i_type != SLICE_TYPE_I
&& h->fref0[0]->i_type == h->fdec->i_type
&& h->fref0[0]->i_row_satd[y] > 0 )
{
{
int i;
double bits = 0;
- for( i = h->rc->first_row; i <= y; i++ )
+ for( i = 0; i <= y; i++ )
bits += h->fdec->i_row_bits[i];
- for( i = y+1; i <= h->rc->last_row; i++ )
+ for( i = y+1; i < h->sps->i_mb_height; i++ )
bits += predict_row_size( h, i, qp );
return bits;
}
h->fdec->i_row_qp[y] = rc->qpm;
- if( rc->slice_type == SLICE_TYPE_B )
+ if( h->sh.i_type == SLICE_TYPE_B )
{
/* B-frames shouldn't use lower QP than their reference frames */
- if( y < rc->last_row )
+ if( y < h->sps->i_mb_height-1 )
{
rc->qpm = X264_MAX( rc->qp,
X264_MIN( h->fref0[0]->i_row_qp[y+1],
update_predictor( rc->row_pred, qp2qscale(rc->qpm), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
/* tweak quality based on difference from predicted size */
- if( y < rc->last_row && h->stat.i_slice_count[rc->slice_type] > 0 )
+ if( y < h->sps->i_mb_height-1 && h->stat.i_slice_count[h->sh.i_type] > 0 )
{
int prev_row_qp = h->fdec->i_row_qp[y];
int b0 = predict_row_size_sum( h, y, rc->qpm );
h->stat.frame.i_mb_count_p += mbs[i];
if( h->mb.b_variable_qp )
- {
- for( i = 1; i < h->param.i_threads; i++ )
- rc->qpa += rc[i].qpa;
rc->qpa /= h->mb.i_mb_count;
- }
else
rc->qpa = rc->qp;
h->fdec->f_qp_avg = rc->qpa;
if( h->param.rc.b_stat_write )
{
- char c_type = rc->slice_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i')
- : rc->slice_type==SLICE_TYPE_P ? 'P'
+ char c_type = h->sh.i_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i')
+ : h->sh.i_type==SLICE_TYPE_P ? 'P'
: h->fenc->b_kept_as_ref ? 'B' : 'b';
int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
if( rc->b_abr )
{
- if( rc->slice_type != SLICE_TYPE_B )
+ if( h->sh.i_type != SLICE_TYPE_B )
rc->cplxr_sum += bits * qp2qscale(rc->qpa) / rc->last_rceq;
else
{
rc->wanted_bits_window += rc->bitrate / rc->fps;
rc->wanted_bits_window *= rc->cbr_decay;
- rc->accum_p_qp *= .95;
- rc->accum_p_norm *= .95;
- rc->accum_p_norm += 1;
- if( rc->slice_type == SLICE_TYPE_I )
- rc->accum_p_qp += rc->qpa * fabs(h->param.rc.f_ip_factor);
- else
- rc->accum_p_qp += rc->qpa;
+ if( h->param.i_threads == 1 )
+ accum_p_qp_update( h, rc->qpa );
}
if( rc->b_2pass )
if( h->mb.b_variable_qp )
{
- if( rc->slice_type == SLICE_TYPE_B )
+ if( h->sh.i_type == SLICE_TYPE_B )
{
rc->bframe_bits += bits;
if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) )
- update_predictor( &rc->pred_b_from_p, qp2qscale(rc->qpa), h->fref1[0]->i_satd, rc->bframe_bits / rc->bframes );
- }
- else
- {
- /* Update row predictor based on data collected by other threads. */
- int y;
- for( y = rc->last_row+1; y < h->sps->i_mb_height; y++ )
- update_predictor( rc->row_pred, qp2qscale(h->fdec->i_row_qp[y]), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
- rc->row_preds[rc->slice_type] = *rc->row_pred;
+ {
+ update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa),
+ h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
+ rc->bframe_bits = 0;
+ }
}
}
update_vbv( h, bits );
-
- if( rc->slice_type != SLICE_TYPE_B )
- rc->last_non_b_pict_type = rc->slice_type;
}
/****************************************************************************
p->coeff += bits*q / var;
}
+/* update VBV after encoding a frame.
+ * bits is subtracted from the buffer fill, so it is presumably the size of
+ * the frame that was just encoded -- confirm against the caller.
+ * The size predictor and the fill level are written through
+ * h->thread[0]->rc rather than h->rc. */
static void update_vbv( x264_t *h, int bits )
{
x264_ratecontrol_t *rcc = h->rc;
+ x264_ratecontrol_t *rct = h->thread[0]->rc; // ratecontrol context of thread 0
if( rcc->last_satd >= h->mb.i_mb_count )
- update_predictor( &rcc->pred[rcc->slice_type], qp2qscale(rcc->qpa), rcc->last_satd, bits );
+ update_predictor( &rct->pred[h->sh.i_type], qp2qscale(rcc->qpa), rcc->last_satd, bits );
if( !rcc->b_vbv )
return;
- rcc->buffer_fill += rcc->buffer_rate - bits;
- if( rcc->buffer_fill < 0 && !rcc->b_2pass )
- x264_log( h, X264_LOG_WARNING, "VBV underflow (%.0f bits)\n", rcc->buffer_fill );
- rcc->buffer_fill = x264_clip3( rcc->buffer_fill, 0, rcc->buffer_size );
+ rct->buffer_fill_final += rct->buffer_rate - bits;
+ if( rct->buffer_fill_final < 0 && !rct->b_2pass )
+ x264_log( h, X264_LOG_WARNING, "VBV underflow (%.0f bits)\n", rct->buffer_fill_final );
+ rct->buffer_fill_final = x264_clip3( rct->buffer_fill_final, 0, rct->buffer_size ); // clamp to [0, buffer_size]
+}
+
+/* provisionally update VBV according to the planned size of all frames currently in progress.
+ * Seeds this context's buffer_fill from the final fill level kept in
+ * thread 0's context; with more than one thread it then applies the
+ * planned frame size of every other context that is flagged active. */
+static void update_vbv_plan( x264_t *h )
+{
+ x264_ratecontrol_t *rcc = h->rc;
+ rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
+ if( h->param.i_threads > 1 )
+ {
+ int j = h->rc - h->thread[0]->rc; // pointer difference used as this context's thread index; assumes the rc structs form one array ordered like h->thread[] -- TODO confirm
+ int i;
+ for( i=1; i<h->param.i_threads; i++ ) // visit the other i_threads-1 contexts, wrapping around after this one
+ {
+ x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+ double bits = t->rc->frame_size_planned;
+ if( !t->b_thread_active )
+ continue; // contexts not flagged active contribute nothing
+ rcc->buffer_fill += rcc->buffer_rate - bits;
+ rcc->buffer_fill = x264_clip3( rcc->buffer_fill, 0, rcc->buffer_size ); // clamp after each planned frame
+ }
+ }
}
// apply VBV constraints and clip qscale to between lmin and lmax
{
/* Now a hard threshold to make sure the frame fits in VBV.
* This one is mostly for I-frames. */
- double bits = predict_size( &rcc->pred[rcc->slice_type], q, rcc->last_satd );
+ double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
double qf = 1.0;
if( bits > rcc->buffer_fill/2 )
qf = x264_clip3f( rcc->buffer_fill/(2*bits), 0.2, 1.0 );
/* Check B-frame complexity, and use up any bits that would
* overflow before the next P-frame. */
- if( rcc->slice_type == SLICE_TYPE_P )
+ if( h->sh.i_type == SLICE_TYPE_P )
{
int nb = rcc->bframes;
double pbbits = bits;
- double bbits = predict_size( &rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
+ double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
double space;
if( bbits > rcc->buffer_rate )
}
// update qscale for 1 frame based on actual bits used so far
-static float rate_estimate_qscale(x264_t *h, int pict_type)
+static float rate_estimate_qscale( x264_t *h )
{
float q;
x264_ratecontrol_t *rcc = h->rc;
ratecontrol_entry_t rce;
+ int pict_type = h->sh.i_type;
double lmin = rcc->lmin[pict_type];
double lmax = rcc->lmax[pict_type];
int64_t total_bits = 8*(h->stat.i_slice_size[SLICE_TYPE_I]
else
q += rcc->pb_offset;
+ rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
rcc->last_satd = 0;
return qp2qscale(q);
}
* tradeoff between quality and bitrate precision. But at large
* tolerances, the bit distribution approaches that of 2pass. */
- double wanted_bits, overflow, lmin, lmax;
+ double wanted_bits, overflow=1, lmin, lmax;
rcc->last_satd = x264_rc_analyse_slice( h );
rcc->short_term_cplxsum *= 0.5;
if( h->param.rc.i_rc_method == X264_RC_CRF )
{
q = get_qscale( h, &rce, rcc->rate_factor_constant, h->fenc->i_frame );
- overflow = 1;
}
else
{
+ int i_frame_done = h->fenc->i_frame + 1 - h->param.i_threads;
+
q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
- wanted_bits = h->fenc->i_frame * rcc->bitrate / rcc->fps;
- abr_buffer *= X264_MAX( 1, sqrt(h->fenc->i_frame/25) );
- overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
- q *= overflow;
+ // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
+ wanted_bits = i_frame_done * rcc->bitrate / rcc->fps;
+ if( wanted_bits > 0 )
+ {
+ abr_buffer *= X264_MAX( 1, sqrt(i_frame_done/25) );
+ overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
+ q *= overflow;
+ }
}
if( pict_type == SLICE_TYPE_I && h->param.i_keyint_max > 1
if( !rcc->b_2pass && h->fenc->i_frame == 0 )
rcc->last_qscale_for[SLICE_TYPE_P] = q;
- rcc->frame_size_planned = predict_size( &rcc->pred[rcc->slice_type], q, rcc->last_satd );
-
+ rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
return q;
}
}
-/* Distribute bits among the slices, proportional to their estimated complexity */
-void x264_ratecontrol_threads_start( x264_t *h )
+void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next ) /* copy ratecontrol state prev->cur and cur->next */
{
- x264_ratecontrol_t *rc = h->rc;
- int t, y;
- double den = 0;
- double frame_size_planned = rc->frame_size_planned;
-
- for( t = 0; t < h->param.i_threads; t++ )
+ if( cur != prev ) // nothing to copy when both are the same context
{
- h->thread[t]->rc = &rc[t];
- if( t > 0 )
- rc[t] = rc[0];
+#define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
+ /* these vars are updated in x264_ratecontrol_start()
+ * so copy them from the context that most recently started (prev)
+ * to the context that's about to start (cur).
+ * memcpy is used here instead of plain assignment, presumably because
+ * some of these members (e.g. last_qscale_for) are arrays -- TODO confirm.
+ */
+ COPY(accum_p_qp);
+ COPY(accum_p_norm);
+ COPY(last_satd);
+ COPY(last_rceq);
+ COPY(last_qscale_for);
+ COPY(last_non_b_pict_type);
+ COPY(short_term_cplxsum);
+ COPY(short_term_cplxcount);
+ COPY(bframes);
+#undef COPY
}
-
- if( !h->mb.b_variable_qp || rc->slice_type == SLICE_TYPE_B )
- return;
-
- for( t = 0; t < h->param.i_threads; t++ )
+ if( cur != next ) // nothing to copy when both are the same context
{
- rc[t].first_row = h->thread[t]->sh.i_first_mb / h->sps->i_mb_width;
- rc[t].last_row = (h->thread[t]->sh.i_last_mb-1) / h->sps->i_mb_width;
- rc[t].frame_size_planned = 1;
- rc[t].row_pred = &rc[t].row_preds[rc->slice_type];
- if( h->param.i_threads > 1 )
- {
- for( y = rc[t].first_row; y<= rc[t].last_row; y++ )
- rc[t].frame_size_planned += predict_row_size( h, y, qscale2qp(rc[t].qp) );
- }
- den += rc[t].frame_size_planned;
+#define COPY(var) next->rc->var = cur->rc->var
+ /* these vars are updated in x264_ratecontrol_end()
+ * so copy them from the context that most recently ended (cur)
+ * to the context that's about to end (next)
+ */
+ COPY(cplxr_sum);
+ COPY(expected_bits_sum);
+ COPY(wanted_bits_window);
+ COPY(bframe_bits);
+#undef COPY
}
- for( t = 0; t < h->param.i_threads; t++ )
- rc[t].frame_size_planned *= frame_size_planned / den;
+ //FIXME row_preds[] (not strictly necessary, but would improve prediction)
+ /* the rest of the variables are either constant or thread-local */
}
static int init_pass2( x264_t *h )
rce->expected_bits = expected_bits;
expected_bits += bits;
update_vbv(h, bits);
+ rcc->buffer_fill = rcc->buffer_fill_final;
}
//printf("expected:%llu available:%llu factor:%lf avgQ:%lf\n", (uint64_t)expected_bits, all_available_bits, rate_factor);
int x264_ratecontrol_new ( x264_t * );
void x264_ratecontrol_delete( x264_t * );
-void x264_ratecontrol_start( x264_t *, int i_slice_type, int i_force_qp );
-void x264_ratecontrol_threads_start( x264_t * );
+void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
+void x264_ratecontrol_start( x264_t *, int i_force_qp );
int x264_ratecontrol_slice_type( x264_t *, int i_frame );
void x264_ratecontrol_mb( x264_t *, int bits );
int x264_ratecontrol_qp( x264_t * );
int mvc[4][2] = {{0}}, i_mvc;
int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
i_mvc = 0;
+#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
if( i_mb_x > 0 )
- {
- mvc[i_mvc][0] = fenc_mv[-1][0];
- mvc[i_mvc][1] = fenc_mv[-1][1];
- i_mvc++;
- }
+ MVC(fenc_mv[-1]);
if( i_mb_y > 0 )
{
- mvc[i_mvc][0] = fenc_mv[-i_mb_stride][0];
- mvc[i_mvc][1] = fenc_mv[-i_mb_stride][1];
- i_mvc++;
+ MVC(fenc_mv[-i_mb_stride]);
if( i_mb_x < h->sps->i_mb_width - 1 )
- {
- mvc[i_mvc][0] = fenc_mv[-i_mb_stride+1][0];
- mvc[i_mvc][1] = fenc_mv[-i_mb_stride+1][1];
- i_mvc++;
- }
+ MVC(fenc_mv[-i_mb_stride+1]);
if( i_mb_x > 0 )
- {
- mvc[i_mvc][0] = fenc_mv[-i_mb_stride-1][0];
- mvc[i_mvc][1] = fenc_mv[-i_mb_stride-1][1];
- i_mvc++;
- }
+ MVC(fenc_mv[-i_mb_stride-1]);
}
+#undef MVC
m[l].mvp[0] = x264_median( mvc[0][0], mvc[1][0], mvc[2][0] );
m[l].mvp[1] = x264_median( mvc[0][1], mvc[1][1], mvc[2][1] );
x264_me_search( h, &m[l], mvc, i_mvc );
- i_bcost = X264_MIN( i_bcost, m[l].cost + 3 );
+ m[l].cost -= 2; // remove mvcost from skip mbs
+ if( m[l].mv[0] || m[l].mv[1] )
+ m[l].cost += 5;
+ i_bcost = X264_MIN( i_bcost, m[l].cost );
}
if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) )
if( i_bcost < i_cost_bak )
SAVE_MVS( m[0].mv, m[1].mv );
+ //FIXME intra part could be shared across multiple encodings of the frame
lowres_intra_mb:
+ if( !b_bidir ) // forbid intra-mbs in B-frames, because it's rare and not worth checking
{
uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
- int intra_penalty = 5 + 10 * b_bidir;
- int satds[4], i_icost;
+ const int intra_penalty = 5;
+ int satds[4], i_icost, b_intra;
- memcpy( pix-FDEC_STRIDE, src-i_stride, 9 );
+ memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
for( i=0; i<8; i++ )
pix[i*FDEC_STRIDE] = src[i*i_stride];
pix++;
satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
}
}
- i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] ) + intra_penalty;
- if( i_icost < i_bcost )
+ i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
+
+ if( i_icost < i_bcost * 2 )
{
- i_bcost = i_icost;
- if( !b_bidir
- && i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
- && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1 )
+ DECLARE_ALIGNED( uint8_t, edge[33], 8 );
+ x264_predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+ for( i=3; i<9; i++ )
{
- fenc->i_intra_mbs[b-p0]++;
+ int satd;
+ h->predict_8x8[i]( pix, edge );
+ satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ i_icost = X264_MIN( i_icost, satd );
}
- if( p1 > p0+1 )
- i_bcost = i_bcost * 9 / 8; // arbitrary penalty for I-blocks in and after B-frames
+ }
+
+ i_icost += intra_penalty;
+ b_intra = i_icost < i_bcost;
+ if( b_intra )
+ i_bcost = i_icost;
+ if( i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
+ && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1 )
+ {
+ fenc->i_intra_mbs[b-p0] += b_intra;
+ fenc->i_cost_est[0][0] += i_icost;
}
}
#undef SAVE_MVS
int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
- x264_frame_t **frames, int p0, int p1, int b )
+ x264_frame_t **frames, int p0, int p1, int b,
+ int b_intra_penalty ) /* nonzero: add the intra-mb penalty to the returned score */
{
int i_score = 0;
- int dist_scale_factor = 128;
- int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
/* Check whether we already evaluated this frame
* If we have tried this frame as P, then we have also tried
* the preceding frames as B. (is this still true?) */
if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 )
- return frames[b]->i_cost_est[b-p0][p1-b];
-
- /* Init MVs so that we don't have to check edge conditions when loading predictors. */
- /* FIXME: not needed every time */
- memset( frames[b]->mv[0], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
- if( b != p1 )
- memset( frames[b]->mv[1], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
-
- if( b == p1 )
- frames[b]->i_intra_mbs[b-p0] = 0;
- if( p1 != p0 )
- dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
-
- /* the edge mbs seem to reduce the predictive quality of the
- * whole frame's score, but are needed for a spatial distribution. */
- if( h->param.rc.i_vbv_buffer_size )
{
- for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+ i_score = frames[b]->i_cost_est[b-p0][p1-b]; // reuse the cached estimate instead of returning early, so the penalty below still applies
+ }
+ else
+ {
+ int dist_scale_factor = 128;
+ int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
+
+ /* Init MVs so that we don't have to check edge conditions when loading predictors. */
+ /* FIXME: not needed every time */
+ memset( frames[b]->mv[0], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
+ if( b != p1 )
+ memset( frames[b]->mv[1], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
+
+ if( b == p1 )
{
- row_satd[ h->mb.i_mb_y ] = 0;
- for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+ frames[b]->i_intra_mbs[b-p0] = 0;
+ frames[b]->i_cost_est[0][0] = 0; // reset slot [0][0] before the per-mb pass runs
+ }
+ if( p1 != p0 )
+ dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+
+ /* the edge mbs seem to reduce the predictive quality of the
+ * whole frame's score, but are needed for a spatial distribution. */
+ if( h->param.rc.i_vbv_buffer_size )
+ {
+ for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
- row_satd[ h->mb.i_mb_y ] += i_mb_cost;
- if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
- h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
+ row_satd[ h->mb.i_mb_y ] = 0;
+ for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
{
- i_score += i_mb_cost;
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+ row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+ if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
+ h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
+ {
+ i_score += i_mb_cost;
+ }
}
}
}
+ else
+ {
+ for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
+ for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
+ i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+ }
+
+ if( b != p1 )
+ i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+
+ frames[b]->i_cost_est[b-p0][p1-b] = i_score; // cached before the intra penalty is added
+// fprintf( stderr, "frm %d %c(%d,%d): %6d %6d imb:%d \n", frames[b]->i_frame,
+// (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_cost_est[0][0], frames[b]->i_intra_mbs[b-p0] );
+ x264_cpu_restore( h->param.cpu );
}
- else
+
+ if( b_intra_penalty )
{
- for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
- for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
- i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+ /* arbitrary penalty for I-blocks after B-frames.
+ * It is applied to the returned score only; the value stored in
+ * i_cost_est above stays penalty-free.
+ * NOTE(review): nmb is 0 when the frame is only 2 macroblocks wide or
+ * tall, which would make the division below divide by zero -- confirm
+ * such frame sizes cannot reach this point. */
+ int nmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
+ i_score += i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
}
+ return i_score;
+}
- if( b != p1 )
- i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+static int scenecut( x264_t *h, x264_frame_t *frame, int pdist ) /* returns nonzero when frame is judged to be a scene cut */
+{
+ int icost = frame->i_cost_est[0][0]; // estimate kept in slot [0][0] (the intra cost, going by the Icost log label below)
+ int pcost = frame->i_cost_est[pdist][0]; // estimate for slot [pdist][0] (logged as Pcost below)
+ float f_bias;
+ int i_gop_size = frame->i_frame - h->frames.i_last_idr; // frames since the last IDR
+ float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
+ /* magic numbers pulled out of thin air.
+ * f_bias computed below ramps with i_gop_size: a constant f_thresh_min/4
+ * for GOPs shorter than keyint_min/4, rising linearly to f_thresh_min at
+ * keyint_min, then linearly to f_thresh_max at keyint_max. */
+ float f_thresh_min = f_thresh_max * h->param.i_keyint_min
+ / ( h->param.i_keyint_max * 4 );
+ int res;
+
+ if( h->param.i_keyint_min == h->param.i_keyint_max )
+ f_thresh_min= f_thresh_max;
+ if( i_gop_size < h->param.i_keyint_min / 4 )
+ f_bias = f_thresh_min / 4;
+ else if( i_gop_size <= h->param.i_keyint_min )
+ f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min;
+ else
+ {
+ f_bias = f_thresh_min
+ + ( f_thresh_max - f_thresh_min )
+ * ( i_gop_size - h->param.i_keyint_min )
+ / ( h->param.i_keyint_max - h->param.i_keyint_min );
+ }
- frames[b]->i_cost_est[b-p0][p1-b] = i_score;
-// fprintf( stderr, "frm %d %c(%d,%d): %6d I:%d \n", frames[b]->i_frame,
-// (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_intra_mbs[b-p0] );
- x264_cpu_restore( h->param.cpu );
- return i_score;
+ res = pcost >= (1.0 - f_bias) * icost; // cut when pcost is not at least a fraction f_bias below icost
+ if( res )
+ {
+ int imb = frame->i_intra_mbs[pdist];
+ int pmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2) - imb; // remaining non-edge macroblocks
+ x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
+ frame->i_frame,
+ icost, pcost, 1. - (double)pcost / icost,
+ f_bias, i_gop_size, imb, pmb );
+ }
+ return res;
}
void x264_slicetype_analyse( x264_t *h )
int j;
int i_mb_count = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
int cost1p0, cost2p0, cost1b1, cost2p1;
+ int idr_frame_type;
if( !h->frames.last_nonb )
return;
num_frames = X264_MIN( j, keyint_limit );
if( num_frames == 0 )
return;
+
+ x264_lowres_context_init( h, &a );
+ idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+
if( num_frames == 1 )
{
no_b_frames:
frames[1]->i_type = X264_TYPE_P;
+ if( h->param.b_pre_scenecut && h->param.i_scenecut_threshold >= 0 )
+ {
+ x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+ if( scenecut( h, frames[1], 1 ) )
+ frames[1]->i_type = idr_frame_type;
+ }
return;
}
- x264_lowres_context_init( h, &a );
-
- cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2 );
+ cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
goto no_b_frames;
- cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1 );
- cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1 );
- cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2 );
+ cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
+ cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+ cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
// fprintf( stderr, "PP: %d + %d <=> BP: %d + %d \n",
// cost1p0, cost2p0, cost1b1, cost2p1 );
if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
{
int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
- int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1 );
+ int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
// fprintf( stderr, "frm%d+%d: %d <=> %d, I:%d/%d \n",
// frames[0]->i_frame, j-1, pthresh, pcost/i_mb_count,
// frames[j+1]->i_intra_mbs[j+1], i_mb_count );
frames[p0] = h->fref0[0];
frames[b] = h->fenc;
- cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b );
+ cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
h->fdec->i_satd = cost;
H0( " -I, --keyint <integer> Maximum GOP size [%d]\n", defaults->i_keyint_max );
H1( " -i, --min-keyint <integer> Minimum GOP size [%d]\n", defaults->i_keyint_min );
H1( " --scenecut <integer> How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
+ H1( " --pre-scenecut Faster, less precise scenecut detection.\n"
+ " Required and implied by multi-threading.\n" );
H0( " -b, --bframes <integer> Number of B-frames between I and P [%d]\n", defaults->i_bframe );
H1( " --no-b-adapt Disable adaptive B-frame decision\n" );
H1( " --b-bias <integer> Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
" - esa: exhaustive search (slow)\n" );
else H0( " - dia, hex, umh\n" );
H0( " --merange <integer> Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
+ H1( " --mvrange <integer> Maximum motion vector length [-1 (auto)]\n" );
+ H1( " --mvrange-thread <int> Minimum buffer between threads [-1 (auto)]\n" );
H0( " -m, --subme <integer> Subpixel motion estimation and partition\n"
" decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
H0( " --b-rdo RD based mode decision for B-frames. Requires subme 6.\n" );
H0( " --quiet Quiet Mode\n" );
H0( " --no-psnr Disable PSNR computation\n" );
H0( " --no-ssim Disable SSIM computation\n" );
- H0( " --threads <integer> Parallel encoding (uses slices)\n" );
+ H0( " --threads <integer> Parallel encoding\n" );
H0( " --thread-input Run Avisynth in its own thread\n" );
+ H1( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" );
H1( " --no-asm Disable all CPU optimizations\n" );
H1( " --visualize Show MB types overlayed on the encoded video\n" );
H1( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
{ "min-keyint",required_argument,NULL,'i' },
{ "keyint", required_argument, NULL, 'I' },
{ "scenecut",required_argument, NULL, 0 },
+ { "pre-scenecut", no_argument, NULL, 0 },
{ "nf", no_argument, NULL, 0 },
{ "no-deblock", no_argument, NULL, 0 },
{ "filter", required_argument, NULL, 0 },
{ "weightb", no_argument, NULL, 'w' },
{ "me", required_argument, NULL, 0 },
{ "merange", required_argument, NULL, 0 },
+ { "mvrange", required_argument, NULL, 0 },
+ { "mvrange-thread", required_argument, NULL, 0 },
{ "subme", required_argument, NULL, 'm' },
{ "b-rdo", no_argument, NULL, 0 },
{ "mixed-refs", no_argument, NULL, 0 },
{ "qpfile", required_argument, NULL, OPT_QPFILE },
{ "threads", required_argument, NULL, 0 },
{ "thread-input", no_argument, NULL, OPT_THREAD_INPUT },
+ { "non-deterministic", no_argument, NULL, 0 },
{ "no-psnr", no_argument, NULL, 0 },
{ "no-ssim", no_argument, NULL, 0 },
{ "quiet", no_argument, NULL, OPT_QUIET },
{
/* CPU flags */
unsigned int cpu;
- int i_threads; /* divide each frame into multiple slices, encode in parallel */
+ int i_threads; /* encode multiple frames in parallel */
+ int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
/* Video Properties */
int i_width;
int i_keyint_max; /* Force an IDR keyframe at this interval */
int i_keyint_min; /* Scenecuts closer together than this are coded as I, not IDR. */
int i_scenecut_threshold; /* how aggressively to insert extra I frames */
+ int b_pre_scenecut; /* compute scenecut on lowres frames */
int i_bframe; /* how many b-frame between 2 references pictures */
int b_bframe_adaptive;
int i_bframe_bias;
int i_me_method; /* motion estimation algorithm to use (X264_ME_*) */
int i_me_range; /* integer pixel motion estimation search range (from predicted mv) */
int i_mv_range; /* maximum length of a mv (in pixels). -1 = auto, based on level */
+ int i_mv_range_thread; /* minimum space between threads. -1 = auto, based on number of threads. */
int i_subpel_refine; /* subpixel motion estimation quality */
int b_bidir_me; /* jointly optimize both MVs in B-frames */
int b_chroma_me; /* chroma ME for subpel and mode decision in P-frames */