]> git.sesse.net Git - x264/blobdiff - encoder/encoder.c
denoise_dct asm
[x264] / encoder / encoder.c
index 7637dc2bd280b0e78a04c6ec686182db966aebd5..43f9f9f763257c63c5632c4a47c2cdd0509875c5 100644 (file)
 #endif
 
 //#define DEBUG_MB_TYPE
-//#define DEBUG_DUMP_FRAME
-//#define DEBUG_BENCHMARK
-
-#ifdef DEBUG_BENCHMARK
-static int64_t i_mtime_encode_frame = 0;
-static int64_t i_mtime_analyse = 0;
-static int64_t i_mtime_encode = 0;
-static int64_t i_mtime_write = 0;
-static int64_t i_mtime_filter = 0;
-#define TIMER_START( d ) \
-    { \
-        int64_t d##start = x264_mdate();
-
-#define TIMER_STOP( d ) \
-        d += x264_mdate() - d##start;\
-    }
-#else
-#define TIMER_START( d )
-#define TIMER_STOP( d )
-#endif
 
 #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
 
@@ -77,27 +57,19 @@ static float x264_psnr( int64_t i_sqe, int64_t i_size )
     return (float)(-10.0 * log( f_mse ) / log( 10.0 ));
 }
 
-#ifdef DEBUG_DUMP_FRAME
-static void x264_frame_dump( x264_t *h, x264_frame_t *fr, char *name )
+static void x264_frame_dump( x264_t *h )
 {
-    FILE *f = fopen( name, "r+b" );
+    FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
     int i, y;
     if( !f )
         return;
-
     /* Write the frame in display order */
-    fseek( f, fr->i_frame * h->param.i_height * h->param.i_width * 3 / 2, SEEK_SET );
-
-    for( i = 0; i < fr->i_plane; i++ )
-    {
-        for( y = 0; y < h->param.i_height / ( i == 0 ? 1 : 2 ); y++ )
-        {
-            fwrite( &fr->plane[i][y*fr->i_stride[i]], 1, h->param.i_width / ( i == 0 ? 1 : 2 ), f );
-        }
-    }
+    fseek( f, h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
+    for( i = 0; i < h->fdec->i_plane; i++ )
+        for( y = 0; y < h->param.i_height >> !!i; y++ )
+            fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f );
     fclose( f );
 }
-#endif
 
 
 /* Fill "default" values */
@@ -318,6 +290,34 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
     }
 }
 
+/* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */
+/* reallocate, adding an arbitrary amount of space (100 kilobytes). */
+static void x264_bitstream_check_buffer( x264_t *h )
+{
+    if( ( h->param.b_cabac && (h->cabac.p_end - h->cabac.p < 2500) )
+     || ( h->out.bs.p_end - h->out.bs.p < 2500 ) )
+    {
+        uint8_t *bs_bak = h->out.p_bitstream;
+        intptr_t delta;
+        int i;
+
+        h->out.i_bitstream += 100000;
+        h->out.p_bitstream = x264_realloc( h->out.p_bitstream, h->out.i_bitstream );
+        delta = h->out.p_bitstream - bs_bak;
+
+        h->out.bs.p_start += delta;
+        h->out.bs.p += delta;
+        h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream;
+
+        h->cabac.p_start += delta;
+        h->cabac.p += delta;
+        h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream;
+
+        for( i = 0; i <= h->out.i_nal; i++ )
+            h->out.nal[i].p_payload += delta;
+    }
+}
+
 /****************************************************************************
  *
  ****************************************************************************
@@ -328,6 +328,14 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
 static int x264_validate_parameters( x264_t *h )
 {
+#ifdef HAVE_MMX
+    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    {
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
+        return -1;
+    }
+#endif
     if( h->param.i_width <= 0 || h->param.i_height <= 0 )
     {
         x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
@@ -401,7 +409,6 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.b_fast_pskip = 0;
         h->param.analyse.i_noise_reduction = 0;
         h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
-        h->param.rc.i_aq_mode = 0;
     }
     if( h->param.rc.i_rc_method == X264_RC_CQP )
     {
@@ -410,6 +417,7 @@ static int x264_validate_parameters( x264_t *h )
         float qp_b = qp_p + 6*log(h->param.rc.f_pb_factor)/log(2);
         h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
         h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
+        h->param.rc.i_aq_mode = 0;
     }
 
     if( ( h->param.i_width % 16 || h->param.i_height % 16 )
@@ -570,6 +578,7 @@ static void mbcmp_init( x264_t *h )
 x264_t *x264_encoder_open   ( x264_param_t *param )
 {
     x264_t *h = x264_malloc( sizeof( x264_t ) );
+    char buf[1000], *p;
     int i;
 
     memset( h, 0, sizeof( x264_t ) );
@@ -658,6 +667,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
           || h->param.rc.i_rc_method == X264_RC_CRF
           || h->param.b_bframe_adaptive
           || h->param.b_pre_scenecut );
+    h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
 
     h->frames.i_last_idr = - h->param.i_keyint_max;
     h->frames.i_input    = 0;
@@ -684,19 +694,22 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
 
     mbcmp_init( h );
 
-    x264_log( h, X264_LOG_INFO, "using cpu capabilities: %s%s%s%s%s%s%s%s%s%s\n",
-             param->cpu&X264_CPU_MMX ? "MMX " : "",
-             param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
-             param->cpu&X264_CPU_SSE ? "SSE " : "",
-             param->cpu&X264_CPU_SSE2 ? "SSE2 " : "",
-             param->cpu&X264_CPU_SSE3 ? "SSE3 " : "",
-             param->cpu&X264_CPU_SSSE3 ? "SSSE3 " : "",
-             param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
-             param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "",
-             param->cpu&X264_CPU_CACHELINE_SPLIT ?
-                 param->cpu&X264_CPU_CACHELINE_32 ? "Cache32 " :
-                 param->cpu&X264_CPU_CACHELINE_64 ? "Cache64 " : "Cache? " : "",
-             param->cpu ? "" : "none!" );
+    p = buf + sprintf( buf, "using cpu capabilities:" );
+    for( i=0; x264_cpu_names[i].flags; i++ )
+    {
+        if( !strcmp(x264_cpu_names[i].name, "SSE2")
+            && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+            continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE3")
+            && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+            continue;
+        if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
+            && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
+            p += sprintf( p, " %s", x264_cpu_names[i].name );
+    }
+    if( !param->cpu )
+        p += sprintf( p, " none!" );
+    x264_log( h, X264_LOG_INFO, "%s\n", buf );
 
     h->out.i_nal = 0;
     h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
@@ -721,10 +734,10 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     if( x264_ratecontrol_new( h ) < 0 )
         return NULL;
 
-#ifdef DEBUG_DUMP_FRAME
+    if( h->param.psz_dump_yuv )
     {
         /* create or truncate the reconstructed video file */
-        FILE *f = fopen( "fdec.yuv", "w" );
+        FILE *f = fopen( h->param.psz_dump_yuv, "w" );
         if( f )
             fclose( f );
         else
@@ -734,7 +747,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
             return NULL;
         }
     }
-#endif
 
     return h;
 }
@@ -909,9 +921,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
     int b_end = mb_y == h->sps->i_mb_height;
     int min_y = mb_y - (1 << h->sh.b_mbaff);
-#ifndef DEBUG_DUMP_FRAME
-    b_deblock &= b_hpel;
-#endif
+    int max_y = b_end ? h->sps->i_mb_height : mb_y;
+    b_deblock &= b_hpel || h->param.psz_dump_yuv;
     if( mb_y & h->sh.b_mbaff )
         return;
     if( min_y < 0 )
@@ -931,7 +942,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
 
     if( b_deblock )
     {
-        int max_y = b_end ? h->sps->i_mb_height : mb_y;
         int y;
         for( y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
             x264_frame_deblock_row( h, y );
@@ -948,6 +958,33 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     {
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
     }
+
+    min_y = X264_MAX( min_y*16-8, 0 );
+    max_y = b_end ? h->param.i_height : mb_y*16-8;
+
+    if( h->param.analyse.b_psnr )
+    {
+        int i;
+        for( i=0; i<3; i++ )
+            h->stat.frame.i_ssd[i] +=
+                x264_pixel_ssd_wxh( &h->pixf,
+                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
+                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
+                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+    }
+
+    if( h->param.analyse.b_ssim )
+    {
+        x264_emms();
+        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
+         * and overlap by 4 */
+        min_y += min_y == 0 ? 2 : -6;
+        h->stat.frame.f_ssim +=
+            x264_pixel_ssim_wxh( &h->pixf,
+                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
+                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
+                h->param.i_width-2, max_y-min_y );
+    }
 }
 
 static inline void x264_reference_update( x264_t *h )
@@ -1035,7 +1072,7 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
 static void x264_slice_write( x264_t *h )
 {
     int i_skip;
-    int mb_xy;
+    int mb_xy, i_mb_x, i_mb_y;
     int i;
 
     /* init stats */
@@ -1058,10 +1095,12 @@ static void x264_slice_write( x264_t *h )
     h->mb.i_last_qp = h->sh.i_qp;
     h->mb.i_last_dqp = 0;
 
-    for( mb_xy = h->sh.i_first_mb, i_skip = 0; mb_xy < h->sh.i_last_mb; )
+    i_mb_y = h->sh.i_first_mb / h->sps->i_mb_width;
+    i_mb_x = h->sh.i_first_mb % h->sps->i_mb_width;
+    i_skip = 0;
+
+    while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) < h->sh.i_last_mb )
     {
-        const int i_mb_y = mb_xy / h->sps->i_mb_width;
-        const int i_mb_x = mb_xy % h->sps->i_mb_width;
         int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
 
         if( i_mb_x == 0 )
@@ -1074,16 +1113,13 @@ static void x264_slice_write( x264_t *h )
          * Slice I: choose I_4x4 or I_16x16 mode
          * Slice P: choose between using P mode or intra (4x4 or 16x16)
          * */
-        TIMER_START( i_mtime_analyse );
         x264_macroblock_analyse( h );
-        TIMER_STOP( i_mtime_analyse );
 
         /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
-        TIMER_START( i_mtime_encode );
         x264_macroblock_encode( h );
-        TIMER_STOP( i_mtime_encode );
 
-        TIMER_START( i_mtime_write );
+        x264_bitstream_check_buffer( h );
+
         if( h->param.b_cabac )
         {
             if( mb_xy > h->sh.i_first_mb && !(h->sh.b_mbaff && (i_mb_y&1)) )
@@ -1112,7 +1148,6 @@ static void x264_slice_write( x264_t *h )
                 x264_macroblock_write_cavlc( h, &h->out.bs );
             }
         }
-        TIMER_STOP( i_mtime_write );
 
 #if VISUALIZE
         if( h->param.b_visualize )
@@ -1151,15 +1186,16 @@ static void x264_slice_write( x264_t *h )
 
         if( h->sh.b_mbaff )
         {
-            if( (i_mb_y&1) && i_mb_x == h->sps->i_mb_width - 1 )
-                mb_xy++;
-            else if( i_mb_y&1 )
-                mb_xy += 1 - h->sps->i_mb_width;
-            else
-                mb_xy += h->sps->i_mb_width;
+            i_mb_x += i_mb_y & 1;
+            i_mb_y ^= i_mb_x < h->sps->i_mb_width;
         }
         else
-            mb_xy++;
+            i_mb_x++;
+        if(i_mb_x == h->sps->i_mb_width)
+        {
+            i_mb_y++;
+            i_mb_x = 0;
+        }
     }
 
     if( h->param.b_cabac )
@@ -1177,6 +1213,8 @@ static void x264_slice_write( x264_t *h )
 
     x264_nal_end( h );
 
+    x264_fdec_filter_row( h, h->sps->i_mb_height );
+
     /* Compute misc bits */
     h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
                               + NALU_OVERHEAD * 8
@@ -1223,7 +1261,6 @@ static int x264_slices_write( x264_t *h )
 
     x264_stack_align( x264_slice_write, h );
     i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
-    x264_fdec_filter_row( h, h->sps->i_mb_height );
 
 #if VISUALIZE
     if( h->param.b_visualize )
@@ -1289,7 +1326,6 @@ int     x264_encoder_encode( x264_t *h,
     *pp_nal = NULL;
 
     /* ------------------- Setup new frame from picture -------------------- */
-    TIMER_START( i_mtime_encode_frame );
     if( pic_in != NULL )
     {
         /* 1: Copy the picture to a frame and move it to a buffer */
@@ -1345,7 +1381,6 @@ int     x264_encoder_encode( x264_t *h,
         while( bframes-- )
             x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
     }
-    TIMER_STOP( i_mtime_encode_frame );
 
     /* ------------------- Get frame to be encoded ------------------------- */
     /* 4: get picture to encode */
@@ -1367,7 +1402,6 @@ do_encode:
 
     /* ------------------- Setup frame context ----------------------------- */
     /* 5: Init data dependent of frame type */
-    TIMER_START( i_mtime_encode_frame );
     if( h->fenc->i_type == X264_TYPE_IDR )
     {
         /* reset ref pictures */
@@ -1489,7 +1523,7 @@ do_encode:
         x264_slices_write( h );
 
     /* restore CPU state (before using float again) */
-    x264_cpu_restore( h->param.cpu );
+    x264_emms();
 
     if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read 
         && h->param.i_scenecut_threshold >= 0
@@ -1636,15 +1670,13 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     /* ---------------------- Update encoder state ------------------------- */
 
     /* update rc */
-    x264_cpu_restore( h->param.cpu );
+    x264_emms();
     x264_ratecontrol_end( h, h->out.i_frame_size * 8 );
 
     /* restore CPU state (before using float again) */
-    x264_cpu_restore( h->param.cpu );
-
-    x264_noise_reduction_update( h );
+    x264_emms();
 
-    TIMER_STOP( i_mtime_encode_frame );
+    x264_noise_reduction_update( thread_current );
 
     /* ---------------------- Compute/Print statistics --------------------- */
     x264_thread_sync_stat( h, h->thread[0] );
@@ -1684,16 +1716,11 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     psz_message[0] = '\0';
     if( h->param.analyse.b_psnr )
     {
-        int64_t sqe[3];
-
-        for( i=0; i<3; i++ )
-        {
-            sqe[i] = x264_pixel_ssd_wxh( &h->pixf,
-                         h->fdec->plane[i], h->fdec->i_stride[i],
-                         h->fenc->plane[i], h->fenc->i_stride[i],
-                         h->param.i_width >> !!i, h->param.i_height >> !!i );
-        }
-        x264_cpu_restore( h->param.cpu );
+        int64_t sqe[3] = {
+            h->stat.frame.i_ssd[0],
+            h->stat.frame.i_ssd[1],
+            h->stat.frame.i_ssd[2],
+        };
 
         h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
         h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
@@ -1709,11 +1736,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 
     if( h->param.analyse.b_ssim )
     {
-        // offset by 2 pixels to avoid alignment of ssim blocks with dct blocks
-        float ssim_y = x264_pixel_ssim_wxh( &h->pixf,
-                         h->fdec->plane[0] + 2+2*h->fdec->i_stride[0], h->fdec->i_stride[0],
-                         h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                         h->param.i_width-2, h->param.i_height-2 );
+        double ssim_y = h->stat.frame.f_ssim
+                      / (((h->param.i_width-6)>>2) * ((h->param.i_height-6)>>2));
         h->stat.f_ssim_mean_y[h->sh.i_type] += ssim_y;
         snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
                   " SSIM Y:%.5f", ssim_y );
@@ -1756,10 +1780,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 }
 #endif
 
-#ifdef DEBUG_DUMP_FRAME
-    /* Dump reconstructed frame */
-    x264_frame_dump( h, h->fdec, "fdec.yuv" );
-#endif
+    if( h->param.psz_dump_yuv )
+        x264_frame_dump( h );
 }
 
 /****************************************************************************
@@ -1767,28 +1789,16 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
  ****************************************************************************/
 void    x264_encoder_close  ( x264_t *h )
 {
-#ifdef DEBUG_BENCHMARK
-    int64_t i_mtime_total = i_mtime_analyse + i_mtime_encode + i_mtime_write + i_mtime_filter + 1;
-#endif
     int64_t i_yuv_size = 3 * h->param.i_width * h->param.i_height / 2;
     int i;
 
     for( i=0; i<h->param.i_threads; i++ )
     {
-        // don't strictly have to wait for the other threads, but it's simpler than cancelling them
+        // don't strictly have to wait for the other threads, but it's simpler than canceling them
         if( h->thread[i]->b_thread_active )
             x264_pthread_join( h->thread[i]->thread_handle, NULL );
     }
 
-#ifdef DEBUG_BENCHMARK
-    x264_log( h, X264_LOG_INFO,
-              "analyse=%d(%lldms) encode=%d(%lldms) write=%d(%lldms) filter=%d(%lldms)\n",
-              (int)(100*i_mtime_analyse/i_mtime_total), i_mtime_analyse/1000,
-              (int)(100*i_mtime_encode/i_mtime_total), i_mtime_encode/1000,
-              (int)(100*i_mtime_write/i_mtime_total), i_mtime_write/1000,
-              (int)(100*i_mtime_filter/i_mtime_total), i_mtime_filter/1000 );
-#endif
-
     /* Slices used and PSNR */
     for( i=0; i<5; i++ )
     {