git.sesse.net Git - x264/blobdiff - encoder/encoder.c
denoise_dct asm
[x264] / encoder / encoder.c
index 35ebad138b5f1cb23985b0891165994f2dfd4c9b..43f9f9f763257c63c5632c4a47c2cdd0509875c5 100644 (file)
@@ -36,7 +36,6 @@
 #endif
 
 //#define DEBUG_MB_TYPE
-//#define DEBUG_DUMP_FRAME
 
 #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
 
@@ -58,27 +57,19 @@ static float x264_psnr( int64_t i_sqe, int64_t i_size )
     return (float)(-10.0 * log( f_mse ) / log( 10.0 ));
 }
 
-#ifdef DEBUG_DUMP_FRAME
-static void x264_frame_dump( x264_t *h, x264_frame_t *fr, char *name )
+static void x264_frame_dump( x264_t *h )
 {
-    FILE *f = fopen( name, "r+b" );
+    FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
     int i, y;
     if( !f )
         return;
-
     /* Write the frame in display order */
-    fseek( f, fr->i_frame * h->param.i_height * h->param.i_width * 3 / 2, SEEK_SET );
-
-    for( i = 0; i < fr->i_plane; i++ )
-    {
-        for( y = 0; y < h->param.i_height / ( i == 0 ? 1 : 2 ); y++ )
-        {
-            fwrite( &fr->plane[i][y*fr->i_stride[i]], 1, h->param.i_width / ( i == 0 ? 1 : 2 ), f );
-        }
-    }
+    fseek( f, h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
+    for( i = 0; i < h->fdec->i_plane; i++ )
+        for( y = 0; y < h->param.i_height >> !!i; y++ )
+            fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f );
     fclose( f );
 }
-#endif
 
 
 /* Fill "default" values */
@@ -299,6 +290,34 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
     }
 }
 
+/* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */
+/* reallocate, adding an arbitrary amount of space (100 kilobytes). */
+static void x264_bitstream_check_buffer( x264_t *h )
+{
+    if( ( h->param.b_cabac && (h->cabac.p_end - h->cabac.p < 2500) )
+     || ( h->out.bs.p_end - h->out.bs.p < 2500 ) )
+    {
+        uint8_t *bs_bak = h->out.p_bitstream;
+        intptr_t delta;
+        int i;
+
+        h->out.i_bitstream += 100000;
+        h->out.p_bitstream = x264_realloc( h->out.p_bitstream, h->out.i_bitstream );
+        delta = h->out.p_bitstream - bs_bak;
+
+        h->out.bs.p_start += delta;
+        h->out.bs.p += delta;
+        h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream;
+
+        h->cabac.p_start += delta;
+        h->cabac.p += delta;
+        h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream;
+
+        for( i = 0; i <= h->out.i_nal; i++ )
+            h->out.nal[i].p_payload += delta;
+    }
+}
+
 /****************************************************************************
  *
  ****************************************************************************
@@ -309,6 +328,14 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
 static int x264_validate_parameters( x264_t *h )
 {
+#ifdef HAVE_MMX
+    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    {
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
+        return -1;
+    }
+#endif
     if( h->param.i_width <= 0 || h->param.i_height <= 0 )
     {
         x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
@@ -382,7 +409,6 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.b_fast_pskip = 0;
         h->param.analyse.i_noise_reduction = 0;
         h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
-        h->param.rc.i_aq_mode = 0;
     }
     if( h->param.rc.i_rc_method == X264_RC_CQP )
     {
@@ -391,6 +417,7 @@ static int x264_validate_parameters( x264_t *h )
         float qp_b = qp_p + 6*log(h->param.rc.f_pb_factor)/log(2);
         h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
         h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
+        h->param.rc.i_aq_mode = 0;
     }
 
     if( ( h->param.i_width % 16 || h->param.i_height % 16 )
@@ -640,6 +667,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
           || h->param.rc.i_rc_method == X264_RC_CRF
           || h->param.b_bframe_adaptive
           || h->param.b_pre_scenecut );
+    h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
 
     h->frames.i_last_idr = - h->param.i_keyint_max;
     h->frames.i_input    = 0;
@@ -668,9 +696,17 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
 
     p = buf + sprintf( buf, "using cpu capabilities:" );
     for( i=0; x264_cpu_names[i].flags; i++ )
+    {
+        if( !strcmp(x264_cpu_names[i].name, "SSE2")
+            && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+            continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE3")
+            && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+            continue;
         if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
+    }
     if( !param->cpu )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );
@@ -698,10 +734,10 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     if( x264_ratecontrol_new( h ) < 0 )
         return NULL;
 
-#ifdef DEBUG_DUMP_FRAME
+    if( h->param.psz_dump_yuv )
     {
         /* create or truncate the reconstructed video file */
-        FILE *f = fopen( "fdec.yuv", "w" );
+        FILE *f = fopen( h->param.psz_dump_yuv, "w" );
         if( f )
             fclose( f );
         else
@@ -711,7 +747,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
             return NULL;
         }
     }
-#endif
 
     return h;
 }
@@ -886,9 +921,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
     int b_end = mb_y == h->sps->i_mb_height;
     int min_y = mb_y - (1 << h->sh.b_mbaff);
-#ifndef DEBUG_DUMP_FRAME
-    b_deblock &= b_hpel;
-#endif
+    int max_y = b_end ? h->sps->i_mb_height : mb_y;
+    b_deblock &= b_hpel || h->param.psz_dump_yuv;
     if( mb_y & h->sh.b_mbaff )
         return;
     if( min_y < 0 )
@@ -908,7 +942,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
 
     if( b_deblock )
     {
-        int max_y = b_end ? h->sps->i_mb_height : mb_y;
         int y;
         for( y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
             x264_frame_deblock_row( h, y );
@@ -925,6 +958,33 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     {
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
     }
+
+    min_y = X264_MAX( min_y*16-8, 0 );
+    max_y = b_end ? h->param.i_height : mb_y*16-8;
+
+    if( h->param.analyse.b_psnr )
+    {
+        int i;
+        for( i=0; i<3; i++ )
+            h->stat.frame.i_ssd[i] +=
+                x264_pixel_ssd_wxh( &h->pixf,
+                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
+                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
+                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+    }
+
+    if( h->param.analyse.b_ssim )
+    {
+        x264_emms();
+        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
+         * and overlap by 4 */
+        min_y += min_y == 0 ? 2 : -6;
+        h->stat.frame.f_ssim +=
+            x264_pixel_ssim_wxh( &h->pixf,
+                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
+                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
+                h->param.i_width-2, max_y-min_y );
+    }
 }
 
 static inline void x264_reference_update( x264_t *h )
@@ -1058,6 +1118,8 @@ static void x264_slice_write( x264_t *h )
         /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
         x264_macroblock_encode( h );
 
+        x264_bitstream_check_buffer( h );
+
         if( h->param.b_cabac )
         {
             if( mb_xy > h->sh.i_first_mb && !(h->sh.b_mbaff && (i_mb_y&1)) )
@@ -1151,6 +1213,8 @@ static void x264_slice_write( x264_t *h )
 
     x264_nal_end( h );
 
+    x264_fdec_filter_row( h, h->sps->i_mb_height );
+
     /* Compute misc bits */
     h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
                               + NALU_OVERHEAD * 8
@@ -1197,7 +1261,6 @@ static int x264_slices_write( x264_t *h )
 
     x264_stack_align( x264_slice_write, h );
     i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
-    x264_fdec_filter_row( h, h->sps->i_mb_height );
 
 #if VISUALIZE
     if( h->param.b_visualize )
@@ -1613,7 +1676,7 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     /* restore CPU state (before using float again) */
     x264_emms();
 
-    x264_noise_reduction_update( h );
+    x264_noise_reduction_update( thread_current );
 
     /* ---------------------- Compute/Print statistics --------------------- */
     x264_thread_sync_stat( h, h->thread[0] );
@@ -1653,16 +1716,11 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     psz_message[0] = '\0';
     if( h->param.analyse.b_psnr )
     {
-        int64_t sqe[3];
-
-        for( i=0; i<3; i++ )
-        {
-            sqe[i] = x264_pixel_ssd_wxh( &h->pixf,
-                         h->fdec->plane[i], h->fdec->i_stride[i],
-                         h->fenc->plane[i], h->fenc->i_stride[i],
-                         h->param.i_width >> !!i, h->param.i_height >> !!i );
-        }
-        x264_emms();
+        int64_t sqe[3] = {
+            h->stat.frame.i_ssd[0],
+            h->stat.frame.i_ssd[1],
+            h->stat.frame.i_ssd[2],
+        };
 
         h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
         h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
@@ -1678,11 +1736,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 
     if( h->param.analyse.b_ssim )
     {
-        // offset by 2 pixels to avoid alignment of ssim blocks with dct blocks
-        float ssim_y = x264_pixel_ssim_wxh( &h->pixf,
-                         h->fdec->plane[0] + 2+2*h->fdec->i_stride[0], h->fdec->i_stride[0],
-                         h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                         h->param.i_width-2, h->param.i_height-2 );
+        double ssim_y = h->stat.frame.f_ssim
+                      / (((h->param.i_width-6)>>2) * ((h->param.i_height-6)>>2));
         h->stat.f_ssim_mean_y[h->sh.i_type] += ssim_y;
         snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
                   " SSIM Y:%.5f", ssim_y );
@@ -1725,10 +1780,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 }
 #endif
 
-#ifdef DEBUG_DUMP_FRAME
-    /* Dump reconstructed frame */
-    x264_frame_dump( h, h->fdec, "fdec.yuv" );
-#endif
+    if( h->param.psz_dump_yuv )
+        x264_frame_dump( h );
 }
 
 /****************************************************************************
@@ -1741,7 +1794,7 @@ void    x264_encoder_close  ( x264_t *h )
 
     for( i=0; i<h->param.i_threads; i++ )
     {
-        // don't strictly have to wait for the other threads, but it's simpler than cancelling them
+        // don't strictly have to wait for the other threads, but it's simpler than canceling them
         if( h->thread[i]->b_thread_active )
             x264_pthread_join( h->thread[i]->thread_handle, NULL );
     }