x86inc: Be more verbose in assertion failures

[x264] / encoder / rdo.c
diff --git a/encoder/rdo.c b/encoder/rdo.c

index b1495d42073a50130c25d69c2ede4229d9cdf491..d4c6ba30bcbe8f24141f851d0601ad31fad50656 100644 (file)
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -1,7 +1,7 @@
  /*****************************************************************************
   * rdo.c: rate-distortion optimization
   *****************************************************************************
- * Copyright (C) 2005-2011 x264 project
+ * Copyright (C) 2005-2015 x264 project
   *
   * Authors: Loren Merritt <lorenm@u.washington.edu>
   *          Fiona Glaser <fiona@x264.com>
@@ -180,13 +180,13 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
      else
      {
          x264_macroblock_size_cavlc( h );
-        i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
+        i_bits = ( (uint64_t)h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
  
      h->mb.b_transform_8x8 = b_transform_bak;
      h->mb.i_type = type_bak;
  
-    return i_ssd + i_bits;
+    return X264_MIN( i_ssd + i_bits, COST_MAX );
  }
  
  /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
@@ -261,7 +261,7 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
-        i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
+        i_bits = (uint64_t)x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
  
      return (i_ssd<<8) + i_bits;
  }
@@ -297,7 +297,7 @@ static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode,
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
-        i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
+        i_bits = (uint64_t)x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
  
      return (i_ssd<<8) + i_bits;
  }
@@ -331,7 +331,7 @@ static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
-        i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
+        i_bits = (uint64_t)x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
  
      return (i_ssd<<8) + i_bits;
  }
@@ -357,7 +357,7 @@ static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
-        i_bits = x264_chroma_size_cavlc( h ) * i_lambda2;
+        i_bits = (uint64_t)x264_chroma_size_cavlc( h ) * i_lambda2;
  
      return (i_ssd<<8) + i_bits;
  }
@@ -634,13 +634,13 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                           const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
                           int b_chroma, int dc, int num_coefs, int idx )
  {
-    ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] );
-    ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] );
+    ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
+    ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
      const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
      const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
      const int b_interlaced = MB_INTERLACED;
-    uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
-    uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
+    uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
+    uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
      int levelgt1_ctx = b_chroma && dc ? 8 : 9;
  
      if( dc )
@@ -683,7 +683,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
      }
  
      int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac;
-    uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ];
+    uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ];
  
      /* shortcut for dc-only blocks.
       * this doesn't affect the output, but saves some unnecessary computation. */
@@ -691,9 +691,29 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
      {
          int cost_sig = x264_cabac_size_decision_noup2( &cabac_state_sig[0], 1 )
                       + x264_cabac_size_decision_noup2( &cabac_state_last[0], 1 );
-        return dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig );
+        dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig );
+        return !!dct[0];
      }
  
+#if HAVE_MMX && ARCH_X86_64
+#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
+                     cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
+    if( num_coefs == 16 && !dc )
+        if( b_chroma || !h->mb.i_psy_trellis )
+            return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac );
+        else
+            return h->quantf.trellis_cabac_4x4_psy( TRELLIS_ARGS, b_ac, h->mb.pic.fenc_dct4[idx&15], h->mb.i_psy_trellis );
+    else if( num_coefs == 64 && !dc )
+        if( b_chroma || !h->mb.i_psy_trellis )
+            return h->quantf.trellis_cabac_8x8( TRELLIS_ARGS, b_interlaced );
+        else
+            return h->quantf.trellis_cabac_8x8_psy( TRELLIS_ARGS, b_interlaced, h->mb.pic.fenc_dct8[idx&3], h->mb.i_psy_trellis);
+    else if( num_coefs == 8 && dc )
+        return h->quantf.trellis_cabac_chroma_422_dc( TRELLIS_ARGS );
+    else if( dc )
+        return h->quantf.trellis_cabac_dc( TRELLIS_ARGS, num_coefs-1 );
+#endif
+
      // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
      // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
      // but it takes more time to remove dead states than you gain in reduced memory.
@@ -1141,5 +1161,6 @@ int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
          h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
          nzaccum |= nz;
      }
+    STORE_8x8_NNZ( 0, idx, 0 );
      return nzaccum;
  }