/*****************************************************************************
* common.h: misc common functions
*****************************************************************************
- * Copyright (C) 2003-2011 x264 project
+ * Copyright (C) 2003-2015 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
#define FIX8(f) ((int)(f*(1<<8)+.5))
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+#define ARRAY_ELEMS(a) ((sizeof(a))/(sizeof(a[0])))
#define CHECKED_MALLOC( var, size )\
do {\
memset( var, 0, size );\
} while( 0 )
+/* Macros for merging multiple allocations into a single large malloc, for improved
+ * use with huge pages. */
+
+/* Needs to be enough to contain any set of buffers that use combined allocations */
+#define PREALLOC_BUF_SIZE 1024
+
+/* Declare the bookkeeping for one combined-allocation sequence: a running
+ * byte count and a list of pointer slots to patch after the single malloc. */
+#define PREALLOC_INIT\
+ int prealloc_idx = 0;\
+ size_t prealloc_size = 0;\
+ uint8_t **preallocs[PREALLOC_BUF_SIZE];
+
+/* Stage one allocation: store the current byte offset in var (as a fake
+ * pointer) and remember &var so PREALLOC_END can rebase it onto the real
+ * buffer.  Each sub-allocation is padded to NATIVE_ALIGN.
+ * NOTE(review): prealloc_idx is not bounds-checked against
+ * PREALLOC_BUF_SIZE — overflow would silently corrupt the stack. */
+#define PREALLOC( var, size )\
+do {\
+ var = (void*)prealloc_size;\
+ preallocs[prealloc_idx++] = (uint8_t**)&var;\
+ prealloc_size += ALIGN(size, NATIVE_ALIGN);\
+} while(0)
+
+/* Perform the single large allocation and convert every staged offset into
+ * a real pointer inside it.  On CHECKED_MALLOC failure control presumably
+ * jumps to the caller's `fail` label — TODO confirm against the full
+ * CHECKED_MALLOC definition (elided from this hunk). */
+#define PREALLOC_END( ptr )\
+do {\
+ CHECKED_MALLOC( ptr, prealloc_size );\
+ while( prealloc_idx-- )\
+ *preallocs[prealloc_idx] += (intptr_t)ptr;\
+} while(0)
+
+/* Element count of a true array (not a pointer/parameter).
+ * NOTE(review): duplicates ARRAY_ELEMS defined earlier in this header —
+ * consider keeping only one of the two. */
+#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
+
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
-#define X264_PCM_COST ((384<<CHROMA444)*BIT_DEPTH+16)
+#define X264_LOOKAHEAD_THREAD_MAX 16
+#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
#define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
#define QP_MAX_SPEC (51+QP_BD_OFFSET)
#define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
#define FILLER_OVERHEAD (NALU_OVERHEAD+1)
+#define SEI_OVERHEAD (NALU_OVERHEAD - (h->param.b_annexb && !h->param.i_avcintra_class && (h->out.i_nal-1)))
/****************************************************************************
* Includes
# define PARAM_INTERLACED 0
#endif
-#define CHROMA444 (h->sps->i_chroma_format_idc == 3)
+/* Chroma subsampling helpers.  When CHROMA_FORMAT is fixed at compile time
+ * the shifts are constants; otherwise they are read from the active SPS /
+ * per-macroblock state (requires an x264_t *h in scope). */
+#ifdef CHROMA_FORMAT
+# define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
+# define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
+#else
+# define CHROMA_FORMAT h->sps->i_chroma_format_idc
+# define CHROMA_H_SHIFT h->mb.chroma_h_shift
+# define CHROMA_V_SHIFT h->mb.chroma_v_shift
+#endif
+
+/* Size of one chroma plane given a luma plane size s (per-plane subsampling). */
+#define CHROMA_SIZE(s) ((s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT))
+/* Total frame size: luma plane plus the two chroma planes. */
+#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
+#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
/* Unions for type-punning.
* Mn: load or store n bits, aligned, native-endian
};
#include "x264.h"
+#if HAVE_OPENCL
+#include "opencl.h"
+#endif
+#include "cabac.h"
#include "bitstream.h"
#include "set.h"
#include "predict.h"
#include "mc.h"
#include "frame.h"
#include "dct.h"
-#include "cabac.h"
#include "quant.h"
#include "cpu.h"
#include "threadpool.h"
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( void );
+void x264_cavlc_init( x264_t *h );
void x264_cabac_init( x264_t *h );
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
+/* Sum the left/top neighbor mvd magnitudes and bucket each component into
+ * 0 (<=2), 1 (3..32) or 2 (>32) for CABAC context selection; the two bucket
+ * values are returned packed as (amvd0 | amvd1<<8).  The abs() calls are
+ * dropped because the operands are uint8_t and thus already non-negative. */
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
- int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
- int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
+ int amvd0 = mvdleft[0] + mvdtop[0];
+ int amvd1 = mvdleft[1] + mvdtop[1];
amvd0 = (amvd0 > 2) + (amvd0 > 32);
amvd1 = (amvd1 > 2) + (amvd1 > 32);
return amvd0 + (amvd1<<8);
}
-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
-{
- for( int i = 0; i < i_mvc; i++ )
- {
- int mx = (mvc[i][0] + 2) >> 2;
- int my = (mvc[i][1] + 2) >> 2;
- dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
- dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
- }
-}
-
extern const uint8_t x264_exp2_lut[64];
extern const float x264_log2_lut[128];
extern const float x264_log2_lz_lut[32];
} ref_pic_list_order[2][X264_REF_MAX];
/* P-frame weighting */
+ int b_weighted_pred;
x264_weight_t weight[X264_REF_MAX*2][3];
int i_mmco_remove_from_end;
uint8_t ref[4];
} x264_left_table_t;
+/* Current frame stats */
+typedef struct
+{
+ /* MV bits (MV+Ref+Block Type) */
+ int i_mv_bits;
+ /* Texture bits (DCT coefs) */
+ int i_tex_bits;
+ /* Remaining bits — presumably headers/overhead not counted as MV or
+ * texture; TODO confirm against the encoder's bit accounting */
+ int i_misc_bits;
+ /* MB type counts */
+ int i_mb_count[19];
+ int i_mb_count_i;
+ int i_mb_count_p;
+ int i_mb_count_skip;
+ int i_mb_count_8x8dct[2];
+ int i_mb_count_ref[2][X264_REF_MAX*2];
+ int i_mb_partition[17];
+ int i_mb_cbp[6];
+ int i_mb_pred_mode[4][13];
+ int i_mb_field[3];
+ /* Adaptive direct mv pred */
+ int i_direct_score[2];
+ /* Metrics (distortion per plane; SSIM accumulated over i_ssim_cnt units) */
+ int64_t i_ssd[3];
+ double f_ssim;
+ int i_ssim_cnt;
+} x264_frame_stat_t;
+
struct x264_t
{
/* encoder parameters */
x264_param_t param;
x264_t *thread[X264_THREAD_MAX+1];
+ x264_t *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
+ int i_thread_idx; /* which thread this is */
int i_threadslice_start; /* first row in this thread slice */
int i_threadslice_end; /* row after the end of this thread slice */
+ int i_threadslice_pass; /* which pass of encoding we are on */
x264_threadpool_t *threadpool;
+ x264_threadpool_t *lookaheadpool;
+ x264_pthread_mutex_t mutex;
+ x264_pthread_cond_t cv;
/* bitstream output */
struct
uint8_t *nal_buffer;
int nal_buffer_size;
+ x264_t *reconfig_h;
+ int reconfig;
+
/**** thread synchronization starts here ****/
/* frame number/poc */
int64_t i_cpb_delay_lookahead;
int64_t i_cpb_delay_pir_offset;
+ int64_t i_cpb_delay_pir_offset_next;
int b_queued_intra_refresh;
int64_t i_last_idr_pts;
int (*dequant4_mf[4])[16]; /* [4][6][16] */
int (*dequant8_mf[4])[64]; /* [4][6][64] */
/* quantization matrix for trellis, [cqm][qp][coef] */
- int (*unquant4_mf[4])[16]; /* [4][52][16] */
- int (*unquant8_mf[4])[64]; /* [4][52][64] */
+ int (*unquant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
+ int (*unquant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
/* quantization matrix for deadzone */
- udctcoef (*quant4_mf[4])[16]; /* [4][52][16] */
- udctcoef (*quant8_mf[4])[64]; /* [4][52][64] */
- udctcoef (*quant4_bias[4])[16]; /* [4][52][16] */
- udctcoef (*quant8_bias[4])[64]; /* [4][52][64] */
+ udctcoef (*quant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
+ udctcoef (*quant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
+ udctcoef (*quant4_bias[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
+ udctcoef (*quant8_bias[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
+ udctcoef (*quant4_bias0[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
+ udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
udctcoef (*nr_offset_emergency)[4][64];
/* mv/ref cost arrays. */
/* Current MB DCT coeffs */
struct
{
- ALIGNED_16( dctcoef luma16x16_dc[3][16] );
- ALIGNED_16( dctcoef chroma_dc[2][4] );
+ ALIGNED_N( dctcoef luma16x16_dc[3][16] );
+ ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
- ALIGNED_16( dctcoef luma8x8[12][64] );
- ALIGNED_16( dctcoef luma4x4[16*3][16] );
+ ALIGNED_N( dctcoef luma8x8[12][64] );
+ ALIGNED_N( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
int i_mb_height;
int i_mb_count; /* number of mbs in a frame */
+ /* Chroma subsampling */
+ int chroma_h_shift;
+ int chroma_v_shift;
+
/* Strides */
int i_mb_stride;
int i_b8_stride;
int mv_miny_spel_row[3];
int mv_maxy_spel_row[3];
/* Fullpel MV range for motion search */
- int mv_min_fpel[2];
- int mv_max_fpel[2];
+ ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
int mv_miny_fpel_row[3];
int mv_maxy_fpel_row[3];
* and won't be copied from one thread to another */
/* mb table */
+ uint8_t *base; /* base pointer for all malloced data in this mb */
int8_t *type; /* mb type */
uint8_t *partition; /* mb partition */
int8_t *qp; /* mb qp */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
ALIGNED_16( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
- ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
- ALIGNED_16( uint32_t fenc_satd_cache[32] );
+ ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
+ ALIGNED_N( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
/* extra data required for mbaff in mv prediction */
int16_t topright_mv[2][3][2];
int8_t topright_ref[2][3];
+
+ /* current mb deblock strength */
+ uint8_t (*deblock_strength)[8][4];
} cache;
/* */
/* stats */
struct
{
- /* Current frame stats */
- struct
- {
- /* MV bits (MV+Ref+Block Type) */
- int i_mv_bits;
- /* Texture bits (DCT coefs) */
- int i_tex_bits;
- /* ? */
- int i_misc_bits;
- /* MB type counts */
- int i_mb_count[19];
- int i_mb_count_i;
- int i_mb_count_p;
- int i_mb_count_skip;
- int i_mb_count_8x8dct[2];
- int i_mb_count_ref[2][X264_REF_MAX*2];
- int i_mb_partition[17];
- int i_mb_cbp[6];
- int i_mb_pred_mode[4][13];
- int i_mb_field[3];
- /* Adaptive direct mv pred */
- int i_direct_score[2];
- /* Metrics */
- int64_t i_ssd[3];
- double f_ssim;
- int i_ssim_cnt;
- } frame;
-
/* Cumulated stats */
/* per slice info */
/* num p-frames weighted */
int i_wpred[2];
+ /* Current frame stats */
+ x264_frame_stat_t frame;
} stat;
/* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
uint32_t (*nr_residual_sum)[64];
uint32_t *nr_count;
- ALIGNED_16( udctcoef nr_offset_denoise[4][64] );
- ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
+ ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
+ ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
uint32_t nr_count_buf[2][4];
+ uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
+
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+ void *scratch_buffer2; /* if the first one's already in use */
pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
/* Deblock strength values are stored for each 4x4 partition. In MBAFF
* there are four extra values that need to be stored, located in [4][i]. */
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
- x264_predict_t predict_8x8c[4+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_t predict_4x4[9+3];
+ x264_predict_t predict_chroma[4+3];
+ x264_predict_t predict_8x8c[4+3];
+ x264_predict_t predict_8x16c[4+3];
x264_predict_8x8_filter_t predict_8x8_filter;
x264_pixel_function_t pixf;
x264_deblock_function_t loopf;
x264_bitstream_function_t bsf;
-#if HAVE_VISUALIZE
- struct visualize_t *visualize;
-#endif
x264_lookahead_t *lookahead;
+
+#if HAVE_OPENCL
+ x264_opencl_t opencl;
+#endif
};
// included at the end because it needs x264_t
#include "macroblock.h"
+/* Round qpel MV candidates to fullpel and clip them to the fullpel search
+ * range given by mv_limit ([0]=min x/y, [1]=max x/y).  Candidates that round
+ * to zero or to pmv (checked on the rounded, pre-clip value) are dropped so
+ * the motion search doesn't retest them.  Writes the survivors compacted
+ * into dst and returns their count. */
+static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+ int cnt = 0;
+ for( int i = 0; i < i_mvc; i++ )
+ {
+ /* (+2)>>2: round-to-nearest conversion from qpel to fullpel units */
+ int mx = (mvc[i][0] + 2) >> 2;
+ int my = (mvc[i][1] + 2) >> 2;
+ uint32_t mv = pack16to32_mask(mx, my);
+ if( !mv || mv == pmv ) continue;
+ dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] );
+ dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] );
+ cnt++;
+ }
+ return cnt;
+}
+
+/* Clip qpel MV candidates to the search range, keeping qpel precision: the
+ * fullpel limits are converted to qpel units by <<2.  Zero MVs and
+ * duplicates of pmv are skipped (checked on the unclipped value); writes the
+ * survivors compacted into dst and returns their count.
+ * NOTE(review): the min limits in mv_limit[0][*] can be negative, and
+ * left-shifting a negative int is undefined behavior in standard C — this
+ * presumably relies on arithmetic-shift semantics of the supported
+ * compilers; confirm. */
+static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+ int cnt = 0;
+ int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2};
+ for( int i = 0; i < i_mvc; i++ )
+ {
+ uint32_t mv = M32( mvc[i] );
+ int mx = mvc[i][0];
+ int my = mvc[i][1];
+ if( !mv || mv == pmv ) continue;
+ dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
+ dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
+ cnt++;
+ }
+ return cnt;
+}
+
#if ARCH_X86 || ARCH_X86_64
#include "x86/util.h"
#endif