x86: Add asm for mbtree fixed point conversion

[x264] / common / macroblock.h
diff --git a/common/macroblock.h b/common/macroblock.h

index 3e75c4c83443fb54fe2ee9d78fe4f32cdd5428a3..9a556ac64e5f2bfba457ddff8896c6359bc26176 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -1,7 +1,7 @@
  /*****************************************************************************
- * macroblock.h: h264 encoder library
+ * macroblock.h: macroblock common functions
   *****************************************************************************
- * Copyright (C) 2005-2008 x264 project
+ * Copyright (C) 2005-2016 x264 project
   *
   * Authors: Loren Merritt <lorenm@u.washington.edu>
   *          Laurent Aimar <fenrir@via.ecp.fr>
@@ -20,6 +20,9 @@
   * You should have received a copy of the GNU General Public License
   * along with this program; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
   *****************************************************************************/
  
  #ifndef X264_MACROBLOCK_H
@@ -167,7 +170,11 @@ static const uint8_t x264_mb_partition_count_table[17] =
  };
  static const uint8_t x264_mb_partition_pixel_table[17] =
  {
-    6, 4, 5, 3, 6, 4, 5, 3, 6, 4, 5, 3, 3, 3, 1, 2, 0
+    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L0_* (entries 0-3) */
+    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L1_* (entries 4-7) */
+    PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_BI_* (entries 8-11) */
+    PIXEL_8x8,                                      /* D_DIRECT_8x8 (entry 12) */
+    PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16, /* 8x8 .. 16x16 (entries 13-16) */
  };
  
  /* zigzags are transposed with respect to the tables in the standard */
@@ -238,40 +245,80 @@ static const uint16_t block_idx_xy_fdec[16] =
      2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
  };
  
-static const uint8_t i_chroma_qp_table[52+12*2] =
+#define QP(qP) ( (qP)+QP_BD_OFFSET ) /* table-local helper: offsets a value by QP_BD_OFFSET */
+static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
  {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
-    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
-    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
-    39, 39,
-    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+         0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0, /* 12 leading entries, all literal 0 */
+#if BIT_DEPTH > 9 /* six extra low entries, compiled in only when BIT_DEPTH > 9 */
+   QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
+#endif
+#if BIT_DEPTH > 8 /* six extra low entries, compiled in only when BIT_DEPTH > 8 */
+    QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
+#endif
+     QP(0),  QP(1),  QP(2),  QP(3),  QP(4),  QP(5),
+     QP(6),  QP(7),  QP(8),  QP(9), QP(10), QP(11),
+    QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
+    QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
+    QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
+    QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
+    QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
+    QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
+    QP(39), QP(39), QP(39), QP(39),
+    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
+    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), /* 12 trailing entries, all QP(39) */
  };
+#undef QP /* QP() does not outlive the table */
  
  enum cabac_ctx_block_cat_e
  {
-    DCT_LUMA_DC   = 0,
-    DCT_LUMA_AC   = 1,
-    DCT_LUMA_4x4  = 2,
-    DCT_CHROMA_DC = 3,
-    DCT_CHROMA_AC = 4,
-    DCT_LUMA_8x8  = 5,
+    DCT_LUMA_DC     = 0,
+    DCT_LUMA_AC     = 1,
+    DCT_LUMA_4x4    = 2,
+    DCT_CHROMA_DC   = 3,
+    DCT_CHROMA_AC   = 4,
+    DCT_LUMA_8x8    = 5, /* values 0-5 are the same as before; only the alignment changed */
+    DCT_CHROMAU_DC  = 6, /* 6-9: CHROMAU categories in DC, AC, 4x4, 8x8 order */
+    DCT_CHROMAU_AC  = 7,
+    DCT_CHROMAU_4x4 = 8,
+    DCT_CHROMAU_8x8 = 9,
+    DCT_CHROMAV_DC  = 10, /* 10-13: CHROMAV categories in the same order */
+    DCT_CHROMAV_AC  = 11,
+    DCT_CHROMAV_4x4 = 12,
+    DCT_CHROMAV_8x8 = 13,
+};
+
+static const uint8_t ctx_cat_plane[6][3] = /* columns: DCT_LUMA_*, DCT_CHROMAU_*, DCT_CHROMAV_* category */
+{
+    { DCT_LUMA_DC,  DCT_CHROMAU_DC,  DCT_CHROMAV_DC}, /* each populated row starts with its own row index */
+    { DCT_LUMA_AC,  DCT_CHROMAU_AC,  DCT_CHROMAV_AC},
+    {DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4},
+    {0}, /* row 3 (the value of DCT_CHROMA_DC): zero-filled, no per-plane entries */
+    {0}, /* row 4 (the value of DCT_CHROMA_AC): zero-filled, no per-plane entries */
+    {DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8}
  };
  
+/* Per-frame allocation: done per thread only in frame-threads mode. */
+int  x264_macroblock_cache_allocate( x264_t *h );
+void x264_macroblock_cache_free( x264_t *h );
+
+/* Per-thread allocation: done per thread even in sliced-threads mode. */
+int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
+void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
  
-int  x264_macroblock_cache_init( x264_t *h );
  void x264_macroblock_slice_init( x264_t *h );
  void x264_macroblock_thread_init( x264_t *h );
-void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y );
+void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
+void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
+void x264_macroblock_deblock_strength( x264_t *h );
  void x264_macroblock_cache_save( x264_t *h );
-void x264_macroblock_cache_end( x264_t *h );
  
  void x264_macroblock_bipred_init( x264_t *h );
  
  void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
  
+void x264_copy_column8( pixel *dst, pixel *src );
+
  /* x264_mb_predict_mv_16x16:
   *      set mvp with predicted mv for D_16x16 block
   *      h->mb. need only valid values from other blocks */
@@ -301,25 +348,25 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
  void x264_mb_mc( x264_t *h );
  void x264_mb_mc_8x8( x264_t *h, int i8 );
  
-static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
+static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b ) /* unsigned operands: the <<16 below can no longer shift a negative int */
  {
-#ifdef WORDS_BIGENDIAN
+#if WORDS_BIGENDIAN /* tests the value now: WORDS_BIGENDIAN defined as 0 takes the #else branch */
     return b + (a<<16);
  #else
     return a + (b<<16);
  #endif
  }
-static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
+static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b ) /* inputs are added unmasked; the sum is returned as a full uint32_t */
  {
-#ifdef WORDS_BIGENDIAN
+#if WORDS_BIGENDIAN /* nonzero: a is shifted up by 8; otherwise b is */
     return b + (a<<8);
  #else
     return a + (b<<8);
  #endif
  }
-static ALWAYS_INLINE uint32_t pack8to32( int a, int b, int c, int d )
+static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
  {
-#ifdef WORDS_BIGENDIAN
+#if WORDS_BIGENDIAN
     return d + (c<<8) + (b<<16) + (a<<24);
  #else
     return a + (b<<8) + (c<<16) + (d<<24);
@@ -327,31 +374,29 @@ static ALWAYS_INLINE uint32_t pack8to32( int a, int b, int c, int d )
  }
  static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
  {
-#ifdef WORDS_BIGENDIAN
+#if WORDS_BIGENDIAN
     return (b&0xFFFF) + (a<<16);
  #else
     return (a&0xFFFF) + (b<<16);
  #endif
  }
-
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-#define array_non_zero_int array_non_zero_int
-static ALWAYS_INLINE int array_non_zero_int( int16_t *v, int i_count )
+static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b ) /* joins two 32-bit values into one 64-bit word */
  {
-    if(i_count == 8)
-        return !!M64( &v[0] );
-    else if(i_count == 16)
-        return !!(M64( &v[0] ) | M64( &v[4] ));
-    else if(i_count == 32)
-        return !!(M64( &v[0] ) | M64( &v[4] ) | M64( &v[8] ) | M64( &v[12] ));
-    else
-    {
-        int i;
-        for( i = 0; i < i_count; i+=4 )
-            if( M64( &v[i] ) ) return 1;
-        return 0;
-    }
+#if WORDS_BIGENDIAN
+   return b + ((uint64_t)a<<32); /* widen before shifting: a 32-bit operand shifted by 32 would be undefined */
+#else
+   return a + ((uint64_t)b<<32); /* a stays in the low 32 bits, b moves to the high 32 bits */
+#endif
  }
+
+#if HIGH_BIT_DEPTH /* choose the packers by build mode; presumably pixel is 16 bits wide here - confirm */
+#   define pack_pixel_1to2 pack16to32
+#   define pack_pixel_2to4 pack32to64
+#else /* otherwise the pair one size down */
+#   define pack_pixel_1to2 pack8to16
+#   define pack_pixel_2to4 pack16to32
+#endif
+
  static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
  {
      const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
@@ -372,28 +417,26 @@ static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx )
      int i_ret = za + zb;
  
      if( i_ret < 0x80 )
-    {
          i_ret = ( i_ret + 1 ) >> 1;
-    }
      return i_ret & 0x7f;
  }
+
+/* Indexed by mb type: 0 for intra and skip (disallowed); p8x8 is conditional - NOTE(review): presumably the lone 2 is its slot, confirm against the mb type enum. */
+static const uint8_t x264_transform_allowed[X264_MBTYPE_MAX] =
+{
+    0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0
+};
+
  /* x264_mb_transform_8x8_allowed:
   *      check whether any partition is smaller than 8x8 (or at least
   *      might be, according to just partition type.)
   *      doesn't check for cbp */
  static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h )
  {
-    // intra and skip are disallowed
-    // large partitions are allowed
-    // direct and 8x8 are conditional
-    static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
-        0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0,
-    };
-
      if( !h->pps->b_transform_8x8_mode )
          return 0;
      if( h->mb.i_type != P_8x8 )
-        return partition_tab[h->mb.i_type];
+        return x264_transform_allowed[h->mb.i_type]; /* same lookup as before, now against the file-scope table */
      return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
  }