4:2:2 encoding support

author Henrik Gramner <hengar-6@student.ltu.se>

Fri, 26 Aug 2011 13:57:04 +0000 (15:57 +0200)

committer Fiona Glaser <fiona@x264.com>

Wed, 21 Sep 2011 16:54:44 +0000 (09:54 -0700)
author Henrik Gramner <hengar-6@student.ltu.se>
Fri, 26 Aug 2011 13:57:04 +0000 (15:57 +0200)
committer Fiona Glaser <fiona@x264.com>
Wed, 21 Sep 2011 16:54:44 +0000 (09:54 -0700)
diff --git a/AUTHORS b/AUTHORS

index 8acaba477fa5a99775ffccc88e59495dfc057a35..60ffb621f96b2294e9b715cf17f340dbb2d0b82f 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -42,6 +42,11 @@ E: gpoirier CHEZ mplayerhq POINT hu
  D: Altivec optimizations
  S: Brittany, France
  
+N: Henrik Gramner
+E: hengar-6 AT student DOT ltu DOT se
+D: 4:2:2 chroma subsampling, x86 asm
+S: Sweden
+
  N: Fiona Glaser
  E: fiona AT x264 DOT com
  D: x86 asm, 1pass VBV, adaptive quantization, inline asm
diff --git a/common/bitstream.h b/common/bitstream.h

index 6300e52aaf95c8fc0b76550adb97633604d4e05e..058db8b4380655c3e8a9d2a955df98c0b953e3b0 100644 (file)
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -60,10 +60,11 @@ typedef struct
      uint8_t run[16];
  } x264_run_level_t;
  
-extern const vlc_t x264_coeff0_token[5];
-extern const vlc_t x264_coeff_token[5][16][4];
+extern const vlc_t x264_coeff0_token[6];
+extern const vlc_t x264_coeff_token[6][16][4];
  extern const vlc_t x264_total_zeros[15][16];
-extern const vlc_t x264_total_zeros_dc[3][4];
+extern const vlc_t x264_total_zeros_2x2_dc[3][4];
+extern const vlc_t x264_total_zeros_2x4_dc[7][8];
  extern const vlc_t x264_run_before[7][16];
  
  typedef struct
diff --git a/common/common.c b/common/common.c

index ce076e59947aed2e0a266eaa9ff11c14afcb99ae..4c978d3c681ba20b3e34d13e8fa3dbc93985594f 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -426,21 +426,57 @@ void x264_param_apply_fastfirstpass( x264_param_t *param )
      }
  }
  
+static int profile_string_to_int( const char *str ) /* Map a profile name (matched case-insensitively) to its PROFILE_* constant; str is passed straight to strcasecmp, so it must be non-NULL. */
+{
+    if( !strcasecmp( str, "baseline" ) )
+        return PROFILE_BASELINE;
+    if( !strcasecmp( str, "main" ) )
+        return PROFILE_MAIN;
+    if( !strcasecmp( str, "high" ) )
+        return PROFILE_HIGH;
+    if( !strcasecmp( str, "high10" ) )
+        return PROFILE_HIGH10;
+    if( !strcasecmp( str, "high422" ) )
+        return PROFILE_HIGH422;
+    if( !strcasecmp( str, "high444" ) ) /* NOTE(review): the caller compares the result with '<', so the PROFILE_* values are presumably ordered by capability - confirm against their definitions */
+        return PROFILE_HIGH444_PREDICTIVE;
+    return -1; /* unrecognized name; the caller treats a negative result as "invalid profile" */
+}
+
  int x264_param_apply_profile( x264_param_t *param, const char *profile )
  {
      if( !profile )
          return 0;
  
-#if BIT_DEPTH > 8
-    if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) ||
-        !strcasecmp( profile, "high" ) )
+    int p = profile_string_to_int( profile );
+    if( p < 0 )
      {
-        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH );
+        x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile );
+        return -1;
+    }
+    if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) ||
+        (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0)) )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile );
+        return -1;
+    }
+    if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile );
+        return -1;
+    }
+    if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile );
+        return -1;
+    }
+    if( p < PROFILE_HIGH10 && BIT_DEPTH > 8 )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, BIT_DEPTH );
          return -1;
      }
-#endif
  
-    if( !strcasecmp( profile, "baseline" ) )
+    if( p == PROFILE_BASELINE )
      {
          param->analyse.b_transform_8x8 = 0;
          param->b_cabac = 0;
@@ -459,27 +495,12 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
              return -1;
          }
      }
-    else if( !strcasecmp( profile, "main" ) )
+    else if( p == PROFILE_MAIN )
      {
          param->analyse.b_transform_8x8 = 0;
          param->i_cqm_preset = X264_CQM_FLAT;
          param->psz_cqm_file = NULL;
      }
-    else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
-    {
-        /* Default */
-    }
-    else
-    {
-        x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile );
-        return -1;
-    }
-    if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) ||
-        (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0) )
-    {
-        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile );
-        return -1;
-    }
      return 0;
  }
  
@@ -1075,6 +1096,9 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
          [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
          [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
          [X264_CSP_NV12] = { 2, { 256*1, 256*1 },        { 256*1, 256/2 },       },
+        [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
+        [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
+        [X264_CSP_NV16] = { 2, { 256*1, 256*1 },        { 256*1, 256*1 },       },
          [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
          [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
          [X264_CSP_BGR]  = { 1, { 256*3 },               { 256*1 },              },
diff --git a/common/common.h b/common/common.h

index a4e1cf96a413849bd0cc91c5de0e1ce50d727574..d1f830f65e3d6d622994364fefd73785155cf303 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -40,6 +40,9 @@
  #define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
  #define FIX8(f) ((int)(f*(1<<8)+.5))
  #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+#define CHROMA_FORMAT h->sps->i_chroma_format_idc
+#define CHROMA_SIZE(s) ((s)>>(h->mb.chroma_h_shift+h->mb.chroma_v_shift))
+#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
  
  #define CHECKED_MALLOC( var, size )\
  do {\
@@ -56,7 +59,7 @@ do {\
  #define X264_BFRAME_MAX 16
  #define X264_REF_MAX 16
  #define X264_THREAD_MAX 128
-#define X264_PCM_COST ((384<<CHROMA444)*BIT_DEPTH+16)
+#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
  #define X264_LOOKAHEAD_MAX 250
  #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
  #define QP_MAX_SPEC (51+QP_BD_OFFSET)
@@ -102,7 +105,7 @@ do {\
  #   define PARAM_INTERLACED 0
  #endif
  
-#define CHROMA444 (h->sps->i_chroma_format_idc == 3)
+#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
  
  /* Unions for type-punning.
   * Mn: load or store n bits, aligned, native-endian
@@ -565,7 +568,7 @@ struct x264_t
      struct
      {
          ALIGNED_16( dctcoef luma16x16_dc[3][16] );
-        ALIGNED_16( dctcoef chroma_dc[2][4] );
+        ALIGNED_16( dctcoef chroma_dc[2][8] );
          // FIXME share memory?
          ALIGNED_16( dctcoef luma8x8[12][64] );
          ALIGNED_16( dctcoef luma4x4[16*3][16] );
@@ -578,6 +581,10 @@ struct x264_t
          int     i_mb_height;
          int     i_mb_count;                 /* number of mbs in a frame */
  
+        /* Chroma subsampling */
+        int     chroma_h_shift;
+        int     chroma_v_shift;
+
          /* Strides */
          int     i_mb_stride;
          int     i_b8_stride;
@@ -882,6 +889,8 @@ struct x264_t
      ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
      uint32_t nr_count_buf[2][4];
  
+    uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
+
      /* Buffers that are allocated per-thread even in sliced threads. */
      void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
      pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
@@ -891,9 +900,11 @@ struct x264_t
  
      /* CPU functions dependents */
      x264_predict_t      predict_16x16[4+3];
-    x264_predict_t      predict_8x8c[4+3];
      x264_predict8x8_t   predict_8x8[9+3];
      x264_predict_t      predict_4x4[9+3];
+    x264_predict_t      predict_chroma[4+3];
+    x264_predict_t      predict_8x8c[4+3];
+    x264_predict_t      predict_8x16c[4+3];
      x264_predict_8x8_filter_t predict_8x8_filter;
  
      x264_pixel_function_t pixf;
diff --git a/common/dct.c b/common/dct.c

index 9653ee47245fd4637ff90424935d832e9e3500a7..cf8a23517bb8a41a27f7f09fab36081783074a19 100644 (file)
--- a/common/dct.c
+++ b/common/dct.c
@@ -5,6 +5,7 @@
   *
   * Authors: Loren Merritt <lorenm@u.washington.edu>
   *          Laurent Aimar <fenrir@via.ecp.fr>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -100,6 +101,42 @@ static void idct4x4dc( dctcoef d[16] )
      }
  }
  
+static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) /* 2x4 DC transform: reads coefficient [0] of each of the 8 blocks in dct4x4, writes 8 results to dct, then zeroes the coefficients it read. */
+{
+    int a0 = dct4x4[0][0] + dct4x4[1][0]; /* stage 1: a0-a3 are sums, a4-a7 differences, of adjacent block pairs */
+    int a1 = dct4x4[2][0] + dct4x4[3][0];
+    int a2 = dct4x4[4][0] + dct4x4[5][0];
+    int a3 = dct4x4[6][0] + dct4x4[7][0];
+    int a4 = dct4x4[0][0] - dct4x4[1][0];
+    int a5 = dct4x4[2][0] - dct4x4[3][0];
+    int a6 = dct4x4[4][0] - dct4x4[5][0];
+    int a7 = dct4x4[6][0] - dct4x4[7][0];
+    int b0 = a0 + a1; /* stage 2: b0-b3 are sums, b4-b7 differences, of adjacent stage-1 pairs */
+    int b1 = a2 + a3;
+    int b2 = a4 + a5;
+    int b3 = a6 + a7;
+    int b4 = a0 - a1;
+    int b5 = a2 - a3;
+    int b6 = a4 - a5;
+    int b7 = a6 - a7;
+    dct[0] = b0 + b1; /* stage 3: final butterflies */
+    dct[1] = b2 + b3;
+    dct[2] = b0 - b1;
+    dct[3] = b2 - b3;
+    dct[4] = b4 - b5; /* note: dct[4]/dct[5] take the differences and dct[6]/dct[7] the sums, the reverse of dct[0..3] */
+    dct[5] = b6 - b7;
+    dct[6] = b4 + b5;
+    dct[7] = b6 + b7;
+    dct4x4[0][0] = 0; /* clear the consumed [0] coefficients in the caller's blocks */
+    dct4x4[1][0] = 0;
+    dct4x4[2][0] = 0;
+    dct4x4[3][0] = 0;
+    dct4x4[4][0] = 0;
+    dct4x4[5][0] = 0;
+    dct4x4[6][0] = 0;
+    dct4x4[7][0] = 0;
+}
+
  static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                    pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
  {
@@ -164,14 +201,10 @@ static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
  
  static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
  {
-    dctcoef d[16];
      int sum = 0;
-
-    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
-
-    sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
-    sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
-
+    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE ) /* one 4-pixel row per iteration; pix1 advances by FENC_STRIDE, pix2 by FDEC_STRIDE */
+        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3] /* accumulates the sum of (pix1 - pix2) over the 4x4 block directly, without a temporary difference array */
+             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
      return sum;
  }
  
@@ -188,11 +221,49 @@ static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
      int d2 = dct[0] - dct[1];
      int d3 = dct[2] - dct[3];
      dct[0] = d0 + d1;
-    dct[2] = d2 + d3;
      dct[1] = d0 - d1;
+    dct[2] = d2 + d3;
      dct[3] = d2 - d3;
  }
  
+static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 ) /* Computes one sub4x4_dct_dc value for each of the eight 4x4 blocks of an 8x16 area and combines them with the 2x4 DC transform into dct[0..7]. */
+{
+    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] ); /* a0-a7: blocks taken row by row - columns 0 and 4 at rows 0, 4, 8 and 12; pix1 is indexed with FENC_STRIDE, pix2 with FDEC_STRIDE */
+    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
+    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
+    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
+    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
+    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
+    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
+    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
+
+    /* 2x4 DC transform */
+    int b0 = a0 + a1; /* stage 1: b0-b3 are sums, b4-b7 differences, of adjacent block pairs */
+    int b1 = a2 + a3;
+    int b2 = a4 + a5;
+    int b3 = a6 + a7;
+    int b4 = a0 - a1;
+    int b5 = a2 - a3;
+    int b6 = a4 - a5;
+    int b7 = a6 - a7;
+    a0 = b0 + b1; /* stage 2: a0-a7 are reused as temporaries for the second butterfly stage */
+    a1 = b2 + b3;
+    a2 = b4 + b5;
+    a3 = b6 + b7;
+    a4 = b0 - b1;
+    a5 = b2 - b3;
+    a6 = b4 - b5;
+    a7 = b6 - b7;
+    dct[0] = a0 + a1; /* stage 3: same arithmetic and output order as the three stages of dct2x4dc */
+    dct[1] = a2 + a3;
+    dct[2] = a0 - a1;
+    dct[3] = a2 - a3;
+    dct[4] = a4 - a5;
+    dct[5] = a6 - a7;
+    dct[6] = a4 + a5;
+    dct[7] = a6 + a7;
+}
+
  static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
  {
      dctcoef d[16];
@@ -408,6 +479,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
      dctf->add8x8_idct   = add8x8_idct;
      dctf->add8x8_idct_dc = add8x8_idct_dc;
  
+    dctf->sub8x16_dct_dc = sub8x16_dct_dc;
+
      dctf->sub16x16_dct  = sub16x16_dct;
      dctf->add16x16_idct = add16x16_idct;
      dctf->add16x16_idct_dc = add16x16_idct_dc;
@@ -421,6 +494,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
      dctf->dct4x4dc  = dct4x4dc;
      dctf->idct4x4dc = idct4x4dc;
  
+    dctf->dct2x4dc = dct2x4dc;
+
  #if HIGH_BIT_DEPTH
  #if HAVE_MMX
      if( cpu&X264_CPU_MMX )
diff --git a/common/dct.h b/common/dct.h

index a764e491b2eead9f0f746f08980c3523222697b2..044ad1e149e9e680d4156118a671f971f6138b9b 100644 (file)
--- a/common/dct.h
+++ b/common/dct.h
@@ -104,6 +104,8 @@ typedef struct
      void (*add8x8_idct)  ( pixel *p_dst, dctcoef dct[4][16] );
      void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] );
  
+    void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
+
      void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
      void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] );
      void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] );
@@ -117,6 +119,8 @@ typedef struct
      void (*dct4x4dc) ( dctcoef d[16] );
      void (*idct4x4dc)( dctcoef d[16] );
  
+    void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
+
  } x264_dct_function_t;
  
  typedef struct
diff --git a/common/deblock.c b/common/deblock.c

index 22d37635f5a4ca2488e1caf63975a7526ecb86a7..a1108b209809ed2e57677f7c2c0cfe1cc7ea585d 100644 (file)
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -6,6 +6,7 @@
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   *          Loren Merritt <lorenm@u.washington.edu>
   *          Fiona Glaser <fiona@x264.com>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -119,7 +120,7 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp
              deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
      }
  }
-static void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
  {
      for( int d = 0; d < 8; d++, pix += stride )
          deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
@@ -147,33 +148,42 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al
          pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
      }
  }
-static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
+static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) /* Filters 4 groups of `height` lines each; group i uses tc0[i]. */
  {
      for( int i = 0; i < 4; i++ )
      {
          int tc = tc0[i];
          if( tc <= 0 )
          {
-            pix += 2*ystride;
+            pix += height*ystride; /* tc0[i] <= 0: step over this group's lines without filtering */
              continue;
          }
-        for( int d = 0; d < 2; d++, pix += ystride-2 )
-        for( int e = 0; e < 2; e++, pix++ )
-            deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
+        for( int d = 0; d < height; d++, pix += ystride-2 ) /* the inner loop advances pix by 2, so ystride-2 lands on the start of the next line */
+            for( int e = 0; e < 2; e++, pix++ ) /* two adjacent positions per line - presumably the interleaved U and V samples; confirm against the chroma plane layout */
+                deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
      }
  }
-static void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) /* Filters 4 lines spaced `stride` apart with xstride 2, one tc0 entry per line. */
  {
      for( int i = 0; i < 4; i++, pix += stride )
          deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] );
  }
+static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) /* 4:2:2 counterpart of deblock_h_chroma_mbaff_c: filters 8 lines instead of 4. */
+{
+    for( int i = 0; i < 8; i++, pix += stride )
+        deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] ); /* i>>1: each of the four tc0 entries covers two consecutive lines */
+}
  static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
  {
-    deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 );
+    deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); /* height=2, xstride=stride, ystride=2 */
  }
  static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
  {
-    deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 );
+    deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); /* height=2, xstride=2, ystride=stride: 4 groups x 2 lines = 8 lines */
+}
+static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) /* 4:2:2 counterpart of deblock_h_chroma_c: same strides, twice as many lines. */
+{
+    deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); /* height=4, xstride=2, ystride=stride: 4 groups x 4 lines = 16 lines */
  }
  
  static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta )
@@ -220,7 +230,7 @@ static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, i
      for( int d = 0; d < 16; d++, pix += ystride )
          deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
  }
-static void deblock_v_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
+static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
  {
      for( int d = 0; d < 8; d++, pix += ystride )
          deblock_edge_luma_intra_c( pix, 1, alpha, beta );
@@ -247,24 +257,33 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride,
          pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
      }
  }
-static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir )
+static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta ) /* Applies the intra chroma edge filter at `width` consecutive positions on each of `height` lines; explicit dimensions replace the former `dir` flag. */
  {
-    for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 )
-    for( int e = 0; e < (dir?1:2); e++, pix++ )
-        deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
+    for( int d = 0; d < height; d++, pix += ystride-2 ) /* NOTE: the line step subtracts a fixed 2, not `width`; the net advance per line is ystride when width is 2, and 1 when width is 1 and ystride is 2 */
+        for( int e = 0; e < width; e++, pix++ )
+            deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
  }
-static void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) /* Applies the intra chroma edge filter (xstride 2) to 4 lines spaced `stride` apart. */
  {
      for( int i = 0; i < 4; i++, pix += stride )
          deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
  }
+static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) /* 4:2:2 counterpart of deblock_h_chroma_intra_mbaff_c: identical filtering, 8 lines instead of 4. */
+{
+    for( int i = 0; i < 8; i++, pix += stride )
+        deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
+}
  static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
  {
-    deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 );
+    deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta ); /* width=1, height=16, xstride=stride, ystride=2: pix advances by 1 per iteration, covering 16 consecutive positions */
  }
  static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
  {
-    deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 );
+    deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta ); /* width=2, height=8, xstride=2, ystride=stride: two positions on each of 8 lines */
+}
+static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta ) /* 4:2:2 counterpart of deblock_h_chroma_intra_c: same strides, 16 lines instead of 8. */
+{
+    deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta ); /* width=2, height=16, xstride=2, ystride=stride */
  }
  
  static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
@@ -375,6 +394,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
      int stridey   = h->fdec->i_stride[0];
      int strideuv  = h->fdec->i_stride[1];
      int chroma444 = CHROMA444;
+    int chroma_height = 16 >> h->mb.chroma_v_shift;
      intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
  
      for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
@@ -388,12 +408,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
  
          pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
-        pixel *pixuv = h->fdec->plane[1] + (8<<chroma444)*mb_y*strideuv + 16*mb_x;
+        pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
  
          if( mb_y & MB_INTERLACED )
          {
              pixy -= 15*stridey;
-            pixuv -= ((8<<chroma444)-1)*strideuv;
+            pixuv -= (chroma_height-1)*strideuv;
          }
  
          int stride2y  = stridey << MB_INTERLACED;
@@ -405,22 +425,33 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          #define FILTER( intra, dir, edge, qp, chroma_qp )\
          do\
          {\
-            deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
-                                 stride2y, bs[dir][edge], qp, a, b, 0,\
-                                 h->loopf.deblock_luma##intra[dir] );\
-            if( chroma444 )\
+            if( !(edge & 1) || !transform_8x8 )\
              {\
-                deblock_edge##intra( h, pixuv          + 4*edge*(dir?stride2uv:1),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
-                                     h->loopf.deblock_luma##intra[dir] );\
-                deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
+                                     stride2y, bs[dir][edge], qp, a, b, 0,\
                                       h->loopf.deblock_luma##intra[dir] );\
+                if( CHROMA_FORMAT == CHROMA_444 )\
+                {\
+                    deblock_edge##intra( h, pixuv          + 4*edge*(dir?stride2uv:1),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                                         h->loopf.deblock_luma##intra[dir] );\
+                    deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                                         h->loopf.deblock_luma##intra[dir] );\
+                }\
+                else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\
+                {\
+                    deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
+                                         h->loopf.deblock_chroma##intra[dir] );\
+                }\
              }\
-            else if( !(edge & 1) )\
-                deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
+            if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\
+            {\
+                deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
                                       stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                       h->loopf.deblock_chroma##intra[dir] );\
+            }\
          } while(0)
  
          if( h->mb.i_neighbour & MB_LEFT )
@@ -431,9 +462,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                  int chroma_qp[2];
                  int left_qp[2];
                  x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
-                x264_deblock_inter_t chroma_deblock = chroma444 ? h->loopf.deblock_luma_mbaff : h->loopf.deblock_chroma_mbaff;
+                x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
                  x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
-                x264_deblock_intra_t chroma_intra_deblock = chroma444 ? h->loopf.deblock_luma_intra_mbaff : h->loopf.deblock_chroma_intra_mbaff;
+                x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
                  int c = chroma444 ? 0 : 1;
  
                  left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
@@ -453,8 +484,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                  }
  
                  int offy = MB_INTERLACED ? 4 : 0;
-                int offuv = MB_INTERLACED ? 3 : 0;
-                if( chroma444 ) offuv = offy;
+                int offuv = MB_INTERLACED ? 4-h->mb.chroma_v_shift : 0;
                  left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
                  luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
                  chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
@@ -486,9 +516,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          }
          if( !first_edge_only )
          {
-            if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc );
-                                 FILTER( , 0, 2, qp, qpc );
-            if( !transform_8x8 ) FILTER( , 0, 3, qp, qpc );
+            FILTER( , 0, 1, qp, qpc );
+            FILTER( , 0, 2, qp, qpc );
+            FILTER( , 0, 3, qp, qpc );
          }
  
          if( h->mb.i_neighbour & MB_TOP )
@@ -540,9 +570,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  
          if( !first_edge_only )
          {
-            if( !transform_8x8 ) FILTER( , 1, 1, qp, qpc );
-                                 FILTER( , 1, 2, qp, qpc );
-            if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
+            FILTER( , 1, 1, qp, qpc );
+            FILTER( , 1, 2, qp, qpc );
+            FILTER( , 1, 3, qp, qpc );
          }
  
          #undef FILTER
@@ -553,7 +583,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
   * TODO:
   *  deblock macroblock edges
   *  support analysis partitions smaller than 16x16
- *  deblock chroma for 4:2:0
+ *  deblock chroma for 4:2:0/4:2:2
   *  handle duplicate refs correctly
   *  handle cavlc+8x8dct correctly
   */
@@ -683,15 +713,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
      pf->deblock_luma[1] = deblock_v_luma_c;
      pf->deblock_luma[0] = deblock_h_luma_c;
      pf->deblock_chroma[1] = deblock_v_chroma_c;
-    pf->deblock_chroma[0] = deblock_h_chroma_c;
+    pf->deblock_h_chroma_420 = deblock_h_chroma_c;
+    pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
      pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
      pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
      pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
-    pf->deblock_chroma_intra[0] = deblock_h_chroma_intra_c;
-    pf->deblock_luma_mbaff = deblock_v_luma_mbaff_c;
-    pf->deblock_chroma_mbaff = deblock_v_chroma_mbaff_c;
-    pf->deblock_luma_intra_mbaff = deblock_v_luma_intra_mbaff_c;
-    pf->deblock_chroma_intra_mbaff = deblock_v_chroma_intra_mbaff_c;
+    pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
+    pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
+    pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
+    pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
+    pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c;
+    pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
+    pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
+    pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c;
      pf->deblock_strength = deblock_strength_c;
  
  #if HAVE_MMX
@@ -701,11 +735,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
          pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
          pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
          pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
-        pf->deblock_chroma[0] = x264_deblock_h_chroma_mmx2;
+        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
          pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
          pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
          pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
-        pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmx2;
+        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
  #endif
          pf->deblock_strength = x264_deblock_strength_mmx2;
          if( cpu&X264_CPU_SSE2 )
@@ -716,11 +750,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
                  pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
                  pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
                  pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
-                pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2;
+                pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
                  pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
                  pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
                  pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
-                pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2;
+                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
              }
          }
          if( cpu&X264_CPU_SSSE3 )
@@ -733,11 +767,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
                  pf->deblock_luma[1] = x264_deblock_v_luma_avx;
                  pf->deblock_luma[0] = x264_deblock_h_luma_avx;
                  pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
-                pf->deblock_chroma[0] = x264_deblock_h_chroma_avx;
+                pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
                  pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
                  pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
                  pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
-                pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx;
+                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
              }
          }
      }
@@ -758,7 +792,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
          pf->deblock_luma[1] = x264_deblock_v_luma_neon;
          pf->deblock_luma[0] = x264_deblock_h_luma_neon;
  //      pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
-//      pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
+//      pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
     }
  #endif
  #endif // !HIGH_BIT_DEPTH
diff --git a/common/frame.c b/common/frame.c

index b95c2a868252b0313710272f9c0f5c6b9d0501f3..594aeccc4579b7b27b183802a9a857dae05b0529 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -50,6 +50,10 @@ static int x264_frame_internal_csp( int external_csp )
          case X264_CSP_I420:
          case X264_CSP_YV12:
              return X264_CSP_NV12;
+        case X264_CSP_NV16:
+        case X264_CSP_I422:
+        case X264_CSP_YV16:
+            return X264_CSP_NV16;
          case X264_CSP_I444:
          case X264_CSP_YV24:
          case X264_CSP_BGR:
@@ -66,11 +70,10 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
      x264_frame_t *frame;
      int i_csp = x264_frame_internal_csp( h->param.i_csp );
      int i_mb_count = h->mb.i_mb_count;
-    int i_stride, i_width, i_lines;
+    int i_stride, i_width, i_lines, luma_plane_count;
      int i_padv = PADV << PARAM_INTERLACED;
      int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
      int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
-    int luma_plane_count = i_csp == X264_CSP_NV12 ? 1 : 3;
  
      CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
  
@@ -79,18 +82,20 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
      i_lines  = h->mb.i_mb_height*16;
      i_stride = align_stride( i_width + 2*PADH, align, disalign );
  
-    if( i_csp == X264_CSP_NV12 )
+    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
      {
+        luma_plane_count = 1;
          frame->i_plane = 2;
          for( int i = 0; i < 2; i++ )
          {
              frame->i_width[i] = i_width >> i;
-            frame->i_lines[i] = i_lines >> i;
+            frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
              frame->i_stride[i] = i_stride;
          }
      }
      else if( i_csp == X264_CSP_I444 )
      {
+        luma_plane_count = 3;
          frame->i_plane = 3;
          for( int i = 0; i < 3; i++ )
          {
@@ -130,15 +135,16 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  
      frame->orig = frame;
  
-    if( i_csp == X264_CSP_NV12 )
+    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
      {
-        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
+        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
+        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
          CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
          if( PARAM_INTERLACED )
          {
              CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH;
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
          }
      }
  
@@ -367,23 +373,25 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
      }
      else
      {
+        int v_shift = h->mb.chroma_v_shift;
          get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
          h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
                            stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
-        if( i_csp == X264_CSP_NV12 )
+        if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
          {
-            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 );
+            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
              h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
-                              stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>1 );
+                              stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift );
          }
-        else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_YV12 )
+        else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
          {
-            get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
-            get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
+            int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
+            get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
+            get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
              h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
                                           (pixel*)pix[1], stride[1]/sizeof(pixel),
                                           (pixel*)pix[2], stride[2]/sizeof(pixel),
-                                         h->param.i_width>>1, h->param.i_height>>1 );
+                                         h->param.i_width>>1, h->param.i_height>>v_shift );
          }
          else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
          {
@@ -478,33 +486,34 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
          return;
      for( int i = 0; i < frame->i_plane; i++ )
      {
-        int shift = i && !CHROMA444;
+        int h_shift = i && h->mb.chroma_h_shift;
+        int v_shift = i && h->mb.chroma_v_shift;
          int stride = frame->i_stride[i];
          int width = 16*h->mb.i_mb_width;
-        int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> shift;
+        int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
          int padh = PADH;
-        int padv = PADV >> shift;
+        int padv = PADV >> v_shift;
          // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
          if( b_end && !b_start )
-            height += 4 >> (shift + SLICE_MBAFF);
+            height += 4 >> (v_shift + SLICE_MBAFF);
          pixel *pix;
          if( SLICE_MBAFF )
          {
              // border samples for each field are extended separately
-            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, shift );
-            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, shift );
+            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
+            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
  
-            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> shift;
+            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
              if( b_end && !b_start )
-                height += 4 >> shift;
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+                height += 4 >> v_shift;
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
          }
          else
          {
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
          }
      }
  }
@@ -545,9 +554,9 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
  
  void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
  {
-    int shift = !CHROMA444;
-    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>shift,
-                         PADH, PADV>>shift, 1, 1, shift );
+    int v_shift = h->mb.chroma_v_shift;
+    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
+                         PADH, PADV>>v_shift, 1, 1, h->mb.chroma_h_shift );
  }
  
  void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -555,17 +564,18 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
      for( int i = 0; i < frame->i_plane; i++ )
      {
          int i_width = h->param.i_width;
-        int shift = i && !CHROMA444;
-        int i_height = h->param.i_height >> shift;
+        int h_shift = i && h->mb.chroma_h_shift;
+        int v_shift = i && h->mb.chroma_v_shift;
+        int i_height = h->param.i_height >> v_shift;
          int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
-        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
  
          if( i_padx )
          {
              for( int y = 0; y < i_height; y++ )
                  pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
-                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-shift],
-                              i_padx>>shift, sizeof(pixel)<<shift );
+                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
+                              i_padx>>h_shift, sizeof(pixel)<<h_shift );
          }
          if( i_pady )
          {
@@ -581,10 +591,10 @@ void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
  {
      for( int i = 0; i < h->fenc->i_plane; i++ )
      {
-        int shift = i && !CHROMA444;
+        int v_shift = i && h->mb.chroma_v_shift;
          int stride = h->fenc->i_stride[i];
-        int height = h->param.i_height >> shift;
-        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int height = h->param.i_height >> v_shift;
+        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
          pixel *fenc = h->fenc->plane[i] + 16*mb_x;
          for( int y = height; y < height + pady; y++ )
              memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
diff --git a/common/frame.h b/common/frame.h

index 77af60d151d2ded4ff88e7a298581604367cf247..a13e05b4a27b25242d9965c94609cd5b34d41b44 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -181,12 +181,20 @@ typedef struct
  {
      x264_deblock_inter_t deblock_luma[2];
      x264_deblock_inter_t deblock_chroma[2];
+    x264_deblock_inter_t deblock_h_chroma_420;
+    x264_deblock_inter_t deblock_h_chroma_422;
      x264_deblock_intra_t deblock_luma_intra[2];
      x264_deblock_intra_t deblock_chroma_intra[2];
+    x264_deblock_intra_t deblock_h_chroma_420_intra;
+    x264_deblock_intra_t deblock_h_chroma_422_intra;
      x264_deblock_inter_t deblock_luma_mbaff;
      x264_deblock_inter_t deblock_chroma_mbaff;
+    x264_deblock_inter_t deblock_chroma_420_mbaff;
+    x264_deblock_inter_t deblock_chroma_422_mbaff;
      x264_deblock_intra_t deblock_luma_intra_mbaff;
      x264_deblock_intra_t deblock_chroma_intra_mbaff;
+    x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
+    x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
      void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                                 int bframe );
diff --git a/common/macroblock.c b/common/macroblock.c

index 7c524ff011fd8f57e621dd5b479b26a515fa3c75..f985e772f0dcd77f6aaea7f98e043b5d6652a257 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -6,6 +6,7 @@
   * Authors: Fiona Glaser <fiona@x264.com>
   *          Laurent Aimar <fenrir@via.ecp.fr>
   *          Loren Merritt <lorenm@u.washington.edu>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -50,23 +51,27 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
      }
      else
      {
-        // chroma is offset if MCing from a field of opposite parity
-        if( MB_INTERLACED & i_ref )
+        int v_shift = h->mb.chroma_v_shift;
+        // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
+        if( v_shift & MB_INTERLACED & i_ref )
              mvy += (h->mb.i_mb_y & 1)*4 - 2;
  
-        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
-                         &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+        height = 4*height >> v_shift;
+
+        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
+                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                           h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
-                         mvx, mvy, 2*width, 2*height );
+                         mvx, 2*mvy>>v_shift, 2*width, height );
  
          if( h->sh.weight[i_ref][1].weightfn )
-            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->sh.weight[i_ref][1], height*2 );
+            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
+                                                       &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
+                                                       &h->sh.weight[i_ref][1], height );
          if( h->sh.weight[i_ref][2].weightfn )
-            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->sh.weight[i_ref][2],height*2 );
+            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
+                                                       &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
+                                                       &h->sh.weight[i_ref][2], height );
      }
  }
  static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
@@ -85,13 +90,15 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
      }
      else
      {
-        if( MB_INTERLACED & i_ref )
+        int v_shift = h->mb.chroma_v_shift;
+        if( v_shift & MB_INTERLACED & i_ref )
              mvy += (h->mb.i_mb_y & 1)*4 - 2;
  
-        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
-                         &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
+                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                           h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
-                         mvx, mvy, 2*width, 2*height );
+                         mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift );
      }
  }
  
@@ -128,17 +135,21 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
      }
      else
      {
-        if( MB_INTERLACED & i_ref0 )
+        int v_shift = h->mb.chroma_v_shift;
+        if( v_shift & MB_INTERLACED & i_ref0 )
              mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
-        if( MB_INTERLACED & i_ref1 )
+        if( v_shift & MB_INTERLACED & i_ref1 )
              mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
  
          h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
-                         mvx0, mvy0, 2*width, 2*height );
+                         mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift );
          h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
-                         mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
+                         mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift );
+
+        int chromapix = h->luma2chroma_pixel[i_mode];
+        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0,   16, tmp1,   16, weight );
+        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
      }
  }
  
@@ -301,7 +312,9 @@ int x264_macroblock_cache_allocate( x264_t *h )
          }
          else
          {
-            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv);
+            /* Both ref and fenc are stored for 4:2:0 and 4:2:2, which means that 4:2:0 and 4:4:4
+             * need the same amount of space and 4:2:2 needs twice as much. */
+            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv);
  
              if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
                  //smart can weight one ref and one offset -1 in 8-bit
@@ -491,6 +504,24 @@ void x264_macroblock_thread_init( x264_t *h )
                            (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
      h->mb.i_mb_prev_xy = -1;
  
+    /*          4:2:0                      4:2:2                      4:4:4
+     * fdec            fenc       fdec            fenc       fdec            fenc
+     * y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y
+     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+     * y Y Y Y Y       U U V V    y Y Y Y Y       U U V V    y Y Y Y Y       U U U U
+     * u u u   v v v   U U V V    u u u   v v v   U U V V    u u u u u u u   U U U U
+     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
+     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
+     *                            u U U   v V V              u U U U U       V V V V
+     *                            u U U   v V V              u U U U U       V V V V
+     *                                                       v v v v v v v   V V V V
+     *                                                       v V V V V       V V V V
+     *                                                       v V V V V
+     *                                                       v V V V V
+     *                                                       v V V V V
+     */
      h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
      h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
      h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
@@ -500,16 +531,6 @@ void x264_macroblock_thread_init( x264_t *h )
          h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
          h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
      }
-    /* fdec:      fenc:
-     * yyyyyyy
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * uuu vvv    UUVV
-     * uUU vVV    UUVV
-     * uUU vVV
-     */
      else
      {
          h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
@@ -522,7 +543,7 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
      int stride_y  = fenc->i_stride[0];
      int stride_uv = fenc->i_stride[1];
      int off_y  = 16 * i_mb_x + 16 * i_mb_y * stride_y;
-    int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv;
+    int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> h->mb.chroma_v_shift);
      h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
                           fenc->plane[1]+off_uv, stride_uv, i_mb_x );
  }
@@ -537,12 +558,12 @@ NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
  static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
  {
      int mb_interlaced = b_mbaff && MB_INTERLACED;
-    int w = b_chroma ? 8 : 16;
+    int height = b_chroma ? 16 >> h->mb.chroma_v_shift : 16;
      int i_stride = h->fdec->i_stride[i];
      int i_stride2 = i_stride << mb_interlaced;
      int i_pix_offset = mb_interlaced
-                     ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
-                     : 16 * mb_x + w * mb_y * i_stride;
+                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+                     : 16 * mb_x + height * mb_y * i_stride;
      pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
      int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
      pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
@@ -554,7 +575,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
      h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
      if( b_chroma )
      {
-        h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
+        h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
          memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
          memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
          if( b_mbaff )
@@ -572,7 +593,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
      }
      if( b_mbaff )
      {
-        for( int j = 0; j < w; j++ )
+        for( int j = 0; j < height; j++ )
              if( b_chroma )
              {
                  h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
@@ -854,8 +875,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
  
          /* load non_zero_count */
          CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16+4 + 8*CHROMA444] );
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32+4 + 8*CHROMA444] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>h->mb.chroma_v_shift)] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>h->mb.chroma_v_shift)] );
  
          /* Finish the prefetching */
          for( int l = 0; l < lists; l++ )
@@ -906,16 +927,17 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]];
          h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]];
  
-        if( CHROMA444 )
+        if( CHROMA_FORMAT >= CHROMA_422 )
          {
-            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16];
-            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32];
-            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32];
+            int offset = (4>>h->mb.chroma_h_shift) - 4;
+            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset];
          }
          else
          {
@@ -943,7 +965,7 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] =
          h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] =
          h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80;
-        if( CHROMA444 )
+        if( CHROMA_FORMAT >= CHROMA_422 )
          {
              h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] =
              h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] =
@@ -983,6 +1005,11 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          {
              x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
              x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
+            if( CHROMA_FORMAT == CHROMA_422 )
+            {
+                x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE );
+                x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE );
+            }
              x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 );
          }
      }
@@ -1424,15 +1451,17 @@ void x264_macroblock_deblock_strength( x264_t *h )
      }
  
      /* Early termination: in this case, nnz guarantees all edges use strength 2.*/
-    if( h->mb.b_transform_8x8 && (h->mb.i_cbp_luma&7) == 7 && !CHROMA444 )
+    if( h->mb.b_transform_8x8 && !CHROMA444 )
      {
-        M32( bs[0][0] ) = 0x02020202;
-        M32( bs[0][2] ) = 0x02020202;
-        M32( bs[0][4] ) = 0x02020202;
-        M32( bs[1][0] ) = 0x02020202;
-        M32( bs[1][2] ) = 0x02020202;
-        M32( bs[1][4] ) = 0x02020202;
-        return;
+        int cbp_mask = 0xf >> h->mb.chroma_v_shift;
+        if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
+        {
+            M32( bs[0][0] ) = 0x02020202;
+            M32( bs[0][2] ) = 0x02020202;
+            M32( bs[0][4] ) = 0x02020202;
+            memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] have to be set for 4:2:2 */
+            return;
+        }
      }
  
      int neighbour_changed = 0;
@@ -1595,14 +1624,14 @@ void x264_macroblock_deblock_strength( x264_t *h )
  
  static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
  {
-    int w = b_chroma ? 8 : 16;
+    int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16;
      int i_stride = h->fdec->i_stride[i];
      int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
      int i_pix_offset = (b_mbaff && MB_INTERLACED)
-                     ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
-                     : 16 * mb_x + w * mb_y * i_stride;
+                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+                     : 16 * mb_x + height * mb_y * i_stride;
      if( b_chroma )
-        h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
+        h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height );
      else
          h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 );
  }
@@ -1622,8 +1651,9 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
      }
      else
      {
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7,   8*sizeof(pixel) );
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7,   8*sizeof(pixel) );
+        int backup_src = (15>>h->mb.chroma_v_shift) * FDEC_STRIDE;
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
      }
      if( b_mbaff )
      {
@@ -1639,7 +1669,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
              }
              else
              {
-                backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
+                if( CHROMA_FORMAT == CHROMA_420 )
+                    backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
                  memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src,  8*sizeof(pixel) );
                  memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src,  8*sizeof(pixel) );
              }
@@ -1650,8 +1681,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
          /* In progressive we update intra_border_backup in-place, so the topleft neighbor will
           * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
          h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
-        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7 + 8*CHROMA444];
-        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7 + 8*CHROMA444];
+        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)];
+        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)];
      }
  }
  
@@ -1744,7 +1775,7 @@ void x264_macroblock_cache_save( x264_t *h )
      CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
      CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
      CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
-    if( CHROMA444 )
+    if( CHROMA_FORMAT >= CHROMA_422 )
      {
          CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
          CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
@@ -1809,7 +1840,7 @@ void x264_macroblock_cache_save( x264_t *h )
          uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
          uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
          if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
-            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
+            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
          else
              h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
  
diff --git a/common/macroblock.h b/common/macroblock.h

index 7f5d5661ecb92c10caac3fcdaac01fcabfa19c77..12b90c6237e24ddf82e2b913f41dd724a030a3c5 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -397,15 +397,6 @@ static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
  #   define pack_pixel_2to4 pack16to32
  #endif
  
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef))
-#define array_non_zero_int array_non_zero_int
-static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count )
-{
-    for( int i = 0; i < i_count; i++ )
-        if( v[i] )
-            return 1;
-    return 0;
-}
  static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
  {
      const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
diff --git a/common/mc.c b/common/mc.c

index 5352be14c171e60a6dec1345748d8804912086ca..c2b77f58105ab437d11d67af6e2c86b1b9de4bda 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -90,9 +90,11 @@ PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
  PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
  PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
  PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
+PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
  PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
  PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
  PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
+PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
  PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
  PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
  
@@ -330,9 +332,9 @@ void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
      }
  }
  
-static void store_interleave_8x8x2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv )
+static void store_interleave_chroma( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height )
  {
-    for( int y=0; y<8; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
+    for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
          for( int x=0; x<8; x++ )
          {
              dst[2*x]   = srcu[x];
@@ -340,14 +342,14 @@ static void store_interleave_8x8x2( pixel *dst, int i_dst, pixel *srcu, pixel *s
          }
  }
  
-static void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
+static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
  {
-    x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, 8 );
+    x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
  }
  
-static void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
+static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
  {
-    x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, 8 );
+    x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
  }
  
  static void prefetch_fenc_null( pixel *pix_y, int stride_y,
@@ -467,6 +469,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
  {
      pf->mc_luma   = mc_luma;
      pf->get_ref   = get_ref;
+
      pf->mc_chroma = mc_chroma;
  
      pf->avg[PIXEL_16x16]= pixel_avg_16x16;
@@ -474,9 +477,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
      pf->avg[PIXEL_8x16] = pixel_avg_8x16;
      pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
      pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
+    pf->avg[PIXEL_4x16] = pixel_avg_4x16;
      pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
      pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
      pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
+    pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
      pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
      pf->avg[PIXEL_2x2]  = pixel_avg_2x2;
  
@@ -490,9 +495,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
      pf->copy[PIXEL_8x8]   = mc_copy_w8;
      pf->copy[PIXEL_4x4]   = mc_copy_w4;
  
-    pf->store_interleave_8x8x2  = store_interleave_8x8x2;
-    pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc;
-    pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec;
+    pf->store_interleave_chroma       = store_interleave_chroma;
+    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
+    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;
  
      pf->plane_copy = x264_plane_copy_c;
      pf->plane_copy_interleave = x264_plane_copy_interleave_c;
diff --git a/common/mc.h b/common/mc.h

index 15a0a2541b903b94dbffc7962dcf596c32f198fd..09dda5579163a6161cf1e795edfc01993e8988a9 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -62,30 +62,27 @@ extern const x264_weight_t x264_weight_none[3];
  
  typedef struct
  {
-    void (*mc_luma)(pixel *dst, int i_dst, pixel **src, int i_src,
-                    int mvx, int mvy,
-                    int i_width, int i_height, const x264_weight_t *weight );
+    void (*mc_luma)( pixel *dst, int i_dst, pixel **src, int i_src,
+                     int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
  
      /* may round up the dimensions if they're not a power of 2 */
-    pixel* (*get_ref)(pixel *dst, int *i_dst, pixel **src, int i_src,
-                      int mvx, int mvy,
-                      int i_width, int i_height, const x264_weight_t *weight );
+    pixel* (*get_ref)( pixel *dst, int *i_dst, pixel **src, int i_src,
+                       int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
  
      /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
       * so it must be run from left to right. */
-    void (*mc_chroma)(pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
-                      int mvx, int mvy,
-                      int i_width, int i_height );
+    void (*mc_chroma)( pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
+                       int mvx, int mvy, int i_width, int i_height );
  
-    void (*avg[10])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );
+    void (*avg[12])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );
  
      /* only 16x16, 8x8, and 4x4 defined */
      void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height );
      void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height );
  
-    void (*store_interleave_8x8x2)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-    void (*load_deinterleave_8x8x2_fenc)( pixel *dst, pixel *src, int i_src );
-    void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src );
+    void (*store_interleave_chroma)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+    void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, int i_src, int height );
+    void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, int i_src, int height );
  
      void (*plane_copy)( pixel *dst, int i_dst,
                          pixel *src, int i_src, int w, int h );
diff --git a/common/pixel.c b/common/pixel.c

index 91dc1b87339974210cf1742f4b25a58ff4411356..b346681b4bd36a7d8e22acca4211457a85f96e06 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -68,10 +68,10 @@ PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
  PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
  PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
  PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
+PIXEL_SAD_C( x264_pixel_sad_4x16,   4, 16 )
  PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
  PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
  
-
  /****************************************************************************
   * pixel_ssd_WxH
   ****************************************************************************/
@@ -98,6 +98,7 @@ PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
  PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
  PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
  PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
+PIXEL_SSD_C( x264_pixel_ssd_4x16,   4, 16 )
  PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
  PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )
  
@@ -169,11 +170,11 @@ void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pi
  /****************************************************************************
   * pixel_var_wxh
   ****************************************************************************/
-#define PIXEL_VAR_C( name, w ) \
+#define PIXEL_VAR_C( name, w, h ) \
  static uint64_t name( pixel *pix, int i_stride ) \
  {                                             \
      uint32_t sum = 0, sqr = 0;                \
-    for( int y = 0; y < w; y++ )              \
+    for( int y = 0; y < h; y++ )              \
      {                                         \
          for( int x = 0; x < w; x++ )          \
          {                                     \
@@ -185,32 +186,37 @@ static uint64_t name( pixel *pix, int i_stride ) \
      return sum + ((uint64_t)sqr << 32);       \
  }
  
-PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
-PIXEL_VAR_C( x264_pixel_var_8x8,    8 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x16,   8, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8,    8,  8 )
  
  /****************************************************************************
   * pixel_var2_wxh
   ****************************************************************************/
-static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd )
-{
-    uint32_t var = 0, sum = 0, sqr = 0;
-    for( int y = 0; y < 8; y++ )
-    {
-        for( int x = 0; x < 8; x++ )
-        {
-            int diff = pix1[x] - pix2[x];
-            sum += diff;
-            sqr += diff * diff;
-        }
-        pix1 += i_stride1;
-        pix2 += i_stride2;
-    }
-    sum = abs(sum);
-    var = sqr - ((uint64_t)sum * sum >> 6);
-    *ssd = sqr;
-    return var;
+#define PIXEL_VAR2_C( name, w, h, shift ) /* shift must equal log2(w*h) */ \
+static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \
+{ \
+    uint32_t var = 0, sum = 0, sqr = 0; /* sum and squared sum of pixel differences */ \
+    for( int y = 0; y < h; y++ ) \
+    { \
+        for( int x = 0; x < w; x++ ) \
+        { \
+            int diff = pix1[x] - pix2[x]; \
+            sum += diff; \
+            sqr += diff * diff; \
+        } \
+        pix1 += i_stride1; \
+        pix2 += i_stride2; \
+    } \
+    sum = abs(sum); /* only the magnitude matters, it is squared below */ \
+    var = sqr - ((uint64_t)sum * sum >> shift); /* sqr - sum^2/(w*h); a fixed >>6 is only valid for 8x8 */ \
+    *ssd = sqr; /* the plain SSD is written through ssd as a by-product */ \
+    return var; \
  }
  
+PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 ) /* 8*16 = 1<<7 */
+PIXEL_VAR2_C( x264_pixel_var2_8x8,  8,  8, 6 ) /* 8*8  = 1<<6 */
+
  #if BIT_DEPTH > 8
      typedef uint32_t sum_t;
      typedef uint64_t sum2_t;
@@ -309,9 +315,9 @@ PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
  PIXEL_SATD_C( 16, 8,  x264_pixel_satd_8x4 )
  PIXEL_SATD_C( 8,  16, x264_pixel_satd_8x4 )
  PIXEL_SATD_C( 8,  8,  x264_pixel_satd_8x4 )
+PIXEL_SATD_C( 4,  16, x264_pixel_satd_4x4 )
  PIXEL_SATD_C( 4,  8,  x264_pixel_satd_4x4 )
  
-
  static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
  {
      sum2_t tmp[8][4];
@@ -535,6 +541,8 @@ INTRA_MBCMP( sad,  4x4,   v, h, dc,  ,, _c )
  INTRA_MBCMP(satd,  4x4,   v, h, dc,  ,, _c )
  INTRA_MBCMP( sad,  8x8,  dc, h,  v, c,, _c )
  INTRA_MBCMP(satd,  8x8,  dc, h,  v, c,, _c )
+INTRA_MBCMP( sad,  8x16, dc, h,  v, c,, _c )
+INTRA_MBCMP(satd,  8x16, dc, h,  v, c,, _c )
  INTRA_MBCMP( sad, 16x16,  v, h, dc,  ,, _c )
  INTRA_MBCMP(satd, 16x16,  v, h, dc,  ,, _c )
  
@@ -754,23 +762,27 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  #define INIT7_NAME( name1, name2, cpu ) \
      INIT6_NAME( name1, name2, cpu ) \
      pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
+#define INIT8_NAME( name1, name2, cpu ) \
+    INIT7_NAME( name1, name2, cpu ) \
+    pixf->name1[PIXEL_4x16]  = x264_pixel_##name2##_4x16##cpu;
  #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
  #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
  #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
  #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
  #define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
+#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )
  
  #define INIT_ADS( cpu ) \
      pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
      pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
      pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
  
-    INIT7( sad, );
-    INIT7_NAME( sad_aligned, sad, );
+    INIT8( sad, );
+    INIT8_NAME( sad_aligned, sad, );
      INIT7( sad_x3, );
      INIT7( sad_x4, );
-    INIT7( ssd, );
-    INIT7( satd, );
+    INIT8( ssd, );
+    INIT8( satd, );
      INIT7( satd_x3, );
      INIT7( satd_x4, );
      INIT4( hadamard_ac, );
@@ -779,12 +791,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
      pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
      pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
      pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
+    pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16;
      pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;
+    pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16;
+    pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8;
  
      pixf->ssd_nv12_core = pixel_ssd_nv12_core;
      pixf->ssim_4x4x2_core = ssim_4x4x2_core;
      pixf->ssim_end4 = ssim_end4;
-    pixf->var2_8x8 = pixel_var2_8x8;
      pixf->vsad = pixel_vsad;
  
      pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
@@ -793,6 +807,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
      pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8;
      pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c;
      pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c;
+    pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c;
+    pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c;
      pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
      pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;
  
@@ -813,7 +829,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
          pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
          pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
-        pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_mmx2;
  
          pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
          pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
@@ -837,8 +853,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
          pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
          pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
-        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
-        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_sse2;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_sse2;
      }
      if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
      {
@@ -937,7 +953,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
          pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
          pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
-        pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
          pixf->vsad = x264_pixel_vsad_mmx2;
  
          if( cpu&X264_CPU_CACHELINE_32 )
@@ -986,7 +1002,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  #if ARCH_X86_64
          pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
  #endif
-        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_sse2;
          pixf->vsad = x264_pixel_vsad_sse2;
      }
  
@@ -1072,7 +1088,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  #if ARCH_X86_64
          pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
  #endif
-        pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
          if( cpu&X264_CPU_SHUFFLE_IS_FAST )
              pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_ssse3;
          if( cpu&X264_CPU_CACHELINE_64 )
@@ -1154,7 +1170,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
          pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
          pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-        pixf->var2_8x8          = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
  
          pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
          pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
diff --git a/common/pixel.h b/common/pixel.h

index c7ee0fbfc920685fef6a40ddec98eb35283bdee7..d2ea52f5fc301c385a67086736db86b4580dcae5 100644 (file)
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -5,6 +5,7 @@
   *
   * Authors: Loren Merritt <lorenm@u.washington.edu>
   *          Fiona Glaser <fiona@x264.com>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -42,22 +43,19 @@ enum
      PIXEL_8x4   = 4,
      PIXEL_4x8   = 5,
      PIXEL_4x4   = 6,
-    PIXEL_4x2   = 7,
-    PIXEL_2x4   = 8,
-    PIXEL_2x2   = 9,
+
+    /* Subsampled chroma only */
+    PIXEL_4x16  = 7,  /* 4:2:2 */
+    PIXEL_4x2   = 8,
+    PIXEL_2x8   = 9,  /* 4:2:2 */
+    PIXEL_2x4   = 10,
+    PIXEL_2x2   = 11,
  };
  
-static const struct
-{
-    int w;
-    int h;
-} x264_pixel_size[7] =
+static const struct { uint8_t w, h; } x264_pixel_size[12] =
  {
-    { 16, 16 },
-    { 16,  8 }, {  8, 16 },
-    {  8,  8 },
-    {  8,  4 }, {  4,  8 },
-    {  4,  4 }
+    { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 },
+    {  4, 16 }, {  4, 2 }, { 2,  8 }, { 2, 4 }, { 2, 2 },
  };
  
  static const uint8_t x264_size2pixel[5][5] =
@@ -69,23 +67,32 @@ static const uint8_t x264_size2pixel[5][5] =
      { 0, 0,        PIXEL_8x16, 0, PIXEL_16x16 }
  };
  
+static const uint8_t x264_luma2chroma_pixel[4][7] =
+{
+    { 0 },
+    { PIXEL_8x8,   PIXEL_8x4,  PIXEL_4x8,  PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */
+    { PIXEL_8x16,  PIXEL_8x8,  PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */
+    { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */
+};
+
  typedef struct
  {
-    x264_pixel_cmp_t  sad[7];
-    x264_pixel_cmp_t  ssd[7];
-    x264_pixel_cmp_t satd[7];
+    x264_pixel_cmp_t  sad[8];
+    x264_pixel_cmp_t  ssd[8];
+    x264_pixel_cmp_t satd[8];
      x264_pixel_cmp_t ssim[7];
      x264_pixel_cmp_t sa8d[4];
-    x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
-    x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */
-    x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
+    x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */
+    x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */
+    x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */
      x264_pixel_cmp_x3_t fpelcmp_x3[7];
      x264_pixel_cmp_x4_t fpelcmp_x4[7];
-    x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+    x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
      int (*vsad)( pixel *, int, int );
-    int (*var2_8x8)( pixel *, int, pixel *, int, int * );
  
      uint64_t (*var[4])( pixel *pix, int stride );
+    int (*var2[4])( pixel *pix1, int stride1,
+                    pixel *pix2, int stride2, int *ssd );
      uint64_t (*hadamard_ac[4])( pixel *pix, int stride );
  
      void (*ssd_nv12_core)( pixel *pixuv1, int stride1,
@@ -110,12 +117,18 @@ typedef struct
      void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] );
      void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] );
      void (*intra_sad_x3_16x16)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_satd_x3_8x8c)  ( pixel *fenc, pixel *fdec, int res[3] );
-    void (*intra_sad_x3_8x8c)   ( pixel *fenc, pixel *fdec, int res[3] );
      void (*intra_mbcmp_x3_4x4)  ( pixel *fenc, pixel *fdec, int res[3] );
      void (*intra_satd_x3_4x4)   ( pixel *fenc, pixel *fdec, int res[3] );
      void (*intra_sad_x3_4x4)    ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_sad_x3_chroma)  ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_satd_x3_8x16c)  ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_sad_x3_8x16c)   ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_mbcmp_x3_8x8c)  ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_satd_x3_8x8c)   ( pixel *fenc, pixel *fdec, int res[3] );
+    void (*intra_sad_x3_8x8c)    ( pixel *fenc, pixel *fdec, int res[3] );
      void (*intra_mbcmp_x3_8x8)  ( pixel *fenc, pixel edge[36], int res[3] );
      void (*intra_sa8d_x3_8x8)   ( pixel *fenc, pixel edge[36], int res[3] );
      void (*intra_sad_x3_8x8)    ( pixel *fenc, pixel edge[36], int res[3] );
diff --git a/common/predict.c b/common/predict.c

index 34798c2f01d6b391edc8019e9ea0079f4d519593..f5ed64260cd639201a876dc526eed3303fb8405b 100644 (file)
--- a/common/predict.c
+++ b/common/predict.c
@@ -6,6 +6,7 @@
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   *          Loren Merritt <lorenm@u.washington.edu>
   *          Fiona Glaser <fiona@x264.com>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -151,7 +152,7 @@ void x264_predict_16x16_p_c( pixel *src )
  
  
  /****************************************************************************
- * 8x8 prediction for intra chroma block
+ * 8x8 prediction for intra chroma block (4:2:0)
   ****************************************************************************/
  
  static void x264_predict_8x8c_dc_128_c( pixel *src )
@@ -297,6 +298,167 @@ void x264_predict_8x8c_p_c( pixel *src )
      }
  }
  
+/****************************************************************************
+ * 8x16 prediction for intra chroma block (4:2:2)
+ ****************************************************************************/
+
+static void x264_predict_8x16c_dc_128_c( pixel *src )
+{
+    for( int y = 0; y < 16; y++ )
+    {
+        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
+        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
+        src += FDEC_STRIDE;
+    }
+}
+static void x264_predict_8x16c_dc_left_c( pixel *src )
+{
+    for( int i = 0; i < 4; i++ )
+    {
+        int dc = 0;
+
+        for( int y = 0; y < 4; y++ )
+            dc += src[y*FDEC_STRIDE - 1];
+
+        pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 );
+
+        for( int y = 0; y < 4; y++ )
+        {
+            MPIXEL_X4( src+0 ) = dcsplat;
+            MPIXEL_X4( src+4 ) = dcsplat;
+            src += FDEC_STRIDE;
+        }
+    }
+}
+static void x264_predict_8x16c_dc_top_c( pixel *src )
+{
+    int dc0 = 0, dc1 = 0;
+
+    for(int  x = 0; x < 4; x++ )
+    {
+        dc0 += src[x     - FDEC_STRIDE];
+        dc1 += src[x + 4 - FDEC_STRIDE];
+    }
+    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
+
+    for( int y = 0; y < 16; y++ )
+    {
+        MPIXEL_X4( src+0 ) = dc0splat;
+        MPIXEL_X4( src+4 ) = dc1splat;
+        src += FDEC_STRIDE;
+    }
+}
+void x264_predict_8x16c_dc_c( pixel *src )
+{
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0;
+
+    /*
+          s0 s1
+       s2
+       s3
+       s4
+       s5
+    */
+    for( int i = 0; i < 4; i++ )
+    {
+        s0 += src[i+0 - FDEC_STRIDE];
+        s1 += src[i+4 - FDEC_STRIDE];
+        s2 += src[-1 + (i+0)  * FDEC_STRIDE];
+        s3 += src[-1 + (i+4)  * FDEC_STRIDE];
+        s4 += src[-1 + (i+8)  * FDEC_STRIDE];
+        s5 += src[-1 + (i+12) * FDEC_STRIDE];
+    }
+    /*
+       dc0 dc1
+       dc2 dc3
+       dc4 dc5
+       dc6 dc7
+    */
+    pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
+    pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
+    pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
+    pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );
+    pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 );
+    pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 );
+    pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 );
+    pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 );
+
+    for( int y = 0; y < 4; y++ )
+    {
+        MPIXEL_X4( src+0 ) = dc0;
+        MPIXEL_X4( src+4 ) = dc1;
+        src += FDEC_STRIDE;
+    }
+    for( int y = 0; y < 4; y++ )
+    {
+        MPIXEL_X4( src+0 ) = dc2;
+        MPIXEL_X4( src+4 ) = dc3;
+        src += FDEC_STRIDE;
+    }
+    for( int y = 0; y < 4; y++ )
+    {
+        MPIXEL_X4( src+0 ) = dc4;
+        MPIXEL_X4( src+4 ) = dc5;
+        src += FDEC_STRIDE;
+    }
+    for( int y = 0; y < 4; y++ )
+    {
+        MPIXEL_X4( src+0 ) = dc6;
+        MPIXEL_X4( src+4 ) = dc7;
+        src += FDEC_STRIDE;
+    }
+}
+void x264_predict_8x16c_h_c( pixel *src )
+{
+    for( int i = 0; i < 16; i++ )
+    {
+        pixel4 v = PIXEL_SPLAT_X4( src[-1] );
+        MPIXEL_X4( src+0 ) = v;
+        MPIXEL_X4( src+4 ) = v;
+        src += FDEC_STRIDE;
+    }
+}
+void x264_predict_8x16c_v_c( pixel *src )
+{
+    pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
+    pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );
+
+    for( int i = 0; i < 16; i++ )
+    {
+        MPIXEL_X4( src+0 ) = v0;
+        MPIXEL_X4( src+4 ) = v1;
+        src += FDEC_STRIDE;
+    }
+}
+void x264_predict_8x16c_p_c( pixel *src )
+{
+    int H = 0;
+    int V = 0;
+
+    for( int i = 0; i < 4; i++ )
+        H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );
+    for( int i = 0; i < 8; i++ )
+        V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
+
+    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
+    int b = ( 17 * H + 16 ) >> 5;
+    int c = ( 5 * V + 32 ) >> 6;
+    int i00 = a -3*b -7*c + 16;
+
+    for( int y = 0; y < 16; y++ )
+    {
+        int pix = i00;
+        for( int x = 0; x < 8; x++ )
+        {
+            src[x] = x264_clip_pixel( pix>>5 );
+            pix += b;
+        }
+        src += FDEC_STRIDE;
+        i00 += c;
+    }
+}
+
  /****************************************************************************
   * 4x4 prediction for intra luma block
   ****************************************************************************/
@@ -762,6 +924,17 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
  #endif
  }
  
+void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
+{
+    pf[I_PRED_CHROMA_V ]     = x264_predict_8x16c_v_c;
+    pf[I_PRED_CHROMA_H ]     = x264_predict_8x16c_h_c;
+    pf[I_PRED_CHROMA_DC]     = x264_predict_8x16c_dc_c;
+    pf[I_PRED_CHROMA_P ]     = x264_predict_8x16c_p_c;
+    pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c;
+    pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c;
+    pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c;
+}
+
  void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
  {
      pf[I_PRED_8x8_V]      = x264_predict_8x8_v_c;
diff --git a/common/predict.h b/common/predict.h

index 23330f51744746c8d86bd9fdf4411f36100e30e5..8ceb5773596a622b1ba0746ebc22c53028d47284 100644 (file)
--- a/common/predict.h
+++ b/common/predict.h
@@ -42,7 +42,7 @@ enum intra_chroma_pred_e
      I_PRED_CHROMA_DC_TOP  = 5,
      I_PRED_CHROMA_DC_128  = 6
  };
-static const uint8_t x264_mb_pred_mode8x8c_fix[7] =
+static const uint8_t x264_mb_chroma_pred_mode_fix[7] =
  {
      I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
      I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
@@ -123,9 +123,14 @@ void x264_predict_8x8c_dc_c ( pixel *src );
  void x264_predict_8x8c_h_c  ( pixel *src );
  void x264_predict_8x8c_v_c  ( pixel *src );
  void x264_predict_8x8c_p_c  ( pixel *src );
+void x264_predict_8x16c_dc_c( pixel *src );
+void x264_predict_8x16c_h_c ( pixel *src );
+void x264_predict_8x16c_v_c ( pixel *src );
+void x264_predict_8x16c_p_c ( pixel *src );
  
  void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
  void x264_predict_8x8c_init  ( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x16c_init ( int cpu, x264_predict_t pf[7] );
  void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] );
  void x264_predict_8x8_init   ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
  
diff --git a/common/quant.c b/common/quant.c

index 5be7f57f68751b206f48f14fdd1038068c2ffd27..db9d57a8d8098291c51d57344b51d2baf4218ef7 100644 (file)
--- a/common/quant.c
+++ b/common/quant.c
@@ -6,6 +6,7 @@
   * Authors: Loren Merritt <lorenm@u.washington.edu>
   *          Fiona Glaser <fiona@x264.com>
   *          Christian Heine <sennindemokrit@gmx.net>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -141,54 +142,121 @@ static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
      }
  }
  
-static ALWAYS_INLINE void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf )
+#define IDCT_DEQUANT_2X4_START \
+    int a0 = dct[0] + dct[1]; \
+    int a1 = dct[2] + dct[3]; \
+    int a2 = dct[4] + dct[5]; \
+    int a3 = dct[6] + dct[7]; \
+    int a4 = dct[0] - dct[1]; \
+    int a5 = dct[2] - dct[3]; \
+    int a6 = dct[4] - dct[5]; \
+    int a7 = dct[6] - dct[7]; \
+    int b0 = a0 + a1; \
+    int b1 = a2 + a3; \
+    int b2 = a4 + a5; \
+    int b3 = a6 + a7; \
+    int b4 = a0 - a1; \
+    int b5 = a2 - a3; \
+    int b6 = a4 - a5; \
+    int b7 = a6 - a7;
+
+static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
+{
+    IDCT_DEQUANT_2X4_START
+    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
+    dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
+    dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
+    dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
+    dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
+    dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
+    dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
+    dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
+    dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
+}
+
+static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
+{
+    IDCT_DEQUANT_2X4_START
+    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
+    dct[0] = ((b0 + b1) * dmf + 32) >> 6;
+    dct[1] = ((b2 + b3) * dmf + 32) >> 6;
+    dct[2] = ((b0 - b1) * dmf + 32) >> 6;
+    dct[3] = ((b2 - b3) * dmf + 32) >> 6;
+    dct[4] = ((b4 - b5) * dmf + 32) >> 6;
+    dct[5] = ((b6 - b7) * dmf + 32) >> 6;
+    dct[6] = ((b4 + b5) * dmf + 32) >> 6;
+    dct[7] = ((b6 + b7) * dmf + 32) >> 6;
+}
+
+static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
+{
+    IDCT_DEQUANT_2X4_START
+    out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
+    out[1] = ((b2 + b3) * dmf + 2080) >> 6;
+    out[2] = ((b0 - b1) * dmf + 2080) >> 6;
+    out[3] = ((b2 - b3) * dmf + 2080) >> 6;
+    out[4] = ((b4 - b5) * dmf + 2080) >> 6;
+    out[5] = ((b6 - b7) * dmf + 2080) >> 6;
+    out[6] = ((b4 + b5) * dmf + 2080) >> 6;
+    out[7] = ((b6 + b7) * dmf + 2080) >> 6;
+}
+#undef IDCT_DEQUANT_2X4_START
+
+static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
  {
      int d0 = dct[0] + dct[1];
      int d1 = dct[2] + dct[3];
      int d2 = dct[0] - dct[1];
      int d3 = dct[2] - dct[3];
-    out[0] = (d0 + d1) * dequant_mf >> 5;
-    out[1] = (d0 - d1) * dequant_mf >> 5;
-    out[2] = (d2 + d3) * dequant_mf >> 5;
-    out[3] = (d2 - d3) * dequant_mf >> 5;
+    out[0] = ((d0 + d1) * dmf >> 5) + 32;
+    out[1] = ((d0 - d1) * dmf >> 5) + 32;
+    out[2] = ((d2 + d3) * dmf >> 5) + 32;
+    out[3] = ((d2 - d3) * dmf >> 5) + 32;
  }
  
-static ALWAYS_INLINE int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf )
+static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
  {
-    dctcoef out[4];
-    idct_dequant_2x2_dconly( out, dct, dequant_mf );
-    return ((ref[0] ^ (out[0]+32))
-          | (ref[1] ^ (out[1]+32))
-          | (ref[2] ^ (out[2]+32))
-          | (ref[3] ^ (out[3]+32))) >> 6;
+    dctcoef out[8];
+
+    if( chroma422 )
+        optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
+    else
+        optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
+
+    int sum = 0;
+    for( int i = 0; i < (chroma422?8:4); i++ )
+        sum |= ref[i] ^ out[i];
+    return sum >> 6;
  }
  
-static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
  {
      /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
-    dctcoef dct_orig[4];
+    dctcoef dct_orig[8];
      int coeff, nz;
  
-    idct_dequant_2x2_dconly( dct_orig, dct, dequant_mf );
-    dct_orig[0] += 32;
-    dct_orig[1] += 32;
-    dct_orig[2] += 32;
-    dct_orig[3] += 32;
+    if( chroma422 )
+        optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
+    else
+        optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
  
      /* If the DC coefficients already round to zero, terminate early. */
-    if( !((dct_orig[0]|dct_orig[1]|dct_orig[2]|dct_orig[3])>>6) )
+    int sum = 0;
+    for( int i = 0; i < (chroma422?8:4); i++ )
+        sum |= dct_orig[i];
+    if( !(sum >> 6) )
          return 0;
  
      /* Start with the highest frequency coefficient... is this the best option? */
-    for( nz = 0, coeff = 3; coeff >= 0; coeff-- )
+    for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
      {
          int level = dct[coeff];
-        int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */
+        int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
  
          while( level )
          {
              dct[coeff] = level - sign;
-            if( idct_dequant_round_2x2_dc( dct_orig, dct, dequant_mf ) )
+            if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
              {
                  nz = 1;
                  dct[coeff] = level;
@@ -201,6 +269,16 @@ static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
      return nz;
  }
  
+static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) /* 4-coefficient chroma DC entry point; installed as pf->optimize_chroma_2x2_dc */
+{
+    return optimize_chroma_dc_internal( dct, dequant_mf, 0 ); /* chroma422 = 0: shared routine uses the 2x2 transform and 4-entry loops */
+}
+
+static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf ) /* 8-coefficient chroma DC entry point; installed as pf->optimize_chroma_2x4_dc */
+{
+    return optimize_chroma_dc_internal( dct, dequant_mf, 1 ); /* chroma422 = 1: shared routine uses the 2x4 transform and 8-entry loops */
+}
+
  static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
  {
      for( int i = 0; i < size; i++ )
@@ -275,30 +353,20 @@ static int x264_decimate_score64( dctcoef *dct )
      return x264_decimate_score_internal( dct, 64 );
  }
  
-static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count )
-{
-    int i_last = i_count-1;
-    while( i_last >= 0 && l[i_last] == 0 )
-        i_last--;
-    return i_last;
+#define last(num)\
+static int x264_coeff_last##num( dctcoef *l )\
+{\
+    int i_last = num-1;\
+    while( i_last >= 0 && l[i_last] == 0 )\
+        i_last--;\
+    return i_last;\
  }
  
-static int x264_coeff_last4( dctcoef *l )
-{
-    return x264_coeff_last_internal( l, 4 );
-}
-static int x264_coeff_last15( dctcoef *l )
-{
-    return x264_coeff_last_internal( l, 15 );
-}
-static int x264_coeff_last16( dctcoef *l )
-{
-    return x264_coeff_last_internal( l, 16 );
-}
-static int x264_coeff_last64( dctcoef *l )
-{
-    return x264_coeff_last_internal( l, 64 );
-}
+last(4)
+last(8)
+last(15)
+last(16)
+last(64)
  
  #define level_run(num)\
  static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
@@ -317,10 +385,10 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )
  }
  
  level_run(4)
+level_run(8)
  level_run(15)
  level_run(16)
  
-
  void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  {
      pf->quant_8x8 = quant_8x8;
@@ -332,18 +400,24 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
      pf->dequant_4x4_dc = dequant_4x4_dc;
      pf->dequant_8x8 = dequant_8x8;
  
-    pf->optimize_chroma_dc = optimize_chroma_dc;
+    pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
+    pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;
+
+    pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
+    pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;
  
      pf->denoise_dct = x264_denoise_dct;
      pf->decimate_score15 = x264_decimate_score15;
      pf->decimate_score16 = x264_decimate_score16;
      pf->decimate_score64 = x264_decimate_score64;
  
-    pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4;
+    pf->coeff_last4 = x264_coeff_last4;
+    pf->coeff_last8 = x264_coeff_last8;
      pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
      pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
      pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
-    pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4;
+    pf->coeff_level_run4 = x264_coeff_level_run4;
+    pf->coeff_level_run8 = x264_coeff_level_run8;
      pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
      pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
  
@@ -361,16 +435,16 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
              pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
          }
          pf->decimate_score64 = x264_decimate_score64_mmx2;
-        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
+        pf->coeff_last4 = x264_coeff_last4_mmx2;
          pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
          pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
          pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
          pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
          pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
  #endif
-        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
+        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
          if( cpu&X264_CPU_LZCNT )
-            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
+            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
      }
      if( cpu&X264_CPU_SSE2 )
      {
@@ -397,7 +471,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
          if( cpu&X264_CPU_LZCNT )
          {
-            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
+            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
              pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
              pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
              pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
@@ -471,12 +545,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
          pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
  #endif
-        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
-        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
+        pf->coeff_last4 = x264_coeff_last4_mmx2;
+        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
          if( cpu&X264_CPU_LZCNT )
          {
-            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
-            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
+            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
          }
      }
  
@@ -493,7 +567,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
              pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
              pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
          }
-        pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse2;
+        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
          pf->denoise_dct = x264_denoise_dct_sse2;
          pf->decimate_score15 = x264_decimate_score15_sse2;
          pf->decimate_score16 = x264_decimate_score16_sse2;
@@ -524,7 +598,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
          pf->quant_4x4 = x264_quant_4x4_ssse3;
          pf->quant_8x8 = x264_quant_8x8_ssse3;
-        pf->optimize_chroma_dc = x264_optimize_chroma_dc_ssse3;
+        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
          pf->denoise_dct = x264_denoise_dct_ssse3;
          pf->decimate_score15 = x264_decimate_score15_ssse3;
          pf->decimate_score16 = x264_decimate_score16_ssse3;
@@ -541,7 +615,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
          pf->quant_4x4 = x264_quant_4x4_sse4;
          pf->quant_8x8 = x264_quant_8x8_sse4;
-        pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse4;
+        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
      }
  
      if( cpu&X264_CPU_AVX )
@@ -552,7 +626,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
              pf->dequant_4x4 = x264_dequant_4x4_avx;
              pf->dequant_8x8 = x264_dequant_8x8_avx;
          }
-        pf->optimize_chroma_dc = x264_optimize_chroma_dc_avx;
+        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
          pf->denoise_dct = x264_denoise_dct_avx;
      }
  #endif // HAVE_MMX
@@ -571,7 +645,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  
  #if HAVE_ARMV6
      if( cpu&X264_CPU_ARMV6 )
-        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm;
+        pf->coeff_last4 = x264_coeff_last4_arm;
  
      if( cpu&X264_CPU_NEON )
      {
diff --git a/common/quant.h b/common/quant.h

index 09364143612fe6e4e60c85bac396203588147351..9ad5385a5c5659ace33619c4d970fc1d1f181393 100644 (file)
--- a/common/quant.h
+++ b/common/quant.h
@@ -38,7 +38,11 @@ typedef struct
      void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
      void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
  
-    int (*optimize_chroma_dc)( dctcoef dct[4], int dequant_mf );
+    void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+    void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
+
+    int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf );
+    int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf );
  
      void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
  
@@ -46,7 +50,11 @@ typedef struct
      int (*decimate_score16)( dctcoef *dct );
      int (*decimate_score64)( dctcoef *dct );
      int (*coeff_last[14])( dctcoef *dct );
+    int (*coeff_last4)( dctcoef *dct );
+    int (*coeff_last8)( dctcoef *dct );
      int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel );
+    int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel );
+    int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel );
  } x264_quant_function_t;
  
  void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
diff --git a/common/set.h b/common/set.h

index 4bbfea6e39ca70d9695151081c44a3fa0fa1d539..038dbd4cf92aa257fa6545a6be5a62d2fdb73b77 100644 (file)
--- a/common/set.h
+++ b/common/set.h
@@ -35,10 +35,17 @@ enum profile_e
      PROFILE_HIGH    = 100,
      PROFILE_HIGH10  = 110,
      PROFILE_HIGH422 = 122,
-    PROFILE_HIGH444 = 144,
      PROFILE_HIGH444_PREDICTIVE = 244,
  };
  
+enum chroma_format_e /* chroma sampling formats; NOTE(review): values appear to follow H.264 chroma_format_idc (0..3) - confirm */
+{
+    CHROMA_400 = 0, /* presumably 4:0:0 */
+    CHROMA_420 = 1, /* presumably 4:2:0 */
+    CHROMA_422 = 2, /* presumably 4:2:2 */
+    CHROMA_444 = 3, /* presumably 4:4:4 */
+};
+
  enum cqm4_e
  {
      CQM_4IY = 0,
diff --git a/common/vlc.c b/common/vlc.c

index 1d002bbc95a9d472649b29f2f30ea4bb6a9687b5..bd2fc52c0ade7c467c10d3594017b1c72cfeb5e1 100644 (file)
--- a/common/vlc.c
+++ b/common/vlc.c
@@ -5,6 +5,7 @@
   *
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   *          Fiona Glaser <fiona@x264.com>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -26,16 +27,19 @@
  
  #include "common.h"
  
-const vlc_t x264_coeff0_token[5] =
+/* [nC] */
+const vlc_t x264_coeff0_token[6] =
  {
      { 0x1, 1 }, /* str=1 */
      { 0x3, 2 }, /* str=11 */
      { 0xf, 4 }, /* str=1111 */
      { 0x3, 6 }, /* str=000011 */
      { 0x1, 2 }, /* str=01 */
+    { 0x1, 1 }, /* str=1 */
  };
  
-const vlc_t x264_coeff_token[5][16][4] =
+/* [nC][i_total_coeff-1][i_trailing] */
+const vlc_t x264_coeff_token[6][16][4] =
  {
      { /* table 0 */
          { /* i_total 1 */
@@ -440,6 +444,53 @@ const vlc_t x264_coeff_token[5][16][4] =
              { 0x0, 7 }, /* str=0000000 */
          },
      },
+    { /* table 5 */
+        { /* i_total 1 */
+            { 0xf, 7 }, /* str=0001111 */
+            { 0x1, 2 }, /* str=01 */
+        },
+        { /* i_total 2 */
+            { 0xe, 7 }, /* str=0001110 */
+            { 0xd, 7 }, /* str=0001101 */
+            { 0x1, 3 }, /* str=001 */
+        },
+        { /* i_total 3 */
+            { 0x7, 9 }, /* str=000000111 */
+            { 0xc, 7 }, /* str=0001100 */
+            { 0xb, 7 }, /* str=0001011 */
+            { 0x1, 5 }, /* str=00001 */
+        },
+        { /* i_total 4 */
+            { 0x6, 9 }, /* str=000000110 */
+            { 0x5, 9 }, /* str=000000101 */
+            { 0xa, 7 }, /* str=0001010 */
+            { 0x1, 6 }, /* str=000001 */
+        },
+        { /* i_total 5 */
+            { 0x7, 10 }, /* str=0000000111 */
+            { 0x6, 10 }, /* str=0000000110 */
+            { 0x4, 9 },  /* str=000000100 */
+            { 0x9, 7 },  /* str=0001001 */
+        },
+        { /* i_total 6 */
+            { 0x7, 11 }, /* str=00000000111 */
+            { 0x6, 11 }, /* str=00000000110 */
+            { 0x5, 10 }, /* str=0000000101 */
+            { 0x8, 7 },  /* str=0001000 */
+        },
+        { /* i_total 7 */
+            { 0x7, 12 }, /* str=000000000111 */
+            { 0x6, 12 }, /* str=000000000110 */
+            { 0x5, 11 }, /* str=00000000101 */
+            { 0x4, 10 }, /* str=0000000100 */
+        },
+        { /* i_total 8 */
+            { 0x7, 13 }, /* str=0000000000111 */
+            { 0x5, 12 }, /* str=000000000101 */
+            { 0x4, 12 }, /* str=000000000100 */
+            { 0x4, 11 }, /* str=00000000100 */
+        },
+    },
  };
  
  /* [i_total_coeff-1][i_total_zeros] */
@@ -613,7 +664,7 @@ const vlc_t x264_total_zeros[15][16] =
  };
  
  /* [i_total_coeff-1][i_total_zeros] */
-const vlc_t x264_total_zeros_dc[3][4] =
+const vlc_t x264_total_zeros_2x2_dc[3][4] =
  {
      { /* i_total 1 */
          { 0x1, 1 }, /* str=1 */
@@ -632,7 +683,61 @@ const vlc_t x264_total_zeros_dc[3][4] =
      },
  };
  
-/* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before] */
+/* [i_total_coeff-1][i_total_zeros] */
+const vlc_t x264_total_zeros_2x4_dc[7][8] = /* total_zeros codes for 8-coefficient chroma DC; entries read as { code value, bit length } per the str= notes; rows with fewer than 8 initializers are zero-filled by C */
+{
+    { /* i_total 1 */
+        { 0x1, 1 }, /* str=1 */
+        { 0x2, 3 }, /* str=010 */
+        { 0x3, 3 }, /* str=011 */
+        { 0x2, 4 }, /* str=0010 */
+        { 0x3, 4 }, /* str=0011 */
+        { 0x1, 4 }, /* str=0001 */
+        { 0x1, 5 }, /* str=00001 */
+        { 0x0, 5 }, /* str=00000 */
+    },
+    { /* i_total 2 */
+        { 0x0, 3 }, /* str=000 */
+        { 0x1, 2 }, /* str=01 */
+        { 0x1, 3 }, /* str=001 */
+        { 0x4, 3 }, /* str=100 */
+        { 0x5, 3 }, /* str=101 */
+        { 0x6, 3 }, /* str=110 */
+        { 0x7, 3 }, /* str=111 */
+    },
+    { /* i_total 3 */
+        { 0x0, 3 }, /* str=000 */
+        { 0x1, 3 }, /* str=001 */
+        { 0x1, 2 }, /* str=01 */
+        { 0x2, 2 }, /* str=10 */
+        { 0x6, 3 }, /* str=110 */
+        { 0x7, 3 }, /* str=111 */
+    },
+    { /* i_total 4 */
+        { 0x6, 3 }, /* str=110 */
+        { 0x0, 2 }, /* str=00 */
+        { 0x1, 2 }, /* str=01 */
+        { 0x2, 2 }, /* str=10 */
+        { 0x7, 3 }, /* str=111 */
+    },
+    { /* i_total 5 */
+        { 0x0, 2 }, /* str=00 */
+        { 0x1, 2 }, /* str=01 */
+        { 0x2, 2 }, /* str=10 */
+        { 0x3, 2 }, /* str=11 */
+    },
+    { /* i_total 6 */
+        { 0x0, 2 }, /* str=00 */
+        { 0x1, 2 }, /* str=01 */
+        { 0x1, 1 }, /* str=1 */
+    },
+    { /* i_total 7 */
+        { 0x0, 1 }, /* str=0 */
+        { 0x1, 1 }, /* str=1 */
+    }
+};
+
+/* [MIN( i_zero_left-1, 6 )][run_before] */
  const vlc_t x264_run_before[7][16] =
  {
      { /* i_zero_left 1 */
@@ -674,7 +779,7 @@ const vlc_t x264_run_before[7][16] =
          { 0x5, 3 }, /* str=101 */
          { 0x4, 3 }, /* str=100 */
      },
-    { /* i_zero_left 7 */
+    { /* i_zero_left >6 */
          { 0x7, 3 }, /* str=111 */
          { 0x6, 3 }, /* str=110 */
          { 0x5, 3 }, /* str=101 */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index 24a5c3fa8cfa6a8337db374884c24dd96bbba487..f5c0d797060ab04528f05c00b59f7977da26f08f 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1015,10 +1015,9 @@ cglobal plane_copy_interleave_core, 7,7
      RET
  
  ;-----------------------------------------------------------------------------
-; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
+; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
  ;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2, 4,5
-    mov    r4d, 4
+cglobal store_interleave_chroma, 5,5
      FIX_STRIDES r1d
  .loop:
      INTERLEAVE r0+ 0, r2+           0, r3+           0, a
@@ -1026,7 +1025,7 @@ cglobal store_interleave_8x8x2, 4,5
      add    r2, FDEC_STRIDEB*2
      add    r3, FDEC_STRIDEB*2
      lea    r0, [r0+r1*2]
-    dec    r4d
+    sub   r4d, 2
      jg .loop
      REP_RET
  %endmacro ; PLANE_INTERLEAVE
@@ -1076,34 +1075,32 @@ cglobal plane_copy_deinterleave, 6,7
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
+; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
  ;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fenc, 3,4
+cglobal load_deinterleave_chroma_fenc, 4,4
      DEINTERLEAVE_START
-    mov    r3d, 4
      FIX_STRIDES r2d
  .loop:
      DEINTERLEAVE r0+           0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
      DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
      add    r0, FENC_STRIDEB*2
      lea    r1, [r1+r2*2]
-    dec    r3d
+    sub   r3d, 2
      jg .loop
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
+; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
  ;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fdec, 3,4
+cglobal load_deinterleave_chroma_fdec, 4,4
      DEINTERLEAVE_START
-    mov    r3d, 4
      FIX_STRIDES r2d
  .loop:
      DEINTERLEAVE r0+           0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
      DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
      add    r0, FDEC_STRIDEB*2
      lea    r1, [r1+r2*2]
-    dec    r3d
+    sub   r3d, 2
      jg .loop
      REP_RET
  %endmacro ; PLANE_DEINTERLEAVE
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 52e62d6ec2173c6a5ddde4a60bb09d2b6a58e60d..6a730475771be461de4fd741da8707f58620d392 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -113,17 +113,17 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
  void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
                                           uint16_t *dstv, int i_dstv,
                                           uint16_t *src, int i_src, int w, int h );
-void x264_store_interleave_8x8x2_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fenc_sse2( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
-void x264_load_deinterleave_8x8x2_fenc_avx( uint16_t *dst, uint16_t *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_mmx( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_sse2( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_avx( uint16_t *dst, uint16_t *src, int i_src );
+void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
  void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
  void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
  void x264_memzero_aligned_mmx( void * dst, int n );
@@ -497,8 +497,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      if( !(cpu&X264_CPU_MMX) )
          return;
  
-    pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx;
-    pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;
  
      pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
  
@@ -519,7 +519,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  
      pf->plane_copy = x264_plane_copy_mmx2;
      pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
-    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
+    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
  
      pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
      pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmx2;
@@ -552,8 +552,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  
      pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
  
-    pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
-    pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
  
      pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
      pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
@@ -570,7 +570,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->integral_init4v = x264_integral_init4v_sse2;
      pf->integral_init8v = x264_integral_init8v_sse2;
      pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
-    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
+    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
      pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
      pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
  
@@ -603,11 +603,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      if( !(cpu&X264_CPU_AVX) )
          return;
  
-    pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_avx;
-    pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_avx;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
      pf->plane_copy_interleave        = x264_plane_copy_interleave_avx;
      pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_avx;
-    pf->store_interleave_8x8x2       = x264_store_interleave_8x8x2_avx;
+    pf->store_interleave_chroma      = x264_store_interleave_chroma_avx;
  
      if( !(cpu&X264_CPU_STACK_MOD4) )
          pf->mc_chroma = x264_mc_chroma_avx;
@@ -663,9 +663,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  
      if( cpu&X264_CPU_SSE2_IS_FAST )
      {
-        pf->store_interleave_8x8x2  = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium?
-        pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
-        pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
+        pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
          pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
          pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
          pf->mc_luma = mc_luma_sse2;
@@ -695,8 +695,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
      pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
  
-    pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3;
-    pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
      pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
  
      pf->hpel_filter = x264_hpel_filter_ssse3;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm

index 215a7170ac6971226fa2cfc51100547029d325fc..40f9ed5889dfa1c92fa6135049aaac6328289eba 100644 (file)
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -670,10 +670,10 @@ DEQUANT_DC w, pmullw
  %endif
  
  ;-----------------------------------------------------------------------------
-; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+; int x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
  ;-----------------------------------------------------------------------------
  
-%macro OPTIMIZE_CHROMA_DC 0
+%macro OPTIMIZE_CHROMA_2x2_DC 0
  %assign %%regs 5
  %if cpuflag(sse4)
      %assign %%regs %%regs-1
@@ -681,7 +681,7 @@ DEQUANT_DC w, pmullw
  %ifndef ARCH_X86_64
      %assign %%regs %%regs+1      ; t0-t4 are volatile on x86-64
  %endif
-cglobal optimize_chroma_dc, 0,%%regs,7
+cglobal optimize_chroma_2x2_dc, 0,%%regs,7
      movifnidn t0, r0mp
      movd      m2, r1m
      movq      m1, [t0]
@@ -775,13 +775,13 @@ cglobal optimize_chroma_dc, 0,%%regs,7
  
  %ifndef HIGH_BIT_DEPTH
  INIT_XMM sse2
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
  INIT_XMM ssse3
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
  INIT_XMM sse4
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
  INIT_XMM avx
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
  %endif ; !HIGH_BIT_DEPTH
  
  %ifdef HIGH_BIT_DEPTH
diff --git a/common/x86/quant.h b/common/x86/quant.h

index 4abaea09d6b3e9f8d6a735195d093867ef0e0366..8b604720737b834b6e963f02b756cb6f85c6105c 100644 (file)
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -57,10 +57,10 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_
  void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
  void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
  void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-int x264_optimize_chroma_dc_sse2( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_dc_ssse3( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_dc_sse4( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_dc_avx( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf );
  void x264_denoise_dct_mmx  ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
  void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
  void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 69de51740e5e91193f0054d4e8b8c871884cc01f..b5b5a78d0929e9b2ef7eb757de6d1b2b168c4631 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -103,8 +103,8 @@ typedef struct
      int i_satd_pcm;
  
      /* Chroma part */
-    int i_satd_i8x8chroma;
-    int i_satd_i8x8chroma_dir[7];
+    int i_satd_chroma;
+    int i_satd_chroma_dir[7];
      int i_predict8x8chroma;
  
      /* II: Inter part P/B frame */
@@ -431,7 +431,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
      a->i_satd_i16x16 =
      a->i_satd_i8x8   =
      a->i_satd_i4x4   =
-    a->i_satd_i8x8chroma = COST_MAX;
+    a->i_satd_chroma = COST_MAX;
  
      /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
      a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
@@ -607,7 +607,7 @@ static const int8_t i16x16_mode_available[5][5] =
      {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
  };
  
-static const int8_t i8x8chroma_mode_available[5][5] =
+static const int8_t chroma_mode_available[5][5] =
  {
      {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
      {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
@@ -641,11 +641,11 @@ static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour
      return i16x16_mode_available[idx];
  }
  
-static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
+static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
  {
      int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
      idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
-    return i8x8chroma_mode_available[idx];
+    return chroma_mode_available[idx];
  }
  
  static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
@@ -690,45 +690,46 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
  
  static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
  {
-    if( a->i_satd_i8x8chroma < COST_MAX )
+    if( a->i_satd_chroma < COST_MAX )
          return;
  
      if( CHROMA444 )
      {
          if( !h->mb.b_chroma_me )
          {
-            a->i_satd_i8x8chroma = 0;
+            a->i_satd_chroma = 0;
              return;
          }
  
          /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
          h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
          h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
-        a->i_satd_i8x8chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
-                             + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
+                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
          return;
      }
  
-    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
+    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
  
-    /* 8x8 prediction selection for chroma */
+    /* Prediction selection for chroma */
      if( predict_mode[3] >= 0 && !h->mb.b_lossless )
      {
          int satdu[4], satdv[4];
-        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
-        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
-        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
-        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
-        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
-        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
+        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
+        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
+        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
+        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
+        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
  
          for( ; *predict_mode >= 0; predict_mode++ )
          {
              int i_mode = *predict_mode;
              int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
  
-            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
-            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+            a->i_satd_chroma_dir[i_mode] = i_satd;
+            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
          }
      }
      else
@@ -740,20 +741,20 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
  
              /* we do the prediction */
              if( h->mb.b_lossless )
-                x264_predict_lossless_8x8_chroma( h, i_mode );
+                x264_predict_lossless_chroma( h, i_mode );
              else
              {
-                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
-                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
              }
  
              /* we calculate the cost */
-            i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
-                     h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
-                     a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
+            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
+                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
+                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
  
-            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
-            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+            a->i_satd_chroma_dir[i_mode] = i_satd;
+            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
          }
      }
  
@@ -1110,17 +1111,17 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
      /* RD selection for chroma prediction */
      if( !CHROMA444 )
      {
-        const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
          if( predict_mode[1] >= 0 )
          {
              int8_t predict_mode_sorted[4];
              int i_max;
-            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8chroma * 5/4 : COST_MAX;
+            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
  
              for( i_max = 0; *predict_mode >= 0; predict_mode++ )
              {
                  int i_mode = *predict_mode;
-                if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
+                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                      predict_mode_sorted[i_max++] = i_mode;
              }
  
@@ -1131,21 +1132,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                  /* the previous thing encoded was x264_intra_rd(), so the pixels and
                   * coefs for the current chroma mode are still around, so we only
                   * have to recount the bits. */
-                i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
+                i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                  for( int i = 0; i < i_max; i++ )
                  {
                      int i_mode = predict_mode_sorted[i];
                      if( h->mb.b_lossless )
-                        x264_predict_lossless_8x8_chroma( h, i_mode );
+                        x264_predict_lossless_chroma( h, i_mode );
                      else
                      {
-                        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
-                        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                      }
                      /* if we've already found a mode that needs no residual, then
                       * probably any mode with a residual will be worse.
                       * so avoid dct on the remaining modes to improve speed. */
-                    i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
+                    i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                      COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                  }
                  h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
@@ -1273,14 +1274,13 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  
  #define LOAD_FENC(m, src, xoff, yoff) \
  { \
-    int s = !CHROMA444; \
      (m)->p_cost_mv = a->p_cost_mv; \
      (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
      (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
      (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
      (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
-    (m)->p_fenc[1] = &(src)[1][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \
-    (m)->p_fenc[2] = &(src)[2][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \
+    (m)->p_fenc[1] = &(src)[1][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \
+    (m)->p_fenc[2] = &(src)[2][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \
  }
  
  #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
@@ -1301,7 +1301,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
          (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
      } \
      else \
-        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \
+        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>h->mb.chroma_v_shift)*(m)->i_stride[1]]; \
      (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
      (m)->weight = x264_weight_none; \
      (m)->i_ref = ref; \
@@ -1672,19 +1672,22 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
      a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
  }
  
-static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
+static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
+                                                                     pixel **p_fref, int i8x8, int size, int chroma )
  {
-    ALIGNED_ARRAY_16( pixel, pix1,[16*8] );
+    ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
      pixel *pix2 = pix1+8;
-    const int i_stride = h->mb.pic.i_stride[1];
-    const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride;
-    const int i_ref = a->l0.me8x8[i8x8].i_ref;
-    const int mvy_offset = MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    int i_stride = h->mb.pic.i_stride[1];
+    int chroma_h_shift = chroma <= CHROMA_422;
+    int chroma_v_shift = chroma == CHROMA_420;
+    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
+    int i_ref = a->l0.me8x8[i8x8].i_ref;
+    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
      x264_weight_t *weight = h->sh.weight[i_ref];
  
      // FIXME weight can be done on 4x4 blocks even if mc is smaller
  #define CHROMA4x4MC( width, height, me, x, y ) \
-    if( CHROMA444 ) \
+    if( chroma == CHROMA_444 ) \
      { \
          int mvx = (me).mv[0] + 4*2*x; \
          int mvy = (me).mv[1] + 4*2*y; \
@@ -1695,14 +1698,16 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
      } \
      else \
      { \
-        h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+        int offset = x + (2>>chroma_v_shift)*16*y; \
+        int chroma_height = (2>>chroma_v_shift)*height; \
+        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
+                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
          if( weight[1].weightfn ) \
-            weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
          if( weight[2].weightfn ) \
-            weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); \
+            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
      }
  
-
      if( size == PIXEL_4x4 )
      {
          x264_me_t *m = a->l0.me4x4[i8x8];
@@ -1723,13 +1728,24 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
          CHROMA4x4MC( 2,4, m[0], 0,0 );
          CHROMA4x4MC( 2,4, m[1], 2,0 );
      }
+#undef CHROMA4x4MC
  
-    int oe = (8*(i8x8&1) + 4*(i8x8&2)*FENC_STRIDE) >> !CHROMA444;
-    int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4;
+    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
+    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
      return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
           + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
  }
  
+static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
+{   /* Dispatch on CHROMA_FORMAT, handing the ALWAYS_INLINE worker a literal chroma value in each branch. */
+    if( CHROMA_FORMAT == CHROMA_444 )
+        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
+    else if( CHROMA_FORMAT == CHROMA_422 )
+        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
+    else /* every remaining format is handled as 4:2:0 */
+        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
+}
+
  static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
  {
      pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
@@ -1845,47 +1861,46 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *
  {
      ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] );
      ALIGNED_ARRAY_16( pixel,  bi, [2],[16*16] );
-    int l0_mvy_offset, l1_mvy_offset;
      int i_chroma_cost = 0;
+    int chromapix = h->luma2chroma_pixel[i_pixel];
  
  #define COST_BI_CHROMA( m0, m1, width, height ) \
  { \
      if( CHROMA444 ) \
      { \
          h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
-                       m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \
+                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
          h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
-                       m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \
+                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
          h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
-                       m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \
+                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
          h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
-                       m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \
-        h->mc.avg[i_pixel]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-        h->mc.avg[i_pixel]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-        i_chroma_cost  = h->pixf.mbcmp[i_pixel]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \
-        i_chroma_cost += h->pixf.mbcmp[i_pixel]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
+                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
      } \
      else \
      { \
-        l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
-        l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
-        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
-        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
-        h->mc.avg[i_pixel+3]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-        h->mc.avg[i_pixel+3]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-        i_chroma_cost  = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \
-        i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
+        int v_shift = h->mb.chroma_v_shift; \
+        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
+                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
+        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
+                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
      } \
+    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
+                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
  }
  
      if( i_pixel == PIXEL_16x16 )
-        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 8, 8 )
+        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
      else if( i_pixel == PIXEL_16x8 )
-        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 8, 4 )
+        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
      else if( i_pixel == PIXEL_8x16 )
-        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 4, 8 )
+        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
      else
-        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 4, 4 )
+        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
  
      return i_chroma_cost;
  }
@@ -1897,12 +1912,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
  
      pixel *p_fenc = h->mb.pic.p_fenc[0];
      pixel *p_fdec = h->mb.pic.p_fdec[0];
-    int s = !CHROMA444;
  
      a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
      if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
      {
-        int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4;
+        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
+
          for( int i = 0; i < 4; i++ )
          {
              const int x = (i&1)*8;
@@ -1911,10 +1926,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
                                                                &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
              if( h->mb.b_chroma_me )
              {
-                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE,
-                                                                   &h->mb.pic.p_fdec[1][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE )
-                                      +  h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE,
-                                                                   &h->mb.pic.p_fdec[2][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE );
+                int fenc_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FENC_STRIDE;
+                int fdec_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FDEC_STRIDE;
+                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
+                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
+                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
+                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
              }
              a->i_cost16x16direct += a->i_cost8x8direct[i];
  
@@ -1924,10 +1941,10 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
      }
      else
      {
-        int chromapix = CHROMA444 ? PIXEL_16x16 : PIXEL_8x8;
          a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
          if( h->mb.b_chroma_me )
          {
+            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
              a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                                   +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
          }
@@ -2055,7 +2072,6 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
  
          if( h->mb.b_chroma_me )
          {
-            ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] );
              ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
  
              if( CHROMA444 )
@@ -2071,31 +2087,37 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
              }
              else
              {
-                if( MB_INTERLACED & a->l0.bi16x16.i_ref )
+                ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
+                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+                int v_shift = h->mb.chroma_v_shift;
+
+                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
                  {
-                    int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                      h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                       h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
                  }
                  else
-                    h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
+                                                         h->mb.pic.i_stride[1], 16>>v_shift );
  
-                if( MB_INTERLACED & a->l1.bi16x16.i_ref )
+                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
                  {
-                    int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                      h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                       h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
                  }
                  else
-                    h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
+                                                         h->mb.pic.i_stride[1], 16>>v_shift );
  
-                h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
+                h->mc.avg[chromapix]( bi,   FENC_STRIDE, pixuv[0],   FENC_STRIDE, pixuv[1],   FENC_STRIDE,
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-                h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
+                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
  
-                cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
-                       +  h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
+                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi,   FENC_STRIDE )
+                       +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
              }
          }
  
@@ -3172,11 +3194,11 @@ intra_analysis:
                  else
                  {
                      x264_mb_analyse_intra_chroma( h, &analysis );
-                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
+                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
                  }
-                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
-                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
-                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
+                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
+                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
+                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
              }
              else
                  x264_mb_analyse_intra( h, &analysis, i_cost );
@@ -3219,8 +3241,9 @@ intra_analysis:
                      h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
                  if( !CHROMA444 )
                  {
-                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
-                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+                    int height = 16 >> h->mb.chroma_v_shift;
+                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
+                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
                  }
                  x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                  goto intra_analysis;
@@ -3583,11 +3606,11 @@ intra_analysis:
                  else
                  {
                      x264_mb_analyse_intra_chroma( h, &analysis );
-                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma );
+                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                  }
-                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
-                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
-                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
+                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
+                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
+                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
              }
              else
                  x264_mb_analyse_intra( h, &analysis, i_satd_inter );
diff --git a/encoder/cabac.c b/encoder/cabac.c

index 491b4ee7382fab3f447c977d5f5ea6f3a0847b07..c575724e06cac8a21040599c7e80fa2bd7475f97 100644 (file)
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -210,8 +210,8 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int
  
  static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
  {
-    const int i_mode = x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode];
-    int       ctx = 0;
+    int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
+    int ctx = 0;
  
      /* No need to test for I4x4 or I_16x16 as cache_save handle that */
      if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 )
@@ -485,7 +485,7 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
   *                1-> AC 16x16  i_idx = luma4x4idx
   *                2-> Luma4x4   i_idx = luma4x4idx
   *                3-> DC Chroma i_idx = iCbCr
- *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+ *                4-> AC Chroma i_idx = numChroma4x4Blks * iCbCr + chroma4x4idx
   *                5-> Luma8x8   i_idx = luma8x8idx
   */
  
@@ -567,6 +567,7 @@ static const uint8_t last_coeff_flag_offset_8x8[63] =
      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
  };
+static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* sig/last ctx offset for coeff i: MIN( i/2, 2 ) */
  
  // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
  //           4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
@@ -574,6 +575,9 @@ static const uint8_t last_coeff_flag_offset_8x8[63] =
  static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
  /* map node ctx => cabac ctx for level>1 */
  static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+/* 4:2:2 chroma dc uses a slightly different state machine: its last node ctx maps to 8 instead of 9.
+ * 4:2:0 chroma dc doesn't use that last state, so it has identical output with both arrays. */
+static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 };
  static const uint8_t coeff_abs_level_transition[2][8] = {
  /* update node ctx after coding a level=1 */
      { 1, 2, 3, 3, 4, 5, 6, 7 },
@@ -583,18 +587,17 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
  static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63};
  
  #if !RDO_SKIP_BS
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+static ALWAYS_INLINE void block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc )
  {
-    const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
      int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
      int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
      int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
-    int coeff_idx = -1, node_ctx = 0, last;
-    int coeffs[64];
-
-    last = h->quantf.coeff_last[ctx_block_cat]( l );
+    int coeff_idx = -1, node_ctx = 0;
+    int last = h->quantf.coeff_last[ctx_block_cat]( l );
+    const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
+    dctcoef coeffs[64];
  
-#define WRITE_SIGMAP( l8x8 )\
+#define WRITE_SIGMAP( sig_off, last_off )\
  {\
      int i = 0;\
      while( 1 )\
@@ -602,19 +605,18 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
          if( l[i] )\
          {\
              coeffs[++coeff_idx] = l[i];\
-            x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 1 );\
+            x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\
              if( i == last )\
              {\
-                x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\
+                x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\
                  break;\
              }\
              else\
-                x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\
+                x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\
          }\
          else\
-            x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 0 );\
-        i++;\
-        if( i == count_m1 )\
+            x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\
+        if( ++i == count_m1 )\
          {\
              coeffs[++coeff_idx] = l[i];\
              break;\
@@ -622,11 +624,22 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
      }\
  }
  
-    int count_m1 = count_cat_m1[ctx_block_cat];
-    if( count_m1 == 63 )
-        WRITE_SIGMAP( 1 )
+    if( chroma422dc )
+    {
+        int count_m1 = 7;
+        WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] )
+    }
      else
-        WRITE_SIGMAP( 0 )
+    {
+        int count_m1 = count_cat_m1[ctx_block_cat];
+        if( count_m1 == 63 )
+        {
+            const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
+            WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] )
+        }
+        else
+            WRITE_SIGMAP( i, i )
+    }
  
      do
      {
@@ -639,7 +652,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
          if( abs_coeff > 1 )
          {
              x264_cabac_encode_decision( cb, ctx, 1 );
-            ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level;
+            ctx = levelgt1_ctx[node_ctx] + ctx_level;
              for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- )
                  x264_cabac_encode_decision( cb, ctx, 1 );
              if( abs_coeff < 15 )
@@ -658,15 +671,23 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
          x264_cabac_encode_bypass( cb, coeff_sign );
      } while( --coeff_idx >= 0 );
  }
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+    block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 ); /* chroma422dc = 0: generic residual path */
+}
+static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+    /* Template a version specifically for chroma 4:2:2 DC in order to avoid slowing down
+     * everything else with the added complexity. ctx_block_cat is ignored: DCT_CHROMA_DC is forced. */
+    block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 1 );
+}
  #define block_residual_write_cabac_8x8( h, cb, cat, l ) block_residual_write_cabac( h, cb, cat, l )
-
  #else
  
-/* Faster RDO by merging sigmap and level coding.  Note that for 8x8dct
- * this is slightly incorrect because the sigmap is not reversible
- * (contexts are repeated).  However, there is nearly no quality penalty
- * for this (~0.001db) and the speed boost (~30%) is worth it. */
-static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8 )
+/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is
+ * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there
+ * is almost no quality penalty for this (~0.001 dB) and the speed boost (~30%) is worth it. */
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc )
  {
      const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
      int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
@@ -676,17 +697,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
      int coeff_abs = abs(l[last]);
      int ctx = coeff_abs_level1_ctx[0] + ctx_level;
      int node_ctx;
+    const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
  
-    if( last != (b_8x8 ? 63 : count_cat_m1[ctx_block_cat]) )
+    if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) )
      {
-        x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : last), 1 );
-        x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : last), 1 );
+        x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] :
+                                    chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
+        x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] :
+                                    chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
      }
  
      if( coeff_abs > 1 )
      {
          x264_cabac_encode_decision( cb, ctx, 1 );
-        ctx = coeff_abs_levelgt1_ctx[0] + ctx_level;
+        ctx = levelgt1_ctx[0] + ctx_level;
          if( coeff_abs < 15 )
          {
              cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
@@ -712,14 +736,16 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
          if( l[i] )
          {
              coeff_abs = abs(l[i]);
-            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 1 );
-            x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );
+            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
+                                        chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 );
+            x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] :
+                                        chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
              ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
  
              if( coeff_abs > 1 )
              {
                  x264_cabac_encode_decision( cb, ctx, 1 );
-                ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level;
+                ctx = levelgt1_ctx[node_ctx] + ctx_level;
                  if( coeff_abs < 15 )
                  {
                      cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
@@ -741,45 +767,49 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
              }
          }
          else
-            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 0 );
+            x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
+                                        chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
      }
  }
  
  static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
  {
-    block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1 );
+    block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1, 0 );
+}
+static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+    block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 );
  }
  static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
  {
-    block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 );
+    block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0, 0 );
  }
  #endif
  
-#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+#define block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, name )\
  do\
  {\
      int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\
      if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
      {\
          x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
-        block_residual_write_cabac( h, cb, ctx_block_cat, l );\
+        block_residual_write_cabac##name( h, cb, ctx_block_cat, l );\
      }\
      else\
          x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
  } while(0)
  
+#define block_residual_write_cabac_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+    block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, )
+
+#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+    block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, )
+
  #define block_residual_write_cabac_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
-do\
-{\
-    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\
-    if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
-    {\
-        x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
-        block_residual_write_cabac_8x8( h, cb, ctx_block_cat, l );\
-    }\
-    else\
-        x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
-} while(0)
+    block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, _8x8 )
+
+#define block_residual_write_cabac_422_dc_cbf( h, cb, ch, b_intra )\
+    block_residual_write_cabac_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, _422_dc )
  
  static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma )
  {
@@ -808,7 +838,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
                  bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
          if( chroma )
              for( int ch = 1; ch < 3; ch++ )
-                for( int i = 0; i < 8; i++ )
+                for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ )
                      for( int j = 0; j < 8; j++ )
                          bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
  
@@ -968,7 +998,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
          x264_cabac_mb_transform_size( h, cb );
      }
  
-    if( h->mb.i_cbp_luma > 0 || (chroma && h->mb.i_cbp_chroma > 0) || i_mb_type == I_16x16 )
+    if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 )
      {
          const int b_intra = IS_INTRA( i_mb_type );
          x264_cabac_mb_qp_delta( h, cb );
@@ -979,7 +1009,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
              /* DC Luma */
              for( int p = 0; p < plane_count; p++ )
              {
-                block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );
+                block_residual_write_cabac_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );
  
                  /* AC Luma */
                  if( h->mb.i_cbp_luma )
@@ -1054,12 +1084,24 @@ if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy]
  
          if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
          {
-            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
-            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
-            if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
-                for( int ch = 1; ch < 3; ch++ )
-                    for( int i = ch*16; i < ch*16+4; i++ )
-                        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
+            if( CHROMA_FORMAT == CHROMA_422 )
+            {
+                block_residual_write_cabac_422_dc_cbf( h, cb, 0, b_intra );
+                block_residual_write_cabac_422_dc_cbf( h, cb, 1, b_intra );
+            }
+            else
+            {
+                block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
+                block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
+            }
+
+            if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
+            {
+                int step = 8 << h->mb.chroma_v_shift;
+                for( int i = 16; i < 3*16; i += step )
+                    for( int j = i; j < i+4; j++ )
+                        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra );
+            }
          }
      }
  
@@ -1130,8 +1172,19 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
  
          if( h->mb.i_cbp_chroma )
          {
-            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
-            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
+            if( CHROMA_FORMAT == CHROMA_422 )
+            {
+                int offset = (5*i8) & 0x09;
+                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 );
+                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 );
+                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 );
+                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 );
+            }
+            else
+            {
+                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
+                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
+            }
          }
  
          i8 += x264_pixel_size[i_pixel].h >> 3;
@@ -1180,19 +1233,30 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4,
          block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 );
  }
  
-static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
+static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) /* Codes chroma pred mode, chroma cbp and, if cbp is non-zero, the chroma residual into cb. */
  {
      x264_cabac_mb_intra_chroma_pred_mode( h, cb );
      x264_cabac_mb_cbp_chroma( h, cb );
-    if( h->mb.i_cbp_chroma > 0 )
+    if( h->mb.i_cbp_chroma )
      {
-        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
-        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
+        if( CHROMA_FORMAT == CHROMA_422 ) /* 4:2:2 goes through the dedicated _422_dc writer */
+        {
+            block_residual_write_cabac_422_dc_cbf( h, cb, 0, 1 );
+            block_residual_write_cabac_422_dc_cbf( h, cb, 1, 1 );
+        }
+        else
+        {
+            block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
+            block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
+        }
  
          if( h->mb.i_cbp_chroma == 2 )
-            for( int ch = 1; ch < 3; ch++ )
-                for( int i = ch*16; i < ch*16+4; i++ )
-                    block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 );
+        {
+            int step = 8 << h->mb.chroma_v_shift; /* 16 if chroma_v_shift is 1 (i = 16,32), 8 if it is 0 (i = 16,24,32,40) */
+            for( int i = 16; i < 3*16; i += step )
+                for( int j = i; j < i+4; j++ ) /* four consecutive block indices per group */
+                    block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 );
+        }
      }
  }
  #endif
diff --git a/encoder/cavlc.c b/encoder/cavlc.c

index dcf4e9b4d7cc482d927583d62a3dd0c157929e14..07397e0ab27c9fc2def9ad991d3c3ddeb60b5176 100644 (file)
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -122,10 +122,9 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc
  {
      bs_t *s = &h->out.bs;
      static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
-    static const uint8_t count_cat[14] = {16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
+    static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
      x264_run_level_t runlevel;
-    int i_trailing, i_total_zero, i_suffix_length;
-    int i_total = 0;
+    int i_total, i_trailing, i_total_zero, i_suffix_length;
      unsigned int i_sign;
  
      /* level and run and total */
@@ -177,13 +176,17 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc
          }
      }
  
-    if( (uint8_t)i_total < count_cat[ctx_block_cat] )
+    if( ctx_block_cat == DCT_CHROMA_DC )
      {
-        if( ctx_block_cat == DCT_CHROMA_DC )
-            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
-        else
-            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        if( i_total < 8>>h->mb.chroma_v_shift )
+        {
+            vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
+                                                            : x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
+            bs_write_vlc( s, total_zeros );
+        }
      }
+    else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
+        bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
  
      for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
      {
@@ -199,7 +202,8 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
  
  #define block_residual_write_cavlc(h,cat,idx,l)\
  {\
-    int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
+    int nC = cat == DCT_CHROMA_DC ? 3 + CHROMA_FORMAT\
+                                  : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
      uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
      if( !*nnz )\
          bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
@@ -323,7 +327,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
                  bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
          if( chroma )
              for( int ch = 1; ch < 3; ch++ )
-                for( int i = 0; i < 8; i++ )
+                for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ )
                      for( int j = 0; j < 8; j++ )
                          bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
  
@@ -358,14 +362,14 @@ void x264_macroblock_write_cavlc( x264_t *h )
                  bs_write( s, 4, i_mode - (i_mode > i_pred) );
          }
          if( chroma )
-            bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+            bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
      }
      else if( i_mb_type == I_16x16 )
      {
          bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
                          h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
          if( chroma )
-            bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+            bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
      }
      else if( i_mb_type == P_L0 )
      {
@@ -539,10 +543,13 @@ void x264_macroblock_write_cavlc( x264_t *h )
          /* Chroma DC residual present */
          block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
          block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
-        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
-            for( int ch = 1; ch < 3; ch++ )
-                for( int i = ch*16; i < ch*16+4; i++ )
-                    block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+        if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
+        {
+            int step = 8 << h->mb.chroma_v_shift;
+            for( int i = 16; i < 3*16; i += step )
+                for( int j = i; j < i+4; j++ )
+                    block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
+        }
      }
  
  #if !RDO_SKIP_BS
@@ -592,8 +599,19 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
              x264_macroblock_luma_write_cavlc( h, p*4+i8, p*4+i8 );
          if( h->mb.i_cbp_chroma )
          {
-            block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
-            block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
+            if( CHROMA_FORMAT == CHROMA_422 )
+            {
+                int offset = (5*i8) & 0x09;
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 );
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 );
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 );
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 );
+            }
+            else
+            {
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
+            }
          }
          i8 += x264_pixel_size[i_pixel].h >> 3;
      }
@@ -644,18 +662,21 @@ static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
      return h->out.bs.i_bits_encoded;
  }
  
-static int x264_i8x8_chroma_size_cavlc( x264_t *h )
+static int x264_chroma_size_cavlc( x264_t *h ) /* Resets i_bits_encoded to the pred-mode size, writes chroma residual if cbp is set, returns i_bits_encoded. */
  {
-    h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+    h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
      if( h->mb.i_cbp_chroma )
      {
          block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
          block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
  
          if( h->mb.i_cbp_chroma == 2 )
-            for( int ch = 1; ch < 3; ch++ )
-                for( int i = ch*16; i < ch*16+4; i++ )
-                    block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+        {
+            int step = 8 << h->mb.chroma_v_shift; /* 16 if chroma_v_shift is 1 (i = 16,32), 8 if it is 0 (i = 16,24,32,40) */
+            for( int i = 16; i < 3*16; i += step )
+                for( int j = i; j < i+4; j++ ) /* four consecutive block indices per group */
+                    block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
+        }
      }
      return h->out.bs.i_bits_encoded;
  }
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 987b39a472b4c8f5179aa0df19df8f165ac1bd61..4c47a9980431790fe1d2aed790804f407c5a0629 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -71,7 +71,7 @@ static void x264_frame_dump( x264_t *h )
          return;
  
      /* Write the frame in display order */
-    int frame_size = h->param.i_height * h->param.i_width * (3<<CHROMA444)/2 * sizeof(pixel);
+    int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * sizeof(pixel) );
      fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET );
      for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
          for( int y = 0; y < h->param.i_height; y++ )
@@ -79,7 +79,7 @@ static void x264_frame_dump( x264_t *h )
      if( !CHROMA444 )
      {
          int cw = h->param.i_width>>1;
-        int ch = h->param.i_height>>1;
+        int ch = h->param.i_height>>h->mb.chroma_v_shift;
          pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
          pixel *planev = planeu + cw*ch + 16;
          h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
@@ -90,7 +90,6 @@ static void x264_frame_dump( x264_t *h )
      fclose( f );
  }
  
-
  /* Fill "default" values */
  static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
                                      x264_sps_t *sps, x264_pps_t *pps,
@@ -400,6 +399,17 @@ static int x264_validate_parameters( x264_t *h, int b_open )
          return -1;
      }
  #endif
+
+#if HAVE_INTERLACED
+    h->param.b_interlaced = !!PARAM_INTERLACED;
+#else
+    if( h->param.b_interlaced )
+    {
+        x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
+        return -1;
+    }
+#endif
+
      if( h->param.i_width <= 0 || h->param.i_height <= 0 )
      {
          x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
@@ -410,26 +420,30 @@ static int x264_validate_parameters( x264_t *h, int b_open )
      int i_csp = h->param.i_csp & X264_CSP_MASK;
      if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
      {
-        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I444/YV24/BGR/BGRA/RGB supported)\n" );
+        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
          return -1;
      }
  
-    if( i_csp < X264_CSP_I444 && (h->param.i_width % 2 || h->param.i_height % 2) )
+    if( i_csp < X264_CSP_I444 && h->param.i_width % 2 )
      {
-        x264_log( h, X264_LOG_ERROR, "width or height not divisible by 2 (%dx%d)\n",
+        x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n",
                    h->param.i_width, h->param.i_height );
          return -1;
      }
  
-#if HAVE_INTERLACED
-    h->param.b_interlaced = !!PARAM_INTERLACED;
-#else
-    if( h->param.b_interlaced )
+    if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 )
      {
-        x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
+        x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n",
+                  h->param.i_width, h->param.i_height );
+        return -1;
+    }
+
+    if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 )
+    {
+        x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n",
+                  h->param.i_width, h->param.i_height );
          return -1;
      }
-#endif
  
      if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width ||
          (h->param.crop_rect.i_top  + h->param.crop_rect.i_bottom) >= h->param.i_height )
@@ -927,7 +941,8 @@ static void mbcmp_init( x264_t *h )
      memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
      memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
      h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
-    h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
+    h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c;
+    h->pixf.intra_mbcmp_x3_8x8c  = satd ? h->pixf.intra_satd_x3_8x8c  : h->pixf.intra_sad_x3_8x8c;
      h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
      h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
      h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
@@ -938,6 +953,39 @@ static void mbcmp_init( x264_t *h )
      memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
  }
  
+static void chroma_dsp_init( x264_t *h ) /* Installs the h-> table entries that depend on CHROMA_FORMAT. */
+{
+    memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) );
+
+    switch( CHROMA_FORMAT )
+    {
+        case CHROMA_420: /* predict_8x8c table, *_420 deblock entries, 8x8c intra mbcmp, 4-suffixed chroma DC coeff helpers */
+            memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) );
+            h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420;
+            h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra;
+            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff;
+            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff;
+            h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c;
+            h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4;
+            h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4;
+            break;
+        case CHROMA_422: /* predict_8x16c table, *_422 deblock entries, 8x16c intra mbcmp, 8-suffixed chroma DC coeff helpers */
+            memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) );
+            h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422;
+            h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra;
+            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff;
+            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff;
+            h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c;
+            h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8;
+            h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8;
+            break;
+        case CHROMA_444: /* only the mbaff chroma deblock entries are set here; they reuse the luma mbaff entries */
+            h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff;
+            h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff;
+            break;
+    }
+}
+
  static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
  {
      /* VUI */
@@ -1039,6 +1087,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
      h->mb.i_mb_width = h->sps->i_mb_width;
      h->mb.i_mb_height = h->sps->i_mb_height;
      h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height;
+
+    h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422;
+    h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420;
+
      /* Adaptive MBAFF and subme 0 are not supported as we require halving motion
       * vectors during prediction, resulting in hpel mvs.
       * The chosen solution is to make MBAFF non-adaptive in this case. */
@@ -1092,6 +1144,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
      /* init CPU functions */
      x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
      x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
+    x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
      x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
      x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
      if( h->param.b_cabac )
@@ -1109,6 +1162,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
      x264_dct_init_weights();
  
      mbcmp_init( h );
+    chroma_dsp_init( h );
  
      p = buf + sprintf( buf, "using cpu capabilities:" );
      for( int i = 0; x264_cpu_names[i].flags; i++ )
@@ -1238,6 +1292,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
                            h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
                            h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
                            h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") :
+                          h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 == 1 ? "High 4:2:2 Intra" : "High 4:2:2") :
                            h->sps->b_constraint_set3 == 1 ? "High 4:4:4 Intra" : "High 4:4:4 Predictive";
      char level[4];
      snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
@@ -1252,8 +1307,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
      }
      else
      {
+        static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" };
          x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n",
-            profile, level, CHROMA444 ? "4:4:4" : "4:2:0", BIT_DEPTH );
+            profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH );
      }
  
      return h;
@@ -1776,7 +1832,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
       * consistency by copying deblocked pixels between planes. */
      if( PARAM_INTERLACED )
          for( int p = 0; p < h->fdec->i_plane; p++ )
-            for( int i = minpix_y>>(!CHROMA444 && p); i < maxpix_y>>(!CHROMA444 && p); i++ )
+            for( int i = minpix_y>>(h->mb.chroma_v_shift && p); i < maxpix_y>>(h->mb.chroma_v_shift && p); i++ )
                  memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
                          h->fdec->plane[p]     + i*h->fdec->i_stride[p],
                          h->mb.i_mb_width*16*sizeof(pixel) );
@@ -1815,10 +1871,11 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
              if( !CHROMA444 )
              {
                  uint64_t ssd_u, ssd_v;
+                int v_shift = h->mb.chroma_v_shift;
                  x264_pixel_ssd_nv12( &h->pixf,
-                    h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
-                    h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
-                    h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v );
+                    h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1],
+                    h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1],
+                    h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v );
                  h->stat.frame.i_ssd[1] += ssd_u;
                  h->stat.frame.i_ssd[2] += ssd_v;
              }
@@ -2263,7 +2320,7 @@ reencode:
                  else //if( h->mb.i_type == I_4x4 )
                      for( int i = 0; i < 16; i++ )
                          h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
-                h->stat.frame.i_mb_pred_mode[3][x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]]++;
+                h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++;
              }
              h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED;
          }
@@ -3141,7 +3198,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
              h->stat.frame.i_ssd[2],
          };
          int luma_size = h->param.i_width * h->param.i_height;
-        int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2);
+        int chroma_size = CHROMA_SIZE( luma_size );
          double psnr_y = x264_psnr( ssd[0], luma_size );
          double psnr_u = x264_psnr( ssd[1], chroma_size );
          double psnr_v = x264_psnr( ssd[2], chroma_size );
@@ -3232,9 +3289,7 @@ static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_p
   ****************************************************************************/
  void    x264_encoder_close  ( x264_t *h )
  {
-    int luma_size = h->param.i_width * h->param.i_height;
-    int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2);
-    int64_t i_yuv_size = luma_size + chroma_size * 2;
+    int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height );
      int64_t i_mb_count_size[2][7] = {{0}};
      char buf[200];
      int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM]
@@ -3470,7 +3525,7 @@ void    x264_encoder_close  ( x264_t *h )
          }
          for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ )
          {
-            fixed_pred_modes[3][x264_mb_pred_mode8x8c_fix[i]] += h->stat.i_mb_pred_mode[3][i];
+            fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += h->stat.i_mb_pred_mode[3][i];
              sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i];
          }
          if( sum_pred_modes[3] && !CHROMA444 )
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index a8768c570956858e9aad7bc2327e6b44e3c84b38..0dfebb26219b3553bd13c54a762dac262c05ab3a 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -6,6 +6,7 @@
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   *          Loren Merritt <lorenm@u.washington.edu>
   *          Fiona Glaser <fiona@x264.com>
+ *          Henrik Gramner <hengar-6@student.ltu.se>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -40,7 +41,19 @@ static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
  }
  #undef ZIG
  
-#define IDCT_DEQUANT_START \
+static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
+{
+    level[0] = dct[0];
+    level[1] = dct[2];
+    level[2] = dct[1];
+    level[3] = dct[4];
+    level[4] = dct[6];
+    level[5] = dct[3];
+    level[6] = dct[5];
+    level[7] = dct[7];
+}
+
+#define IDCT_DEQUANT_2X2_START \
      int d0 = dct[0] + dct[1]; \
      int d1 = dct[2] + dct[3]; \
      int d2 = dct[0] - dct[1]; \
@@ -49,21 +62,22 @@ static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
  
  static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
  {
-    IDCT_DEQUANT_START
+    IDCT_DEQUANT_2X2_START
      dct4x4[0][0] = (d0 + d1) * dmf >> 5;
      dct4x4[1][0] = (d0 - d1) * dmf >> 5;
      dct4x4[2][0] = (d2 + d3) * dmf >> 5;
      dct4x4[3][0] = (d2 - d3) * dmf >> 5;
  }
  
-static inline void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
+static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
  {
-    IDCT_DEQUANT_START
-    out[0] = (d0 + d1) * dmf >> 5;
-    out[1] = (d0 - d1) * dmf >> 5;
-    out[2] = (d2 + d3) * dmf >> 5;
-    out[3] = (d2 - d3) * dmf >> 5;
+    IDCT_DEQUANT_2X2_START
+    dct[0] = (d0 + d1) * dmf >> 5;
+    dct[1] = (d0 - d1) * dmf >> 5;
+    dct[2] = (d2 + d3) * dmf >> 5;
+    dct[3] = (d2 - d3) * dmf >> 5;
  }
+#undef IDCT_DEQUANT_2X2_START /* name must match the #define used by the idct_dequant_2x2_* helpers above */
  
  static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
  {
@@ -81,6 +95,23 @@ static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
      dct4x4[3][0] = 0;
  }
  
+static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
+{
+    if( WORD_SIZE == 8 )
+    {
+        for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
+            if( M64( &v[i] ) )
+                return 1;
+    }
+    else
+    {
+        for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
+            if( M32( &v[i] ) )
+                return 1;
+    }
+    return 0;
+}
+
  static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
  {
      int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
@@ -236,7 +267,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
              block_cbp |= nz;
          }
          h->mb.i_cbp_luma |= block_cbp * 0xf;
-        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4 );
+        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
          h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
          return;
      }
@@ -278,7 +309,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
  
      h->dctf.dct4x4dc( dct_dc4x4 );
      if( h->mb.b_trellis )
-        nz = x264_quant_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, 0, LUMA_DC+p );
+        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
      else
          nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
  
@@ -306,7 +337,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
   * Unlike luma blocks, this can't be done with a lookup table or
   * other shortcut technique because of the interdependencies
   * between the coefficients due to the chroma DC transform. */
-static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4], int dequant_mf[6][16], int i_qp )
+static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
  {
      int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  
@@ -314,14 +345,18 @@ static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4
      if( dmf > 32*64 )
          return 1;
  
-    return h->quantf.optimize_chroma_dc( dct2x2, dmf );
+    if( chroma422 )
+        return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
+    else
+        return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
  }
  
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
+static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
  {
      int nz, nz_dc;
      int b_decimate = b_inter && h->mb.b_dct_decimate;
-    ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
+    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
+    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
      h->mb.i_cbp_chroma = 0;
      h->nr_count[2] += h->mb.b_noise_reduction * 4;
  
@@ -330,17 +365,26 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
       * Values are experimentally derived. */
      if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
      {
-        int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
          int ssd[2];
-        int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
+
+        int score  = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
          if( score < thresh*4 )
-            score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
+            score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
          if( score < thresh*4 )
          {
              M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
              M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
              M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
              M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
+            if( chroma422 )
+            {
+                M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
+                M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
+                M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
+                M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
+            }
              h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
              h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
  
@@ -348,20 +392,43 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
              {
                  if( ssd[ch] > thresh )
                  {
-                    h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
+                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
+                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];
+
+                    if( chroma422 )
+                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
+                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
+                    else
+                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
+
                      if( h->mb.b_trellis )
-                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch );
+                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
                      else
-                        nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+                    {
+                        nz_dc = 0;
+                        for( int i = 0; i <= chroma422; i++ )
+                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
+                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
+                    }
  
                      if( nz_dc )
                      {
-                        if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+                        if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
                              continue;
                          h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
-                        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
-                        h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
+                        if( chroma422 )
+                        {
+                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
+                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
+                        }
+                        else
+                        {
+                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
+                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
+                        }
+
+                        for( int i = 0; i <= chroma422; i++ )
+                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
                          h->mb.i_cbp_chroma = 1;
                      }
                  }
@@ -377,78 +444,120 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
          int i_decimate_score = 0;
          int nz_ac = 0;
  
-        ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
+        ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
  
          if( h->mb.b_lossless )
          {
-            for( int i = 0; i < 4; i++ )
+            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };
+
+            for( int i = 0; i < (chroma422?8:4); i++ )
              {
-                int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
-                int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
-                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*16], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
-                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz;
+                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
+                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
+                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
+                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
+                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
                  h->mb.i_cbp_chroma |= nz;
              }
-            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch] );
+            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
              continue;
          }
  
-        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+        for( int i = 0; i <= chroma422; i++ )
+            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+
          if( h->mb.b_noise_reduction )
-            for( int i = 0; i < 4; i++ )
+            for( int i = 0; i < (chroma422?8:4); i++ )
                  h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-        dct2x2dc( dct2x2, dct4x4 );
+
+        if( chroma422 )
+            h->dctf.dct2x4dc( dct_dc, dct4x4 );
+        else
+            dct2x2dc( dct_dc, dct4x4 );
+
          /* calculate dct coeffs */
-        for( int i = 0; i < 4; i++ )
+        for( int i = 0; i < (chroma422?8:4); i++ )
          {
              if( h->mb.b_trellis )
                  nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
              else
                  nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
-            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz;
+            h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
              if( nz )
              {
                  nz_ac = 1;
-                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*16], dct4x4[i] );
-                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
+                h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
                  if( b_decimate )
-                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*16] );
+                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
              }
          }
  
          if( h->mb.b_trellis )
-            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch );
+            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
          else
-            nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+        {
+            nz_dc = 0;
+            for( int i = 0; i <= chroma422; i++ )
+                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
+                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
+        }
  
          h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
  
          if( (b_decimate && i_decimate_score < 7) || !nz_ac )
          {
              /* Decimate the block */
-            M16( &h->mb.cache.non_zero_count[x264_scan8[16+0+16*ch]] ) = 0;
-            M16( &h->mb.cache.non_zero_count[x264_scan8[16+2+16*ch]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
+            if( chroma422 )
+            {
+                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
+                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
+            }
+
              if( !nz_dc ) /* Whole block is empty */
                  continue;
-            if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+            if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
              {
                  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                  continue;
              }
              /* DC-only */
-            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
-            h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
+            if( chroma422 )
+            {
+                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
+                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
+            }
+            else
+            {
+                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
+                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
+            }
+
+            for( int i = 0; i <= chroma422; i++ )
+                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
          }
          else
          {
              h->mb.i_cbp_chroma = 1;
+
              if( nz_dc )
              {
-                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                if( chroma422 )
+                {
+                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
+                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
+                }
+                else
+                {
+                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
+                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
+                }
              }
-            h->dctf.add8x8_idct( p_dst, dct4x4 );
+
+            for( int i = 0; i <= chroma422; i++ )
+                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
          }
      }
  
@@ -457,6 +566,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
                             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
  }
  
+void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
+{
+    if( CHROMA_FORMAT == CHROMA_420 )
+        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
+    else
+        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
+}
+
  static void x264_macroblock_encode_skip( x264_t *h )
  {
      M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
@@ -467,7 +584,7 @@ static void x264_macroblock_encode_skip( x264_t *h )
      M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
      M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
      M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
-    if( CHROMA444 )
+    if( CHROMA_FORMAT >= CHROMA_422 )
      {
          M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
          M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
@@ -483,26 +600,32 @@ static void x264_macroblock_encode_skip( x264_t *h )
   * Intra prediction for predictive lossless mode.
   *****************************************************************************/
  
-void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
+void x264_predict_lossless_chroma( x264_t *h, int i_mode )
  {
+    int height = 16 >> h->mb.chroma_v_shift;
      if( i_mode == I_PRED_CHROMA_V )
      {
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 );
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 );
+        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
+        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
          memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
          memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
      }
      else if( i_mode == I_PRED_CHROMA_H )
      {
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 );
-        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 );
+        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
+        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
          x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
          x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
+        if( CHROMA_FORMAT == CHROMA_422 )
+        {
+            x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
+            x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
+        }
      }
      else
      {
-        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
-        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
      }
  }
  
@@ -563,8 +686,9 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
              h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
          if( chroma )
          {
-            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
-            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
+            int height = 16 >> h->mb.chroma_v_shift;
+            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
+            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
          }
          return;
      }
@@ -598,22 +722,26 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
  
              if( chroma )
              {
+                int v_shift = h->mb.chroma_v_shift;
+                int height = 16 >> v_shift;
+
                  /* Special case for mv0, which is (of course) very common in P-skip mode. */
                  if( mvx | mvy )
                      h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                       h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                                     mvx, mvy, 8, 8 );
+                                     mvx, 2*mvy>>v_shift, 8, height );
                  else
-                    h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+                    h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
+                                                         h->mb.pic.i_stride[1], height );
  
                  if( h->sh.weight[0][1].weightfn )
                      h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                         h->mb.pic.p_fdec[1], FDEC_STRIDE,
-                                                       &h->sh.weight[0][1], 8 );
+                                                       &h->sh.weight[0][1], height );
                  if( h->sh.weight[0][2].weightfn )
                      h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                         h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                                       &h->sh.weight[0][2], 8 );
+                                                       &h->sh.weight[0][2], height );
              }
          }
  
@@ -861,18 +989,18 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
      {
          if( IS_INTRA( h->mb.i_type ) )
          {
-            const int i_mode = h->mb.i_chroma_pred_mode;
+            int i_mode = h->mb.i_chroma_pred_mode;
              if( h->mb.b_lossless )
-                x264_predict_lossless_8x8_chroma( h, i_mode );
+                x264_predict_lossless_chroma( h, i_mode );
              else
              {
-                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
-                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
              }
          }
  
          /* encode the 8x8 blocks */
-        x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
+        x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
      }
      else
          h->mb.i_cbp_chroma = 0;
@@ -920,13 +1048,10 @@ void x264_macroblock_encode( x264_t *h )
   *****************************************************************************/
  static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
  {
-    ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
-    ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
+    ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
      ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
      ALIGNED_4( int16_t mvp[2] );
-
      int i_qp = h->mb.i_qp;
-    int thresh, ssd;
  
      for( int p = 0; p < plane_count; p++ )
      {
@@ -966,11 +1091,13 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
          i_qp = h->mb.i_chroma_qp;
      }
  
-    if( chroma )
+    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
      {
-        /* encode chroma */
          i_qp = h->mb.i_chroma_qp;
-        thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int chroma422 = chroma == CHROMA_422;
+        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int ssd;
+        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
  
          if( !b_bidir )
          {
@@ -978,9 +1105,10 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
              if( M32( mvp ) )
                  h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                   h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                                 mvp[0], mvp[1], 8, 8 );
+                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
              else
-                h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
+                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
          }
  
          for( int ch = 0; ch < 2; ch++ )
@@ -991,11 +1119,11 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
              if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                  h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                        h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                      &h->sh.weight[0][1+ch], 8 );
+                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );
  
              /* there is almost never a termination during chroma, but we can't avoid the check entirely */
              /* so instead we check SSD and skip the actual check if the score is low enough. */
-            ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
              if( ssd < thresh )
                  continue;
  
@@ -1003,28 +1131,38 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
               * threshold check, so we can save time by doing a DC-only DCT. */
              if( h->mb.b_noise_reduction )
              {
-                h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
-                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                for( int i = 0; i <= chroma422; i++ )
+                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+
+                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                  {
                      h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                    dct2x2[i4x4] = dct4x4[i4x4][0];
+                    dct_dc[i4x4] = dct4x4[i4x4][0];
                  }
              }
              else
-                h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
+            {
+                if( chroma422 )
+                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
+                else
+                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
+            }
  
-            if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
-                return 0;
+            for( int i = 0; i <= chroma422; i++ )
+                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
+                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
+                    return 0;
  
              /* If there wasn't a termination in DC, we can check against a much higher threshold. */
              if( ssd < thresh*4 )
                  continue;
  
              if( !h->mb.b_noise_reduction )
-                h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+               for( int i = 0; i <= chroma422; i++ )
+                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
  
              /* calculate dct coeffs */
-            for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
+            for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ )
              {
                  dct4x4[i4x4][0] = 0;
                  if( h->mb.b_noise_reduction )
@@ -1045,10 +1183,12 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
  
  int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  {
-    if( CHROMA444 )
-        return x264_macroblock_probe_skip_internal( h, b_bidir, 3, 0 );
+    if( CHROMA_FORMAT == CHROMA_444 )
+        return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
+    else if( CHROMA_FORMAT == CHROMA_422 )
+        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
      else
-        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, 1 );
+        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
  }
  
  /****************************************************************************
@@ -1096,6 +1236,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
      int x = i8&1;
      int y = i8>>1;
      int nz;
+    int chroma422 = chroma == CHROMA_422;
  
      h->mb.i_cbp_chroma = 0;
      h->mb.i_cbp_luma &= ~(1 << i8);
@@ -1128,15 +1269,20 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
              }
              h->mb.i_cbp_luma |= nnz8x8 << i8;
          }
-        if( chroma )
+        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
          {
              for( int ch = 0; ch < 2; ch++ )
              {
                  dctcoef dc;
-                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
-                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
-                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*16], p_fenc, p_fdec, &dc );
-                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz;
+                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
+                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
+
+                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
+                {
+                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
+                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
+                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
+                }
              }
              h->mb.i_cbp_chroma = 0x02;
          }
@@ -1212,30 +1358,36 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
              }
          }
  
-        if( chroma )
+        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
          {
              i_qp = h->mb.i_chroma_qp;
              for( int ch = 0; ch < 2; ch++ )
              {
-                ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
-                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
-                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
-                h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-                if( h->mb.b_noise_reduction )
-                    h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                dct4x4[0] = 0;
+                ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] );
+                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
+                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
  
-                if( h->mb.b_trellis )
-                    nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
-                else
-                    nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-
-                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz;
-                if( nz )
+                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                  {
-                    h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*16], dct4x4 );
-                    h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
-                    h->dctf.add4x4_idct( p_fdec, dct4x4 );
+                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );
+
+                    if( h->mb.b_noise_reduction )
+                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
+                    dct4x4[i4x4][0] = 0;
+
+                    if( h->mb.b_trellis )
+                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
+                    else
+                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+
+                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
+                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
+                    if( nz )
+                    {
+                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
+                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
+                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
+                    }
                  }
              }
              h->mb.i_cbp_chroma = 0x02;
@@ -1246,9 +1398,11 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
  void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  {
      if( CHROMA444 )
-        x264_macroblock_encode_p8x8_internal( h, i8, 3, 0 );
+        x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
+    else if( CHROMA_FORMAT == CHROMA_422 )
+        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
      else
-        x264_macroblock_encode_p8x8_internal( h, i8, 1, 1 );
+        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
  }
  
  /*****************************************************************************
diff --git a/encoder/macroblock.h b/encoder/macroblock.h

index 5e3b188d09dfd07a419175672b767bd854d98b2d..d8ca95dc513cc2a37f2194ab4ae6ff65ec222ab7 100644 (file)
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -41,10 +41,10 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
  #define x264_macroblock_probe_bskip( h )\
      x264_macroblock_probe_skip( h, 1 )
  
-void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
  void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
  void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
  void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
+void x264_predict_lossless_chroma( x264_t *h, int i_mode );
  
  void x264_macroblock_encode      ( x264_t *h );
  void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
@@ -54,12 +54,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
  void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
  void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode );
  void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge );
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
+void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
  
  void x264_cabac_mb_skip( x264_t *h, int b_skip );
  
-int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
+int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
+                                int ctx_block_cat, int b_intra, int idx );
+int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx );
  int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                               int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
  int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
diff --git a/encoder/me.c b/encoder/me.c

index e21f2ca841d40dfc02a1b34f13f9545fb95ac44e..1c8c8bb376c39e4339a6f7892c6941f591b5cb62 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -805,17 +805,16 @@ if( b_refine_qpel || (dir^1) != odir ) \
          } \
          else \
          { \
-            h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+            h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \
+                             mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
              if( m->weight[1].weightfn ) \
-                m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \
-                                                                      &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
-            cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
+                m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \
+            cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
              if( cost < bcost ) \
              { \
                  if( m->weight[2].weightfn ) \
-                    m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \
-                                                                          &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
-                cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+                    m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \
+                cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
              } \
          } \
      } \
@@ -830,7 +829,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
      const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
      const int i_pixel = m->i_pixel;
      const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
-    const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    int chromapix = h->luma2chroma_pixel[i_pixel];
+    int chroma_v_shift = h->mb.chroma_v_shift;
+    int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  
      ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
  
@@ -952,7 +953,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
          }\
          else\
              h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
-                             mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+                             mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\
      }\
  }
  
@@ -976,14 +977,17 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
      ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
      ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
      pixel *src[3][2][9];
-    int chromasize = CHROMA444 ? 8 : 4;
+    int chromapix = h->luma2chroma_pixel[i_pixel];
+    int chroma_v_shift = h->mb.chroma_v_shift;
+    int chroma_x = (8 >> h->mb.chroma_h_shift) * x;
+    int chroma_y = (8 >> chroma_v_shift) * y;
      pixel *pix  = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
-    pixel *pixu = &h->mb.pic.p_fdec[1][chromasize*x + chromasize*y*FDEC_STRIDE];
-    pixel *pixv = &h->mb.pic.p_fdec[2][chromasize*x + chromasize*y*FDEC_STRIDE];
+    pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
+    pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE];
      int ref0 = h->mb.cache.ref[0][s8];
      int ref1 = h->mb.cache.ref[1][s8];
-    const int mv0y_offset = MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    const int mv1y_offset = MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
      int stride[3][2][9];
      int bm0x = m0->mv[0];
      int bm0y = m0->mv[1];
@@ -1071,8 +1075,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
                          }
                          else
                          {
-                            h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
-                            h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+                            h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+                            h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
                          }
                          uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
                          COPY2_IF_LT( bcostrd, costrd, bestj, j );
@@ -1153,13 +1157,12 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
          } \
          else if( m->i_pixel <= PIXEL_8x8 ) \
          { \
-            h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+            h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \
+                             mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
              if( m->weight[1].weightfn ) \
-                m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, \
-                                                                      &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
+                m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \
              if( m->weight[2].weightfn ) \
-                m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, \
-                                                                      &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
+                m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \
          } \
          cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
          COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
@@ -1173,7 +1176,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
      const int bw = x264_pixel_size[m->i_pixel].w;
      const int bh = x264_pixel_size[m->i_pixel].h;
      const int i_pixel = m->i_pixel;
-    const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    int chroma_v_shift = h->mb.chroma_v_shift;
+    int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  
      uint64_t bcost = COST_MAX64;
      int bmx = m->mv[0];
@@ -1193,8 +1197,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
      }
      else
      {
-        pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
-        pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+        pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
+        pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
      }
  
      h->mb.b_skip_mc = 1;
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index a64a9997244807c36e738bdd346cd94402aaab48..dfe522480e6f8e9359874aab6cbc0e788ff01242 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -219,18 +219,21 @@ static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_f
  
  static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store )
  {
-    int w = b_chroma ? 8 : 16;
+    int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16;
      int stride = frame->i_stride[i];
      int offset = b_field
-        ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride
-        : 16 * mb_x + w * mb_y * stride;
+        ? 16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride
+        : 16 * mb_x + height * mb_y * stride;
      stride <<= b_field;
      if( b_chroma )
      {
-        ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
-        h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
-        return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1, b_store )
-             + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2, b_store );
+        ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
+        int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+        int shift = 7 - h->mb.chroma_v_shift;
+
+        h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height );
+        return ac_energy_var( h->pixf.var[chromapix]( pix,               FENC_STRIDE ), shift, frame, 1, b_store )
+             + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store );
      }
      else
          return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store );
@@ -379,9 +382,8 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
      {
          uint64_t ssd = frame->i_pixel_ssd[i];
          uint64_t sum = frame->i_pixel_sum[i];
-        int size = CHROMA444 || !i ? 16 : 8;
-        int width = h->mb.i_mb_width*size;
-        int height = h->mb.i_mb_height*size;
+        int width  = 16*h->mb.i_mb_width  >> (i && h->mb.chroma_h_shift);
+        int height = 16*h->mb.i_mb_height >> (i && h->mb.chroma_v_shift);
          frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height);
      }
  }
@@ -1279,8 +1281,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
          if( h->param.b_bluray_compat )
              mincr = 4;
  
-        /* High 10 / High 4:4:4 Predictive doesn't require minCR, so just set the maximum to a large value. */
-        if( h->sps->i_profile_idc >= PROFILE_HIGH10 )
+        /* Profiles above High don't require minCR, so just set the maximum to a large value. */
+        if( h->sps->i_profile_idc > PROFILE_HIGH )
              rc->frame_size_maximum = 1e9;
          else
          {
diff --git a/encoder/rdo.c b/encoder/rdo.c

index f994fa02e19428a270682ffb15cbfa9664f5f313..4ca075084ffafc472d8430291fb6203872a5e2e2 100644 (file)
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -146,7 +146,7 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
  
  static inline int ssd_mb( x264_t *h )
  {
-    int chroma_size = CHROMA444 ? PIXEL_16x16 : PIXEL_8x8;
+    int chroma_size = h->luma2chroma_pixel[PIXEL_16x16];
      int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0);
      chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
      return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd;
@@ -227,7 +227,6 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
  {
      uint64_t i_ssd, i_bits;
      int i8 = i4 >> 2;
-    int chromassd;
  
      if( i_pixel == PIXEL_16x16 )
      {
@@ -246,19 +245,13 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
      if( i_pixel == PIXEL_8x16 )
          x264_macroblock_encode_p8x8( h, i8+2 );
  
-    i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 );
-    if( CHROMA444 )
-    {
-        chromassd = ssd_plane( h, i_pixel, 1, (i8&1)*8, (i8>>1)*8 )
-                  + ssd_plane( h, i_pixel, 2, (i8&1)*8, (i8>>1)*8 );
-    }
-    else
-    {
-        chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
-                  + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
-    }
-    chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-    i_ssd += chromassd;
+    int ssd_x = 8*(i8&1);
+    int ssd_y = 8*(i8>>1);
+    i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y );
+    int chromapix = h->luma2chroma_pixel[i_pixel];
+    int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift )
+                  + ssd_plane( h, chromapix, 2, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift );
+    i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
  
      if( h->param.b_cabac )
      {
@@ -343,14 +336,16 @@ static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode
      return (i_ssd<<8) + i_bits;
  }
  
-static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
+static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
  {
      uint64_t i_ssd, i_bits;
  
      if( b_dct )
-        x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
-    i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
-            ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
+        x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp );
+
+    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+    i_ssd = ssd_plane( h, chromapix, 1, 0, 0 )
+          + ssd_plane( h, chromapix, 2, 0, 0 );
  
      h->mb.i_chroma_pred_mode = i_mode;
  
@@ -358,11 +353,11 @@ static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode,
      {
          x264_cabac_t cabac_tmp;
          COPY_CABAC;
-        x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
+        x264_chroma_size_cabac( h, &cabac_tmp );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
-        i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
+        i_bits = x264_chroma_size_cavlc( h ) * i_lambda2;
  
      return (i_ssd<<8) + i_bits;
  }
@@ -443,7 +438,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                           int ctx_block_cat, int i_lambda2, int b_ac,
                           int b_chroma, int dc, int i_coefs, int idx )
  {
-    int abs_coefs[64], signs[64];
+    udctcoef abs_coefs[64];
+    int8_t signs[64];
      trellis_node_t nodes[2][8];
      trellis_node_t *nodes_cur = nodes[0];
      trellis_node_t *nodes_prev = nodes[1];
@@ -451,6 +447,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
      const int b_interlaced = MB_INTERLACED;
      uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
      uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
+    const uint8_t *levelgt1_ctx = b_chroma && dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
      const int f = 1 << 15; // no deadzone
      int i_last_nnz;
      int i;
@@ -486,7 +483,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
      {
          int coef = dct[zigzag[i]];
          abs_coefs[i] = abs(coef);
-        signs[i] = coef < 0 ? -1 : 1;
+        signs[i] = coef>>31 | 1;
      }
  
      /* init trellis */
@@ -519,7 +516,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
          {
              // no need to calculate ssd of 0s: it's the same in all nodes.
              // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
-            int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i;
+            int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
+                           b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
              const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
                                       * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
              for( int j = 1; j < 8; j++ )
@@ -546,8 +544,10 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
  
          if( i < i_coefs-1 )
          {
-            int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i;
-            int lastindex = i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
+            int sigindex  = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
+                            b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
+            int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] :
+                            b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
              cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
              cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
              cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 );
@@ -599,7 +599,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                          f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
                          if( i_prefix > 0 )
                          {
-                            uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
+                            uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]];
                              f8_bits += cabac_size_unary[i_prefix][*ctx];
                              *ctx = cabac_transition_unary[i_prefix][*ctx];
                              if( abs_level >= 15 )
@@ -695,7 +695,8 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
      int64_t score = 1ULL<<62;
      int i, j;
      const int f = 1<<15;
-    int nC = ctx_block_cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, ctx_block_cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];
+    int nC = b_chroma && dc ? 3 + (i_coefs>>2)
+                            : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )];
  
      /* Code for handling 8x8dct -> 4x4dct CAVLC munging.  Input/output use a different
       * step/start/end than internal processing. */
@@ -857,24 +858,46 @@ zeroblock:
      return 0;
  }
  
-const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
-
-int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                           int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
+int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx )
  {
      if( h->param.b_cabac )
          return quant_trellis_cabac( h, dct,
-            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
-            NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx );
+            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
+            ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx );
+
+    return quant_trellis_cavlc( h, dct,
+        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
+        DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 );
+}
  
-    if( ctx_block_cat != DCT_CHROMA_DC )
-        ctx_block_cat = DCT_LUMA_DC;
+static const uint8_t x264_zigzag_scan2x2[4] = { 0, 1, 2, 3 };
+static const uint8_t x264_zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 };
+
+int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx )
+{
+    const uint8_t *zigzag;
+    int num_coefs;
+    int quant_cat = CQM_4IC+1 - b_intra;
+
+    if( CHROMA_FORMAT == CHROMA_422 )
+    {
+        zigzag = x264_zigzag_scan2x4;
+        num_coefs = 8;
+    }
+    else
+    {
+        zigzag = x264_zigzag_scan2x2;
+        num_coefs = 4;
+    }
+
+    if( h->param.b_cabac )
+        return quant_trellis_cabac( h, dct,
+            h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
+            DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx );
  
      return quant_trellis_cavlc( h, dct,
-        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
-        NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
-        ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx, 0 );
+        h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
+        DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 );
  }
  
  int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
diff --git a/encoder/set.c b/encoder/set.c

index a498c9456f5e098f441b4f764534de3bc8885278..5e1ff642a53e68fdaac93a874b9d893674f309c4 100644 (file)
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -104,11 +104,14 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
      sps->i_id = i_id;
      sps->i_mb_width = ( param->i_width + 15 ) / 16;
      sps->i_mb_height= ( param->i_height + 15 ) / 16;
-    sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? 3 : 1;
+    sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 :
+                               csp >= X264_CSP_I422 ? CHROMA_422 : CHROMA_420;
  
      sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
-    if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == 3 )
+    if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 )
          sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
+    else if( sps->i_chroma_format_idc == CHROMA_422 )
+        sps->i_profile_idc  = PROFILE_HIGH422;
      else if( BIT_DEPTH > 8 )
          sps->i_profile_idc  = PROFILE_HIGH10;
      else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
@@ -132,11 +135,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
          sps->b_constraint_set3 = 1; /* level 1b with Baseline, Main or Extended profile is signalled via constraint_set3 */
          sps->i_level_idc      = 11;
      }
-    /* High 10 Intra profile */
-    if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH10 )
-        sps->b_constraint_set3 = 1;
-    /* High 4:4:4 Intra profile */
-    if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH444_PREDICTIVE )
+    /* Intra profiles */
+    if( param->i_keyint_max == 1 && sps->i_profile_idc > PROFILE_HIGH )
          sps->b_constraint_set3 = 1;
  
      sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
@@ -302,11 +302,12 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
      bs_write1( s, sps->b_crop );
      if( sps->b_crop )
      {
-        int cropshift = sps->i_chroma_format_idc != 3;
-        bs_write_ue( s, sps->crop.i_left   >> cropshift );
-        bs_write_ue( s, sps->crop.i_right  >> cropshift );
-        bs_write_ue( s, sps->crop.i_top    >> cropshift );
-        bs_write_ue( s, sps->crop.i_bottom >> cropshift );
+        int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422;
+        int v_shift = sps->i_chroma_format_idc == CHROMA_420;
+        bs_write_ue( s, sps->crop.i_left   >> h_shift );
+        bs_write_ue( s, sps->crop.i_right  >> h_shift );
+        bs_write_ue( s, sps->crop.i_top    >> v_shift );
+        bs_write_ue( s, sps->crop.i_bottom >> v_shift );
      }
  
      bs_write1( s, sps->b_vui );
@@ -757,7 +758,7 @@ int x264_validate_levels( x264_t *h, int verbose )
      int ret = 0;
      int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
      int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
-    int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH444_PREDICTIVE ? 16 :
+    int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 :
                       h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
                       h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
  
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index 5a91c167f33d27194631df9fb20fc72edc21937e..0acda252283610fa82693fe8cd8546c917cac5cb 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -96,12 +96,11 @@ static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc
      return ref->lowres[0];
  }
  
-/* How data is organized for chroma weightp 4:2:0:
+/* How data is organized for 4:2:0/4:2:2 chroma weightp:
   * [U: ref] [U: fenc]
   * [V: ref] [V: fenc]
   * fenc = ref + offset
- * v = u + stride * chroma height
- * We'll need more room if we do 4:2:2. */
+ * v = u + stride * chroma height */
  
  static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
  {
@@ -110,21 +109,23 @@ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc
      int i_offset = i_stride / 2;
      int i_lines = fenc->i_lines[1];
      int i_width = fenc->i_width[1];
-    int cw = h->mb.i_mb_width  << 3;
-    int ch = h->mb.i_mb_height << 3;
+    int v_shift = h->mb.chroma_v_shift;
+    int cw = 8*h->mb.i_mb_width;
+    int ch = 16*h->mb.i_mb_height >> v_shift;
+    int height = 16 >> v_shift;
  
      if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
      {
          x264_frame_expand_border_chroma( h, ref, 1 );
-        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride )
+        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride )
              for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 )
              {
                  pixel *pixu = dstu + pel_offset_y + pel_offset_x;
                  pixel *pixv = dstv + pel_offset_y + pel_offset_x;
-                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12 */
+                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */
                  int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0];
                  int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1];
-                h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, mvy, 8, 8 );
+                h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height );
              }
      }
      else
@@ -223,15 +224,17 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
      int i_lines = fenc->i_lines[1];
      int i_width = fenc->i_width[1];
      pixel *src = ref + i_offset;
-    ALIGNED_ARRAY_16( pixel, buf, [8*8] );
+    ALIGNED_ARRAY_16( pixel, buf, [8*16] );
      int pixoff = 0;
+    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+    int height = 16 >> h->mb.chroma_v_shift;
      ALIGNED_16( static pixel flat[8] ) = {0};
      if( w )
      {
-        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
              for( int x = 0; x < i_width; x += 8, pixoff += 8 )
              {
-                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, 8 );
+                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height );
                  /* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
                   * But testing shows that for chroma the DC coefficient is by far the most
                   * important part of the coding cost.  Thus a more useful chroma weight is
@@ -239,16 +242,16 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
                   * pixels.
                   *
                   * FIXME: add a (faster) asm sum function to replace sad. */
-                cost += abs( h->pixf.sad_aligned[PIXEL_8x8](          buf,        8, flat, 0 ) -
-                             h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
+                cost += abs( h->pixf.sad_aligned[chromapix](          buf,        8, flat, 0 ) -
+                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
              }
          cost += x264_weight_slice_header_cost( h, w, 1 );
      }
      else
-        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
              for( int x = 0; x < i_width; x += 8, pixoff += 8 )
-                cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( &ref[pixoff], i_stride, flat, 0 ) -
-                             h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
+                cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) -
+                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
      x264_emms();
      return cost;
  }
diff --git a/filters/video/depth.c b/filters/video/depth.c

index 25dde257f89de189c995efd56648ad5f21346834..9ea2cbcf86ae701884b933ec1eeeddf038d00876 100644 (file)
--- a/filters/video/depth.c
+++ b/filters/video/depth.c
@@ -46,15 +46,17 @@ static int depth_filter_csp_is_supported( int csp )
      return csp_mask == X264_CSP_I420 ||
             csp_mask == X264_CSP_I422 ||
             csp_mask == X264_CSP_I444 ||
-           csp_mask == X264_CSP_YV24 ||
             csp_mask == X264_CSP_YV12 ||
-           csp_mask == X264_CSP_NV12;
+           csp_mask == X264_CSP_YV16 ||
+           csp_mask == X264_CSP_YV24 ||
+           csp_mask == X264_CSP_NV12 ||
+           csp_mask == X264_CSP_NV16;
  }
  
  static int csp_num_interleaved( int csp, int plane )
  {
      int csp_mask = csp & X264_CSP_MASK;
-    return ( csp_mask == X264_CSP_NV12 && plane == 1 ) ? 2 : 1;
+    return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1;
  }
  
  /* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been
diff --git a/filters/video/resize.c b/filters/video/resize.c

index 878a4d77bb0f051b07fc6a0a76908b9b8ff7361e..876872686fba47001c2527a4dee1f84c675a10b4 100644 (file)
--- a/filters/video/resize.c
+++ b/filters/video/resize.c
@@ -137,6 +137,7 @@ static int convert_csp_to_pix_fmt( int csp )
      {
          case X264_CSP_YV12: /* specially handled via swapping chroma */
          case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV420P16 : PIX_FMT_YUV420P;
+        case X264_CSP_YV16: /* specially handled via swapping chroma */
          case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV422P16 : PIX_FMT_YUV422P;
          case X264_CSP_YV24: /* specially handled via swapping chroma */
          case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV444P16 : PIX_FMT_YUV444P;
@@ -467,11 +468,11 @@ static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x2
      h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
      h->scale = h->dst;
  
-    /* swap chroma planes if YV12/YV24 is involved, as libswscale works with I420/I444 */
+    /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */
      int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER);
      int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER);
-    h->pre_swap_chroma  = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV24;
-    h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV24;
+    h->pre_swap_chroma  = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24;
+    h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24;
  
      int src_pix_fmt = convert_csp_to_pix_fmt( info->csp );
  
diff --git a/input/avs.c b/input/avs.c

index 59fab8cc5c682318d3454e9232838c55f52ed492..0169746dbd2adb2219dd77ee21387e17805ee23a 100644 (file)
--- a/input/avs.c
+++ b/input/avs.c
@@ -219,15 +219,22 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
      }
  #if !HAVE_SWSCALE
      /* if swscale is not available, convert the CSP if necessary */
-    if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) ||
-        (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) )
+    if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) ||
+        (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) )
      {
-        FAIL_IF_ERROR( avs_version < 2.6f && opt->output_csp == X264_CSP_I444, "avisynth >= 2.6 is required for i444 output\n" )
+        FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444),
+                       "avisynth >= 2.6 is required for i422/i444 output\n" )
  
-        const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : (opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB");
+        const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" :
+                          opt->output_csp == X264_CSP_I422 ? "YV16" :
+                          opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB";
          x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp );
-        FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && (vi->width&1 || vi->height&1),
-                       "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
+        FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1),
+                       "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height )
+        FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3),
+                       "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height )
+        FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1),
+                       "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
          const char *arg_name[2] = { NULL, "interlaced" };
          AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
          char conv_func[14] = { "ConvertTo" };
@@ -251,13 +258,13 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
          info->csp = X264_CSP_BGR | X264_CSP_VFLIP;
      else if( avs_is_yv24( vi ) )
          info->csp = X264_CSP_I444;
+    else if( avs_is_yv16( vi ) )
+        info->csp = X264_CSP_I422;
      else if( avs_is_yv12( vi ) )
          info->csp = X264_CSP_I420;
  #if HAVE_SWSCALE
      else if( avs_is_yuy2( vi ) )
          info->csp = PIX_FMT_YUYV422 | X264_CSP_OTHER;
-    else if( avs_is_yv16( vi ) )
-        info->csp = X264_CSP_I422;
      else if( avs_is_yv411( vi ) )
          info->csp = PIX_FMT_YUV411P | X264_CSP_OTHER;
      else if( avs_is_y8( vi ) )
diff --git a/input/input.c b/input/input.c

index 084499ae24494e0a381e019df3593ef15af68fce..27c2c3df91d2b72c3352ddf8def49d5428901601 100644 (file)
--- a/input/input.c
+++ b/input/input.c
@@ -29,9 +29,11 @@ const x264_cli_csp_t x264_cli_csps[] = {
      [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
      [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
      [X264_CSP_I444] = { "i444", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
-    [X264_CSP_YV24] = { "yv24", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
      [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
+    [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
+    [X264_CSP_YV24] = { "yv24", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
      [X264_CSP_NV12] = { "nv12", 2, { 1,  1 },     { 1, .5 },     2, 2 },
+    [X264_CSP_NV16] = { "nv16", 2, { 1,  1 },     { 1,  1 },     2, 1 },
      [X264_CSP_BGR]  = { "bgr",  1, { 3 },         { 1 },         1, 1 },
      [X264_CSP_BGRA] = { "bgra", 1, { 4 },         { 1 },         1, 1 },
      [X264_CSP_RGB]  = { "rgb",  1, { 3 },         { 1 },         1, 1 },
diff --git a/input/input.h b/input/input.h

index 4a4bb0b2e8a2d9dd0ee3b326e19dd7cdd686fd0f..bd7e4218ca179b4156f96f7a52006c58a1785515 100644 (file)
--- a/input/input.h
+++ b/input/input.h
@@ -103,8 +103,7 @@ extern cli_input_t timecode_input;
  extern cli_input_t cli_input;
  
  /* extended colorspace list that isn't supported by libx264 but by the cli */
-#define X264_CSP_I422           X264_CSP_MAX     /* yuv 4:2:2 planar    */
-#define X264_CSP_CLI_MAX       (X264_CSP_MAX+1)  /* end of list         */
+#define X264_CSP_CLI_MAX        X264_CSP_MAX     /* end of list         */
  #define X264_CSP_OTHER          0x4000           /* non x264 colorspace */
  
  typedef struct
diff --git a/tools/checkasm.c b/tools/checkasm.c

index bb1fafc8758133211e328dc507d6db0a80b1a5cc..0eb1ed54e3a8a2ad643edb57e40b888a8d1c8959 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -77,11 +77,12 @@ const char *bench_pattern = "";
  char func_name[100];
  static bench_func_t benchs[MAX_FUNCS];
  
-static const char *pixel_names[10] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x2", "2x4", "2x2" };
+static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" };
  static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
  static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
  static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
  static const char **intra_predict_8x8_names = intra_predict_4x4_names;
+static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;
  
  #define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
  
@@ -274,7 +275,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
  
  #define TEST_PIXEL( name, align ) \
      ok = 1, used_asm = 0; \
-    for( int i = 0; i < 7; i++ ) \
+    for( int i = 0; i < 8; i++ ) \
      { \
          int res_c, res_asm; \
          if( pixel_asm.name[i] != pixel_ref.name[i] ) \
@@ -374,24 +375,28 @@ static int check_pixel( int cpu_ref, int cpu_new )
  
      ok = 1; used_asm = 0;
      TEST_PIXEL_VAR( PIXEL_16x16 );
+    TEST_PIXEL_VAR( PIXEL_8x16 );
      TEST_PIXEL_VAR( PIXEL_8x8 );
      report( "pixel var :" );
  
-    ok = 1; used_asm = 0;
-    if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
-    {
-        int res_c, res_asm, ssd_c, ssd_asm;
-        set_func_name( "var2_8x8" );
-        used_asm = 1;
-        res_c   = call_c( pixel_c.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_c );
-        res_asm = call_a( pixel_asm.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_asm );
-        if( res_c != res_asm || ssd_c != ssd_asm )
-        {
-            ok = 0;
-            fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm );
-        }
+#define TEST_PIXEL_VAR2( i ) \
+    if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
+    { \
+        int res_c, res_asm, ssd_c, ssd_asm; \
+        set_func_name( "%s_%s", "var2", pixel_names[i] ); \
+        used_asm = 1; \
+        res_c   = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \
+        res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \
+        if( res_c != res_asm || ssd_c != ssd_asm ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
+        } \
      }
  
+    ok = 1; used_asm = 0;
+    TEST_PIXEL_VAR2( PIXEL_8x16 );
+    TEST_PIXEL_VAR2( PIXEL_8x8 );
      report( "pixel var2 :" );
  
      ok = 1; used_asm = 0;
@@ -490,12 +495,14 @@ static int check_pixel( int cpu_ref, int cpu_new )
      memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
      ok = 1; used_asm = 0;
      TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
+    TEST_INTRA_X3( intra_satd_x3_8x16c, 0 );
      TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
      TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
      TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
      report( "intra satd_x3 :" );
      ok = 1; used_asm = 0;
      TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
+    TEST_INTRA_X3( intra_sad_x3_8x16c, 0 );
      TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
      TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
      TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
@@ -597,7 +604,7 @@ static int check_dct( int cpu_ref, int cpu_new )
      ALIGNED_16( dctcoef dct2[16][16] );
      ALIGNED_16( dctcoef dct4[16][16] );
      ALIGNED_16( dctcoef dct8[4][64] );
-    ALIGNED_16( dctcoef dctdc[2][4] );
+    ALIGNED_16( dctcoef dctdc[2][8] );
      x264_t h_buf;
      x264_t *h = &h_buf;
  
@@ -671,6 +678,7 @@ static int check_dct( int cpu_ref, int cpu_new )
      TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
      TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
      TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
+    TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 );
      TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
      report( "sub_dct4 :" );
  
@@ -757,6 +765,36 @@ static int check_dct( int cpu_ref, int cpu_new )
      TEST_DCTDC( idct4x4dc );
  #undef TEST_DCTDC
  
+#define TEST_DCTDC_CHROMA( name )\
+    ok = 1; used_asm = 0;\
+    if( dct_asm.name != dct_ref.name )\
+    {\
+        set_func_name( #name );\
+        used_asm = 1;\
+        uint16_t *p = (uint16_t*)buf1;\
+        for( int i = 0; i < 16 && ok; i++ )\
+        {\
+            for( int j = 0; j < 8; j++ )\
+                dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
+                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
+                           : ((*p++)&0x1fff)-0x1000; /* general case */\
+            memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\
+            call_c1( dct_c.name, dctdc[0], dct1 );\
+            call_a1( dct_asm.name, dctdc[1], dct2 );\
+            if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\
+            {\
+                ok = 0;\
+                fprintf( stderr, #name " [FAILED]\n" ); \
+            }\
+        }\
+        call_c2( dct_c.name, dctdc[0], dct1 );\
+        call_a2( dct_asm.name, dctdc[1], dct2 );\
+    }\
+    report( #name " :" );
+
+    TEST_DCTDC_CHROMA( dct2x4dc );
+#undef TEST_DCTDC_CHROMA
+
      x264_zigzag_function_t zigzag_c[2];
      x264_zigzag_function_t zigzag_ref[2];
      x264_zigzag_function_t zigzag_asm[2];
@@ -986,7 +1024,7 @@ static int check_mc( int cpu_ref, int cpu_new )
  #define MC_TEST_AVG( name, weight ) \
  { \
      ok = 1, used_asm = 0; \
-    for( int i = 0; i < 10; i++ ) \
+    for( int i = 0; i < 12; i++ ) \
      { \
          memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
          memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
@@ -1085,34 +1123,49 @@ static int check_mc( int cpu_ref, int cpu_new )
      report( "mc offsetsub :" );
  
      ok = 1; used_asm = 0;
-    if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 )
-    {
-        set_func_name( "store_interleave_8x8x2" );
-        used_asm = 1;
-        memset( pbuf3, 0, 64*8 );
-        memset( pbuf4, 0, 64*8 );
-        call_c( mc_c.store_interleave_8x8x2, pbuf3, 64, pbuf1, pbuf1+16 );
-        call_a( mc_a.store_interleave_8x8x2, pbuf4, 64, pbuf1, pbuf1+16 );
-        if( memcmp( pbuf3, pbuf4, 64*8 ) )
-            ok = 0;
-    }
-    if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc )
-    {
-        set_func_name( "load_deinterleave_8x8x2_fenc" );
-        used_asm = 1;
-        call_c( mc_c.load_deinterleave_8x8x2_fenc, pbuf3, pbuf1, 64 );
-        call_a( mc_a.load_deinterleave_8x8x2_fenc, pbuf4, pbuf1, 64 );
-        if( memcmp( pbuf3, pbuf4, FENC_STRIDE*8 ) )
-            ok = 0;
-    }
-    if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec )
+    for( int height = 8; height <= 16; height += 8 )
      {
-        set_func_name( "load_deinterleave_8x8x2_fdec" );
-        used_asm = 1;
-        call_c( mc_c.load_deinterleave_8x8x2_fdec, pbuf3, pbuf1, 64 );
-        call_a( mc_a.load_deinterleave_8x8x2_fdec, pbuf4, pbuf1, 64 );
-        if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*8 ) )
-            ok = 0;
+        if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma )
+        {
+            set_func_name( "store_interleave_chroma" );
+            used_asm = 1;
+            memset( pbuf3, 0, 64*height );
+            memset( pbuf4, 0, 64*height );
+            call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height );
+            call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height );
+            if( memcmp( pbuf3, pbuf4, 64*height ) )
+            {
+                ok = 0;
+                fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height );
+                break;
+            }
+        }
+        if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc )
+        {
+            set_func_name( "load_deinterleave_chroma_fenc" );
+            used_asm = 1;
+            call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height );
+            call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height );
+            if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
+            {
+                ok = 0;
+                fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height );
+                break;
+            }
+        }
+        if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec )
+        {
+            set_func_name( "load_deinterleave_chroma_fdec" );
+            used_asm = 1;
+            call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, 64, height );
+            call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height );
+            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
+            {
+                ok = 0;
+                fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height );
+                break;
+            }
+        }
      }
      report( "store_interleave :" );
  
@@ -1411,11 +1464,13 @@ static int check_deblock( int cpu_ref, int cpu_new )
  
      TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
      TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
-    TEST_DEBLOCK( deblock_chroma[0], 0, tcs[i] );
+    TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
+    TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
      TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
      TEST_DEBLOCK( deblock_luma_intra[0], 0 );
      TEST_DEBLOCK( deblock_luma_intra[1], 1 );
-    TEST_DEBLOCK( deblock_chroma_intra[0], 0 );
+    TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
+    TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
      TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
  
      if( db_a.deblock_strength != db_ref.deblock_strength )
@@ -1471,6 +1526,8 @@ static int check_quant( int cpu_ref, int cpu_new )
      x264_quant_function_t qf_a;
      ALIGNED_16( dctcoef dct1[64] );
      ALIGNED_16( dctcoef dct2[64] );
+    ALIGNED_16( dctcoef dct3[8][16] );
+    ALIGNED_16( dctcoef dct4[8][16] );
      ALIGNED_16( uint8_t cqm_buf[64] );
      int ret = 0, ok, used_asm;
      int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
@@ -1602,7 +1659,7 @@ static int check_quant( int cpu_ref, int cpu_new )
              for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
              { \
                  INIT_QUANT##w(1) \
-                call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                  memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                  call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                  call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
@@ -1631,7 +1688,7 @@ static int check_quant( int cpu_ref, int cpu_new )
              { \
                  for( int i = 0; i < 16; i++ ) \
                      dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
-                call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
+                qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
                  memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                  call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                  call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
@@ -1647,27 +1704,75 @@ static int check_quant( int cpu_ref, int cpu_new )
  
          TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
  
-#define TEST_OPTIMIZE_CHROMA_DC( qname, optname, w ) \
+        if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc )
+        {
+            set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
+            used_asms[1] = 1;
+            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- )
+            {
+                for( int i = 0; i < 8; i++ )
+                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
+                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+                call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 );
+                call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 );
+                for( int i = 0; i < 8; i++ )
+                    if( dct3[i][0] != dct4[i][0] )
+                    {
+                        oks[1] = 0;
+                        fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
+                        break;
+                    }
+            }
+        }
+
+        if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly )
+        {
+            set_func_name( "idct_dequant_2x4_dconly_%s", i_cqm?"cqm":"flat" ); /* distinct name from the idct_dequant_2x4_dc test */
+            used_asms[1] = 1;
+            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- )
+            {
+                for( int i = 0; i < 8; i++ )
+                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
+                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+                memcpy( dct2, dct1, 8*sizeof(dctcoef) );
+                call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
+                call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
+                if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) )
+                {
+                    oks[1] = 0;
+                    fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
+                    break;
+                }
+                call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
+                call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
+            }
+        }
+
+#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \
          if( qf_a.optname != qf_ref.optname ) \
          { \
              set_func_name( #optname ); \
              used_asms[2] = 1; \
              for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
              { \
-                int dmf = h->dequant4_mf[CQM_4IC][qp%6][0] << qp/6; \
+                int qpdc = qp + (size == 8 ? 3 : 0); \
+                int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \
                  if( dmf > 32*64 ) \
                      continue; \
-                for( int i = 16; ; i <<= 1 )\
+                for( int i = 16; ; i <<= 1 ) \
                  { \
                      int res_c, res_asm; \
                      int max = X264_MIN( i, PIXEL_MAX*16 ); \
-                    for( int j = 0; j < w*w; j++ ) \
+                    for( int j = 0; j < size; j++ ) \
                          dct1[j] = rand()%(max*2+1) - max; \
-                    call_c1( qf_c.qname, dct1, h->quant4_mf[CQM_4IC][qp][0]>>1, h->quant4_bias[CQM_4IC][qp][0]>>1 ); \
-                    memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
+                    for( int j = 0; j < size; j += 4 ) /* one quant_2x2_dc call per group of 4 coefficients */ \
+                        qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
+                    memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
                      res_c   = call_c1( qf_c.optname, dct1, dmf ); \
                      res_asm = call_a1( qf_a.optname, dct2, dmf ); \
-                    if( res_c != res_asm || memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
+                    if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \
                      { \
                          oks[2] = 0; \
                          fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
@@ -1680,7 +1785,8 @@ static int check_quant( int cpu_ref, int cpu_new )
              } \
          }
  
-        TEST_OPTIMIZE_CHROMA_DC( quant_2x2_dc, optimize_chroma_dc, 2 );
+        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 );
+        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 );
  
          x264_cqm_delete( h );
      }
@@ -1751,7 +1857,7 @@ static int check_quant( int cpu_ref, int cpu_new )
      TEST_DECIMATE( decimate_score15, 4, 1, 7 );
      report( "decimate_score :" );
  
-#define TEST_LAST( last, lastname, w, ac ) \
+#define TEST_LAST( last, lastname, size, ac ) \
      if( qf_a.last != qf_ref.last ) \
      { \
          set_func_name( #lastname ); \
@@ -1759,8 +1865,8 @@ static int check_quant( int cpu_ref, int cpu_new )
          for( int i = 0; i < 100; i++ ) \
          { \
              int nnz = 0; \
-            int max = rand() & (w*w-1); \
-            memset( dct1, 0, w*w*sizeof(dctcoef) ); \
+            int max = rand() & (size-1); \
+            memset( dct1, 0, size*sizeof(dctcoef) ); \
              for( int idx = ac; idx < max; idx++ ) \
                  nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
              if( !nnz ) \
@@ -1777,13 +1883,14 @@ static int check_quant( int cpu_ref, int cpu_new )
      }
  
      ok = 1; used_asm = 0;
-    TEST_LAST( coeff_last[DCT_CHROMA_DC],  coeff_last4, 2, 0 );
-    TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 4, 1 );
-    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
-    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
+    TEST_LAST( coeff_last4              , coeff_last4,   4, 0 );
+    TEST_LAST( coeff_last8              , coeff_last8,   8, 0 );
+    TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 16, 1 );
+    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
+    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
      report( "coeff_last :" );
  
-#define TEST_LEVELRUN( lastname, name, w, ac ) \
+#define TEST_LEVELRUN( lastname, name, size, ac ) \
      if( qf_a.lastname != qf_ref.lastname ) \
      { \
          set_func_name( #name ); \
@@ -1792,8 +1899,8 @@ static int check_quant( int cpu_ref, int cpu_new )
          { \
              x264_run_level_t runlevel_c, runlevel_a; \
              int nnz = 0; \
-            int max = rand() & (w*w-1); \
-            memset( dct1, 0, w*w*sizeof(dctcoef) ); \
+            int max = rand() & (size-1); \
+            memset( dct1, 0, size*sizeof(dctcoef) ); \
              memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
              memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
              for( int idx = ac; idx < max; idx++ ) \
@@ -1814,9 +1921,10 @@ static int check_quant( int cpu_ref, int cpu_new )
      }
  
      ok = 1; used_asm = 0;
-    TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC],  coeff_level_run4, 2, 0 );
-    TEST_LEVELRUN( coeff_level_run[  DCT_LUMA_AC], coeff_level_run15, 4, 1 );
-    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+    TEST_LEVELRUN( coeff_level_run4              , coeff_level_run4,   4, 0 );
+    TEST_LEVELRUN( coeff_level_run8              , coeff_level_run8,   8, 0 );
+    TEST_LEVELRUN( coeff_level_run[  DCT_LUMA_AC], coeff_level_run15, 16, 1 );
+    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
      report( "coeff_level_run :" );
  
      return ret;
@@ -1832,6 +1940,7 @@ static int check_intra( int cpu_ref, int cpu_new )
      {
          x264_predict_t      predict_16x16[4+3];
          x264_predict_t      predict_8x8c[4+3];
+        x264_predict_t      predict_8x16c[4+3];
          x264_predict8x8_t   predict_8x8[9+3];
          x264_predict_t      predict_4x4[9+3];
          x264_predict_8x8_filter_t predict_8x8_filter;
@@ -1839,16 +1948,19 @@ static int check_intra( int cpu_ref, int cpu_new )
  
      x264_predict_16x16_init( 0, ip_c.predict_16x16 );
      x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
+    x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
      x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
      x264_predict_4x4_init( 0, ip_c.predict_4x4 );
  
      x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
      x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
+    x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
      x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
      x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
  
      x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
      x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
+    x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
      x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
      x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
  
@@ -1856,7 +1968,7 @@ static int check_intra( int cpu_ref, int cpu_new )
  
      ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
  
-#define INTRA_TEST( name, dir, w, align, bench, ... )\
+#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
      if( ip_a.name[dir] != ip_ref.name[dir] )\
      {\
          set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
@@ -1874,7 +1986,7 @@ static int check_intra( int cpu_ref, int cpu_new )
                  for( int k = -1; k < 16; k++ )\
                      printf( "%2x ", edge[16+k] );\
                  printf( "\n" );\
-                for( int j = 0; j < w; j++ )\
+                for( int j = 0; j < h; j++ )\
                  {\
                      printf( "%2x ", edge[14-j] );\
                      for( int k = 0; k < w; k++ )\
@@ -1882,7 +1994,7 @@ static int check_intra( int cpu_ref, int cpu_new )
                      printf( "\n" );\
                  }\
                  printf( "\n" );\
-                for( int j = 0; j < w; j++ )\
+                for( int j = 0; j < h; j++ )\
                  {\
                      printf( "   " );\
                      for( int k = 0; k < w; k++ )\
@@ -1895,13 +2007,15 @@ static int check_intra( int cpu_ref, int cpu_new )
      }
  
      for( int i = 0; i < 12; i++ )
-        INTRA_TEST(   predict_4x4, i,  4,  4, );
+        INTRA_TEST(   predict_4x4, i,  4,  4,  4, );
+    for( int i = 0; i < 7; i++ )
+        INTRA_TEST(  predict_8x8c, i,  8,  8, 16, );
      for( int i = 0; i < 7; i++ )
-        INTRA_TEST(  predict_8x8c, i,  8, 16, );
+        INTRA_TEST( predict_8x16c, i,  8, 16, 16, );
      for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_16x16, i, 16, 16, );
+        INTRA_TEST( predict_16x16, i, 16, 16, 16, );
      for( int i = 0; i < 12; i++ )
-        INTRA_TEST(   predict_8x8, i,  8,  8, , edge );
+        INTRA_TEST(   predict_8x8, i,  8,  8,  8, , edge );
  
      set_func_name("intra_predict_8x8_filter");
      if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
@@ -1926,31 +2040,33 @@ static int check_intra( int cpu_ref, int cpu_new )
          }
      }
  
-#define EXTREMAL_PLANE(size) \
+#define EXTREMAL_PLANE( w, h ) \
      { \
          int max[7]; \
          for( int j = 0; j < 7; j++ ) \
              max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
          fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
-        for( int j = 0; j < size/2; j++ ) \
+        for( int j = 0; j < w/2; j++ ) \
              fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
-        for( int j = size/2; j < size-1; j++ ) \
+        for( int j = w/2; j < w-1; j++ ) \
              fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
-        fdec[48+(size-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
-        for( int j = 0; j < size/2; j++ ) \
+        fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
+        for( int j = 0; j < h/2; j++ ) \
              fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
-        for( int j = size/2; j < size-1; j++ ) \
+        for( int j = h/2; j < h-1; j++ ) \
              fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
-        fdec[48+(size-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
+        fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
      }
      /* Extremal test case for planar prediction. */
      for( int test = 0; test < 100 && ok; test++ )
          for( int i = 0; i < 128 && ok; i++ )
          {
-            EXTREMAL_PLANE(  8 );
-            INTRA_TEST(  predict_8x8c, I_PRED_CHROMA_P,  8, 64, 1 );
-            EXTREMAL_PLANE( 16 );
-            INTRA_TEST( predict_16x16,  I_PRED_16x16_P, 16, 64, 1 );
+            EXTREMAL_PLANE(  8,  8 );
+            INTRA_TEST(  predict_8x8c, I_PRED_CHROMA_P,  8,  8, 64, 1 );
+            EXTREMAL_PLANE(  8, 16 );
+            INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P,  8, 16, 64, 1 );
+            EXTREMAL_PLANE( 16, 16 );
+            INTRA_TEST( predict_16x16,  I_PRED_16x16_P, 16, 16, 64, 1 );
          }
      report( "intra pred :" );
      return ret;
diff --git a/x264.c b/x264.c

index 72399569f599f6d8770e1cfb3f789090ecc9aa99..025bc767a010921fa72dfff9b08ea0dcca94d128 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -121,7 +121,7 @@ static const char * const muxer_names[] =
  
  static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
  static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-static const char * const output_csp_names[] = { "i420", "i444", "rgb", 0 };
+static const char * const output_csp_names[] = { "i420", "i422", "i444", "rgb", 0 };
  
  typedef struct
  {
@@ -1131,6 +1131,8 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
      int csp = info->csp & X264_CSP_MASK;
      if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
          param->i_csp = X264_CSP_I420;
+    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
+        param->i_csp = X264_CSP_I422;
      else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
          param->i_csp = X264_CSP_I444;
      else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
@@ -1355,7 +1357,8 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
              case OPT_OUTPUT_CSP:
                  FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
                  // correct the parsed value to the libx264 csp value
-                output_csp = !output_csp ? X264_CSP_I420 : (output_csp == 1 ? X264_CSP_I444 : X264_CSP_RGB);
+                static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
+                param->i_csp = output_csp = output_csp_fix[output_csp];
                  break;
              default:
  generic_option:
diff --git a/x264.h b/x264.h

index 2cdcfb7c9f33ceaa5f44bbce9eb658c87db76de7..34ad872c888ac6fd90cf1882bba90f039a1f2432 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
  
  #include "x264_config.h"
  
-#define X264_BUILD 117
+#define X264_BUILD 118
  
  /* x264_t:
   *      opaque handler for encoder */
@@ -181,12 +181,15 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
  #define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
  #define X264_CSP_YV12           0x0002  /* yvu 4:2:0 planar */
  #define X264_CSP_NV12           0x0003  /* yuv 4:2:0, with one y plane and one packed u+v */
-#define X264_CSP_I444           0x0004  /* yuv 4:4:4 planar */
-#define X264_CSP_YV24           0x0005  /* yvu 4:4:4 planar */
-#define X264_CSP_BGR            0x0006  /* packed bgr 24bits   */
-#define X264_CSP_BGRA           0x0007  /* packed bgr 32bits   */
-#define X264_CSP_RGB            0x0008  /* packed rgb 24bits   */
-#define X264_CSP_MAX            0x0009  /* end of list */
+#define X264_CSP_I422           0x0004  /* yuv 4:2:2 planar */
+#define X264_CSP_YV16           0x0005  /* yvu 4:2:2 planar */
+#define X264_CSP_NV16           0x0006  /* yuv 4:2:2, with one y plane and one packed u+v */
+#define X264_CSP_I444           0x0007  /* yuv 4:4:4 planar */
+#define X264_CSP_YV24           0x0008  /* yvu 4:4:4 planar */
+#define X264_CSP_BGR            0x0009  /* packed bgr 24bits   */
+#define X264_CSP_BGRA           0x000a  /* packed bgr 32bits   */
+#define X264_CSP_RGB            0x000b  /* packed rgb 24bits   */
+#define X264_CSP_MAX            0x000c  /* end of list */
  #define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
  #define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */
  
@@ -242,7 +245,7 @@ typedef struct x264_param_t
      /* Video Properties */
      int         i_width;
      int         i_height;
-    int         i_csp;  /* CSP of encoded bitstream, only i420 supported */
+    int         i_csp;         /* CSP of encoded bitstream */
      int         i_level_idc;
      int         i_frame_total; /* number of frames to encode if known, else 0 */
  
@@ -579,7 +582,7 @@ void    x264_param_apply_fastfirstpass( x264_param_t * );
  /* x264_param_apply_profile:
   *      Applies the restrictions of the given profile.
   *      Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 };
+static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", "high422", "high444", 0 };
  
  /*      (can be NULL, in which case the function will do nothing)
   *
author	Henrik Gramner <hengar-6@student.ltu.se>
	Fri, 26 Aug 2011 13:57:04 +0000 (15:57 +0200)
committer	Fiona Glaser <fiona@x264.com>
	Wed, 21 Sep 2011 16:54:44 +0000 (09:54 -0700)
AUTHORS		patch \| blob \| history
common/bitstream.h		patch \| blob \| history
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/dct.c		patch \| blob \| history
common/dct.h		patch \| blob \| history
common/deblock.c		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/frame.h		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
common/mc.c		patch \| blob \| history
common/mc.h		patch \| blob \| history
common/pixel.c		patch \| blob \| history
common/pixel.h		patch \| blob \| history
common/predict.c		patch \| blob \| history
common/predict.h		patch \| blob \| history
common/quant.c		patch \| blob \| history
common/quant.h		patch \| blob \| history
common/set.h		patch \| blob \| history
common/vlc.c		patch \| blob \| history
common/x86/mc-a2.asm		patch \| blob \| history
common/x86/mc-c.c		patch \| blob \| history
common/x86/quant-a.asm		patch \| blob \| history
common/x86/quant.h		patch \| blob \| history
encoder/analyse.c		patch \| blob \| history
encoder/cabac.c		patch \| blob \| history
encoder/cavlc.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/macroblock.c		patch \| blob \| history
encoder/macroblock.h		patch \| blob \| history
encoder/me.c		patch \| blob \| history
encoder/ratecontrol.c		patch \| blob \| history
encoder/rdo.c		patch \| blob \| history
encoder/set.c		patch \| blob \| history
encoder/slicetype.c		patch \| blob \| history
filters/video/depth.c		patch \| blob \| history
filters/video/resize.c		patch \| blob \| history
input/avs.c		patch \| blob \| history
input/input.c		patch \| blob \| history
input/input.h		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history
x264.c		patch \| blob \| history
x264.h		patch \| blob \| history