x86: Add asm for mbtree fixed point conversion

[x264] / common / macroblock.c
diff --git a/common/macroblock.c b/common/macroblock.c

index 9f97aa0c047e1171e74b193942eb4162475c1718..9c05f005e13bad609786a36670aeec6e3c5d9108 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1,11 +1,12 @@
  /*****************************************************************************
   * macroblock.c: macroblock common functions
   *****************************************************************************
- * Copyright (C) 2003-2011 x264 project
+ * Copyright (C) 2003-2016 x264 project
   *
   * Authors: Fiona Glaser <fiona@x264.com>
   *          Laurent Aimar <fenrir@via.ecp.fr>
   *          Loren Merritt <lorenm@u.washington.edu>
+ *          Henrik Gramner <henrik@gramner.com>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -50,23 +51,27 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
      }
      else
      {
-        // chroma is offset if MCing from a field of opposite parity
-        if( MB_INTERLACED & i_ref )
+        int v_shift = CHROMA_V_SHIFT;
+        // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
+        if( v_shift & MB_INTERLACED & i_ref )
              mvy += (h->mb.i_mb_y & 1)*4 - 2;
  
-        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
-                         &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+        height = 4*height >> v_shift;
+
+        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
+                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                           h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
-                         mvx, mvy, 2*width, 2*height );
+                         mvx, 2*mvy>>v_shift, 2*width, height );
  
          if( h->sh.weight[i_ref][1].weightfn )
-            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->sh.weight[i_ref][1], height*2 );
+            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
+                                                       &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
+                                                       &h->sh.weight[i_ref][1], height );
          if( h->sh.weight[i_ref][2].weightfn )
-            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                       &h->sh.weight[i_ref][2],height*2 );
+            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
+                                                       &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
+                                                       &h->sh.weight[i_ref][2], height );
      }
  }
  static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
@@ -85,13 +90,15 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
      }
      else
      {
-        if( MB_INTERLACED & i_ref )
+        int v_shift = CHROMA_V_SHIFT;
+        if( v_shift & MB_INTERLACED & i_ref )
              mvy += (h->mb.i_mb_y & 1)*4 - 2;
  
-        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
-                         &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
+                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                           h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
-                         mvx, mvy, 2*width, 2*height );
+                         mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift );
      }
  }
  
@@ -114,9 +121,9 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
      int mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
      int mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
      int i_mode = x264_size2pixel[height][width];
-    int i_stride0 = 16, i_stride1 = 16;
-    ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
-    ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+    intptr_t i_stride0 = 16, i_stride1 = 16;
+    ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
+    ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
      pixel *src0, *src1;
  
      MC_LUMA_BI( 0 );
@@ -128,17 +135,21 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
      }
      else
      {
-        if( MB_INTERLACED & i_ref0 )
+        int v_shift = CHROMA_V_SHIFT;
+        if( v_shift & MB_INTERLACED & i_ref0 )
              mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
-        if( MB_INTERLACED & i_ref1 )
+        if( v_shift & MB_INTERLACED & i_ref1 )
              mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
  
          h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
-                         mvx0, mvy0, 2*width, 2*height );
+                         mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift );
          h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
-                         mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
+                         mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift );
+
+        int chromapix = h->luma2chroma_pixel[i_mode];
+        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0,   16, tmp1,   16, weight );
+        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
      }
  }
  
@@ -245,24 +256,26 @@ int x264_macroblock_cache_allocate( x264_t *h )
  
      h->mb.b_interlaced = PARAM_INTERLACED;
  
-    CHECKED_MALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
-    CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
-    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+    PREALLOC_INIT
+
+    PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
+    PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
+    PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
+    PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
  
      /* 0 -> 3 top(4), 4 -> 6 : left(3) */
-    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
+    PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
  
      /* all coeffs */
-    CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
+    PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
  
      if( h->param.b_cabac )
      {
-        CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
-        CHECKED_MALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
+        PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
+        PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
+        PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
+        if( h->param.i_bframe )
+            PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
      }
  
      for( int i = 0; i < 2; i++ )
@@ -272,11 +285,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
              i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
  
          for( int j = !i; j < i_refs; j++ )
-        {
-            CHECKED_MALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
-            M32( h->mb.mvr[i][j][0] ) = 0;
-            h->mb.mvr[i][j]++;
-        }
+            PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
      }
  
      if( h->param.analyse.i_weighted_pred )
@@ -300,7 +309,9 @@ int x264_macroblock_cache_allocate( x264_t *h )
          }
          else
          {
-            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv);
+            /* Both ref and fenc are stored for 4:2:0 and 4:2:2, which means that 4:2:0 and 4:4:4
+             * need the same amount of space and 4:2:2 needs twice that much */
+            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv);
  
              if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
                  //smart can weight one ref and one offset -1 in 8-bit
@@ -311,7 +322,24 @@ int x264_macroblock_cache_allocate( x264_t *h )
          }
  
          for( int i = 0; i < numweightbuf; i++ )
-            CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
+            PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
+    }
+
+    PREALLOC_END( h->mb.base );
+
+    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+
+    for( int i = 0; i < 2; i++ )
+    {
+        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
+
+        for( int j = !i; j < i_refs; j++ )
+        {
+            M32( h->mb.mvr[i][j][0] ) = 0;
+            h->mb.mvr[i][j]++;
+        }
      }
  
      return 0;
@@ -320,44 +348,32 @@ fail:
  }
  void x264_macroblock_cache_free( x264_t *h )
  {
-    for( int i = 0; i < 2; i++ )
-        for( int j = !i; j < X264_REF_MAX*2; j++ )
-            if( h->mb.mvr[i][j] )
-                x264_free( h->mb.mvr[i][j]-1 );
-    for( int i = 0; i < X264_REF_MAX; i++ )
-        x264_free( h->mb.p_weight_buf[i] );
-
-    if( h->param.b_cabac )
-    {
-        x264_free( h->mb.chroma_pred_mode );
-        x264_free( h->mb.mvd[0] );
-        x264_free( h->mb.mvd[1] );
-    }
-    x264_free( h->mb.slice_table );
-    x264_free( h->mb.intra4x4_pred_mode );
-    x264_free( h->mb.non_zero_count );
-    x264_free( h->mb.mb_transform_size );
-    x264_free( h->mb.skipbp );
-    x264_free( h->mb.cbp );
-    x264_free( h->mb.qp );
+    x264_free( h->mb.base );
  }
  
  int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
  {
      if( !b_lookahead )
      {
-        for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
+        for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
              for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
              {
-                /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
-                CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
+                CHECKED_MALLOC( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
                  h->intra_border_backup[i][j] += 16;
-                if( !PARAM_INTERLACED )
-                    h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
              }
          for( int i = 0; i <= PARAM_INTERLACED; i++ )
          {
-            CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
+            if( h->param.b_sliced_threads )
+            {
+                /* Only allocate the first one, and allocate it for the whole frame, because we
+                 * won't be deblocking until after the frame is fully encoded. */
+                if( h == h->thread[0] && !i )
+                    CHECKED_MALLOC( h->deblock_strength[0], sizeof(**h->deblock_strength) * h->mb.i_mb_count );
+                else
+                    h->deblock_strength[i] = h->thread[0]->deblock_strength[0];
+            }
+            else
+                CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
              h->deblock_strength[1] = h->deblock_strength[i];
          }
      }
@@ -366,20 +382,25 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
      int scratch_size = 0;
      if( !b_lookahead )
      {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
          int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
          int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
          int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
              ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
          scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
      }
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
      scratch_size = X264_MAX( scratch_size, buf_mbtree );
      if( scratch_size )
          CHECKED_MALLOC( h->scratch_buffer, scratch_size );
      else
          h->scratch_buffer = NULL;
  
+    int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
+    int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
+    scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
+    CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
+
      return 0;
  fail:
      return -1;
@@ -390,12 +411,14 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
      if( !b_lookahead )
      {
          for( int i = 0; i <= PARAM_INTERLACED; i++ )
-            x264_free( h->deblock_strength[i] );
-        for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
+            if( !h->param.b_sliced_threads || (h == h->thread[0] && !i) )
+                x264_free( h->deblock_strength[i] );
+        for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
              for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
                  x264_free( h->intra_border_backup[i][j] - 16 );
      }
      x264_free( h->scratch_buffer );
+    x264_free( h->scratch_buffer2 );
  }
  
  void x264_macroblock_slice_init( x264_t *h )
@@ -434,8 +457,6 @@ void x264_macroblock_slice_init( x264_t *h )
      }
      else if( h->sh.i_type == SLICE_TYPE_P )
      {
-        memset( h->mb.cache.skip, 0, sizeof( h->mb.cache.skip ) );
-
          if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
          {
              deblock_ref_table(-2) = -2;
@@ -491,6 +512,24 @@ void x264_macroblock_thread_init( x264_t *h )
                            (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
      h->mb.i_mb_prev_xy = -1;
  
+    /*          4:2:0                      4:2:2                      4:4:4
+     * fdec            fenc       fdec            fenc       fdec            fenc
+     * y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y
+     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+     * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+     * y Y Y Y Y       U U V V    y Y Y Y Y       U U V V    y Y Y Y Y       U U U U
+     * u u u   v v v   U U V V    u u u   v v v   U U V V    u u u u u u u   U U U U
+     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
+     * u U U   v V V              u U U   v V V   U U V V    u U U U U       U U U U
+     *                            u U U   v V V              u U U U U       V V V V
+     *                            u U U   v V V              u U U U U       V V V V
+     *                                                       v v v v v v v   V V V V
+     *                                                       v V V V V       V V V V
+     *                                                       v V V V V
+     *                                                       v V V V V
+     *                                                       v V V V V
+     */
      h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
      h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
      h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
@@ -500,16 +539,6 @@ void x264_macroblock_thread_init( x264_t *h )
          h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
          h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
      }
-    /* fdec:      fenc:
-     * yyyyyyy
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * uuu vvv    UUVV
-     * uUU vVV    UUVV
-     * uUU vVV
-     */
      else
      {
          h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
@@ -522,7 +551,7 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
      int stride_y  = fenc->i_stride[0];
      int stride_uv = fenc->i_stride[1];
      int off_y  = 16 * i_mb_x + 16 * i_mb_y * stride_y;
-    int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv;
+    int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> CHROMA_V_SHIFT);
      h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
                           fenc->plane[1]+off_uv, stride_uv, i_mb_x );
  }
@@ -537,14 +566,14 @@ NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
  static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
  {
      int mb_interlaced = b_mbaff && MB_INTERLACED;
-    int w = b_chroma ? 8 : 16;
+    int height = b_chroma ? 16 >> CHROMA_V_SHIFT : 16;
      int i_stride = h->fdec->i_stride[i];
      int i_stride2 = i_stride << mb_interlaced;
      int i_pix_offset = mb_interlaced
-                     ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
-                     : 16 * mb_x + w * mb_y * i_stride;
+                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+                     : 16 * mb_x + height * mb_y * i_stride;
      pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-    int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
+    int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : !(mb_y&1);
      pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
      int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
      /* ref_pix_offset[0] references the current field and [1] the opposite field. */
@@ -554,25 +583,21 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
      h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
      if( b_chroma )
      {
-        h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
+        h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
          memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
          memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
-        if( b_mbaff )
-        {
-            h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
-            h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
-        }
+        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
+        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
      }
      else
      {
          h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
          memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
-        if( b_mbaff )
-            h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
+        h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
      }
-    if( b_mbaff )
+    if( b_mbaff || h->mb.b_reencode_mb )
      {
-        for( int j = 0; j < w; j++ )
+        for( int j = 0; j < height; j++ )
              if( b_chroma )
              {
                  h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
@@ -845,6 +870,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
  
      const x264_left_table_t *left_index_table = h->mb.left_index_table;
  
+    h->mb.cache.deblock_strength = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?h->mb.i_mb_xy:mb_x];
+
      /* load cache */
      if( h->mb.i_neighbour & MB_TOP )
      {
@@ -854,8 +881,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
  
          /* load non_zero_count */
          CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16+4 + 8*CHROMA444] );
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32+4 + 8*CHROMA444] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>CHROMA_V_SHIFT)] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );
  
          /* Finish the prefetching */
          for( int l = 0; l < lists; l++ )
@@ -906,16 +933,17 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]];
          h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]];
  
-        if( CHROMA444 )
+        if( CHROMA_FORMAT >= CHROMA_422 )
          {
-            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16];
-            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16];
-            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32];
-            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32];
-            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32];
+            int offset = (4>>CHROMA_H_SHIFT) - 4;
+            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset];
+            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset];
          }
          else
          {
@@ -943,7 +971,7 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] =
          h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] =
          h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80;
-        if( CHROMA444 )
+        if( CHROMA_FORMAT >= CHROMA_422 )
          {
              h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] =
              h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] =
@@ -983,6 +1011,11 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          {
              x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
              x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
+            if( CHROMA_FORMAT == CHROMA_422 )
+            {
+                x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE );
+                x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE );
+            }
              x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 );
          }
      }
@@ -1125,7 +1158,7 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
              {
                  // Looking at the bottom field so always take the bottom macroblock of the pair.
                  h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]];
-                h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]];
+                h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]];
                  h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]];
                  CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] );
                  CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] );
@@ -1222,8 +1255,13 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
          }
      }
  
-    if( b_mbaff && mb_x == 0 && !(mb_y&1) && mb_y > 0 )
-        h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_xy - h->mb.i_mb_stride];
+    if( b_mbaff && mb_x == 0 && !(mb_y&1) )
+    {
+        if( h->mb.i_mb_top_xy >= h->sh.i_first_mb )
+            h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_top_xy];
+        else
+            h->mb.field_decoding_flag = 0;
+    }
  
      /* Check whether skip here would cause decoder to predict interlace mode incorrectly.
       * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */
@@ -1231,26 +1269,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
      if( b_mbaff )
      {
          if( MB_INTERLACED != h->mb.field_decoding_flag &&
-            h->mb.i_mb_prev_xy >= 0 && IS_SKIP(h->mb.type[h->mb.i_mb_prev_xy]) )
+            (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
              h->mb.b_allow_skip = 0;
-        if( (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
-        {
-            if( h->mb.i_neighbour & MB_LEFT )
-            {
-                if( h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
-                    h->mb.b_allow_skip = 0;
-            }
-            else if( h->mb.i_neighbour & MB_TOP )
-            {
-                if( h->mb.field[h->mb.i_mb_top_xy] != MB_INTERLACED )
-                    h->mb.b_allow_skip = 0;
-            }
-            else // Frame mb pair is predicted
-            {
-                if( MB_INTERLACED )
-                    h->mb.b_allow_skip = 0;
-            }
-        }
      }
  
      if( h->param.b_cabac )
@@ -1397,10 +1417,8 @@ static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][
  
                  if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && h->mb.mb_transform_size[mbn_xy] )
                  {
-                    int nnz_top0 = M16( &nnz[mbn_xy][8] ) | M16( &nnz[mbn_xy][12] );
-                    int nnz_top1 = M16( &nnz[mbn_xy][10] ) | M16( &nnz[mbn_xy][14] );
-                    nnz_top[0] = nnz_top[1] = nnz_top0 ? 0x0101 : 0;
-                    nnz_top[2] = nnz_top[3] = nnz_top1 ? 0x0101 : 0;
+                    nnz_top[0] = nnz_top[1] = M16( &nnz[mbn_xy][ 8] ) || M16( &nnz[mbn_xy][12] );
+                    nnz_top[2] = nnz_top[3] = M16( &nnz[mbn_xy][10] ) || M16( &nnz[mbn_xy][14] );
                  }
  
                  for( int i = 0; i < 4; i++ )
@@ -1415,24 +1433,30 @@ static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][
  
  void x264_macroblock_deblock_strength( x264_t *h )
  {
-    uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
+    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
      if( IS_INTRA( h->mb.i_type ) )
      {
-        memset( bs[0][1], 3, 3*4*sizeof(uint8_t) );
-        memset( bs[1][1], 3, 3*4*sizeof(uint8_t) );
+        M32( bs[0][1] ) = 0x03030303;
+        M64( bs[0][2] ) = 0x0303030303030303ULL;
+        M32( bs[1][1] ) = 0x03030303;
+        M64( bs[1][2] ) = 0x0303030303030303ULL;
          return;
      }
  
      /* Early termination: in this case, nnz guarantees all edges use strength 2.*/
-    if( h->mb.b_transform_8x8 && (h->mb.i_cbp_luma&7) == 7 && !CHROMA444 )
+    if( h->mb.b_transform_8x8 && !CHROMA444 )
      {
-        M32( bs[0][0] ) = 0x02020202;
-        M32( bs[0][2] ) = 0x02020202;
-        M32( bs[0][4] ) = 0x02020202;
-        M32( bs[1][0] ) = 0x02020202;
-        M32( bs[1][2] ) = 0x02020202;
-        M32( bs[1][4] ) = 0x02020202;
-        return;
+        int cbp_mask = 0xf >> CHROMA_V_SHIFT;
+        if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
+        {
+            M32( bs[0][0] ) = 0x02020202;
+            M32( bs[0][2] ) = 0x02020202;
+            M32( bs[0][4] ) = 0x02020202;
+            M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */
+            M64( bs[1][2] ) = 0x0202020202020202ULL;
+            M32( bs[1][4] ) = 0x02020202;
+            return;
+        }
      }
  
      int neighbour_changed = 0;
@@ -1595,14 +1619,14 @@ void x264_macroblock_deblock_strength( x264_t *h )
  
  static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
  {
-    int w = b_chroma ? 8 : 16;
+    int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
      int i_stride = h->fdec->i_stride[i];
      int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
      int i_pix_offset = (b_mbaff && MB_INTERLACED)
-                     ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
-                     : 16 * mb_x + w * mb_y * i_stride;
+                     ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+                     : 16 * mb_x + height * mb_y * i_stride;
      if( b_chroma )
-        h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
+        h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height );
      else
          h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 );
  }
@@ -1613,7 +1637,7 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
       * For progressive mbs this is the bottom two rows, and for interlaced the
       * bottom row of each field. We also store samples needed for the next
       * mbpair in intra_border_backup[2]. */
-    int backup_dst = !b_mbaff ? 0 : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
+    int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
      memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
      if( CHROMA444 )
      {
@@ -1622,8 +1646,9 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
      }
      else
      {
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7,   8*sizeof(pixel) );
-        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7,   8*sizeof(pixel) );
+        int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
      }
      if( b_mbaff )
      {
@@ -1639,20 +1664,13 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
              }
              else
              {
-                backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
+                if( CHROMA_FORMAT == CHROMA_420 )
+                    backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
                  memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src,  8*sizeof(pixel) );
                  memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src,  8*sizeof(pixel) );
              }
          }
      }
-    else
-    {
-        /* In progressive we update intra_border_backup in-place, so the topleft neighbor will
-         * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
-        h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
-        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7 + 8*CHROMA444];
-        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7 + 8*CHROMA444];
-    }
  }
  
  void x264_macroblock_cache_save( x264_t *h )
@@ -1744,7 +1762,7 @@ void x264_macroblock_cache_save( x264_t *h )
      CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
      CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
      CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
-    if( CHROMA444 )
+    if( CHROMA_FORMAT >= CHROMA_422 )
      {
          CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
          CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
@@ -1809,7 +1827,7 @@ void x264_macroblock_cache_save( x264_t *h )
          uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
          uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
          if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
-            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
+            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
          else
              h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;