cosmetics

[x264] / encoder / analyse.c
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 99238651a639e8ec9f13e1a9f6567b2748344b6a..4e364a7a3da98b9049c06659de8877e3340245ec 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1,10 +1,11 @@
  /*****************************************************************************
   * analyse.c: h264 encoder library
   *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
+ * Copyright (C) 2003 x264 project
   * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   *
   * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -32,6 +33,8 @@
  #include "macroblock.h"
  #include "me.h"
  #include "ratecontrol.h"
+#include "analyse.h"
+#include "rdo.c"
  
  typedef struct
  {
@@ -69,13 +72,16 @@ typedef struct
  {
      /* conduct the analysis using this lamda and QP */
      int i_lambda;
+    int i_lambda2;
      int i_qp;
      int16_t *p_cost_mv;
+    int b_mbrd;
  
  
      /* I: Intra part */
      /* Take some shortcuts in intra search if intra is deemed unlikely */
      int b_fast_intra;
+    int i_best_satd;
  
      /* Luma part */
      int i_sad_i16x16;
@@ -111,6 +117,7 @@ typedef struct
  
  } x264_mb_analysis_t;
  
+/* lambda = pow(2,qp/6-2) */
  static const int i_qp0_cost_table[52] = {
     1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
     1, 1, 1, 1,              /*  8-11 */
@@ -121,6 +128,19 @@ static const int i_qp0_cost_table[52] = {
    40,45,51,57,64,72,81,91   /* 44-51 */
  };
  
+/* lambda2 = pow(lambda,2) * .9 with lambda = pow(2,qp/6-2); indexed by QP, entries never drop below 1 */
+static const int i_qp0_cost2_table[52] = {
+   1,   1,   1,   1,   1,   1, /*  0-5  */
+   1,   1,   1,   1,   1,   1, /*  6-11 */
+   1,   1,   1,   2,   2,   3, /* 12-17 */
+   4,   5,   6,   7,   9,  11, /* 18-23 */
+  14,  18,  23,  29,  36,  46, /* 24-29 */
+  58,  73,  91, 115, 145, 183, /* 30-35 */
+ 230, 290, 366, 461, 581, 731, /* 36-41 */
+ 922,1161,1463,1843,2322,2926, /* 42-47 */
+3686,4645,5852,7373            /* 48-51 */
+};
+
  static const uint8_t block_idx_x[16] = {
      0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
  };
@@ -132,8 +152,8 @@ static const uint8_t block_idx_y[16] = {
  static const int i_mb_b_cost_table[19] = {
      9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
  };
-static const int i_mb_b16x8_cost_table[16] = {
-    0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
+static const int i_mb_b16x8_cost_table[17] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
  };
  static const int i_sub_mb_b_cost_table[13] = {
      7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
@@ -142,6 +162,8 @@ static const int i_sub_mb_p_cost_table[4] = {
      5, 3, 3, 1
  };
  
+static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
+
  /* initialize an array of lambda*nbits for all possible mvs */
  static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
  {
@@ -171,12 +193,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
      /* conduct the analysis using this lamda and QP */
      a->i_qp = i_qp;
      a->i_lambda = i_qp0_cost_table[i_qp];
+    a->i_lambda2 = i_qp0_cost2_table[i_qp];
+    a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 && h->sh.i_type != SLICE_TYPE_B;
  
      h->mb.i_me_method = h->param.analyse.i_me_method;
      h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
      h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                          && h->mb.i_subpel_refine >= 5;
-    a->b_fast_intra = 0;
  
      h->mb.b_transform_8x8 = 0;
  
@@ -186,6 +209,9 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
      a->i_sad_i4x4   =
      a->i_sad_i8x8chroma = COST_MAX;
  
+    a->b_fast_intra = 0;
+    a->i_best_satd = COST_MAX;
+
      /* II: Inter part P/B frame */
      if( h->sh.i_type != SLICE_TYPE_I )
      {
@@ -245,7 +271,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
          /* Fast intra decision */
          if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
          {
-            if(   IS_INTRA( h->mb.i_mb_type_left )
+            if( a->b_mbrd
+               || IS_INTRA( h->mb.i_mb_type_left )
                 || IS_INTRA( h->mb.i_mb_type_top )
                 || IS_INTRA( h->mb.i_mb_type_topleft )
                 || IS_INTRA( h->mb.i_mb_type_topright )
@@ -337,23 +364,21 @@ static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mod
  static void predict_4x4_mode_available( unsigned int i_neighbour,
                                          int *mode, int *pi_count )
  {
-    /* FIXME even when b_tr == 0 there is some case where missing pixels
-     * are emulated and thus more mode are available TODO
-     * analysis and encode should be fixed too */
      int b_l = i_neighbour & MB_LEFT;
      int b_t = i_neighbour & MB_TOP;
-    int b_tr = i_neighbour & MB_TOPRIGHT;
  
      if( b_l && b_t )
      {
          *mode++ = I_PRED_4x4_DC;
          *mode++ = I_PRED_4x4_H;
          *mode++ = I_PRED_4x4_V;
+        *mode++ = I_PRED_4x4_DDL;
          *mode++ = I_PRED_4x4_DDR;
          *mode++ = I_PRED_4x4_VR;
          *mode++ = I_PRED_4x4_HD;
+        *mode++ = I_PRED_4x4_VL;
          *mode++ = I_PRED_4x4_HU;
-        *pi_count = 7;
+        *pi_count = 9;
      }
      else if( b_l )
      {
@@ -366,34 +391,84 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
      {
          *mode++ = I_PRED_4x4_DC_TOP;
          *mode++ = I_PRED_4x4_V;
-        *pi_count = 2;
+        *mode++ = I_PRED_4x4_DDL;
+        *mode++ = I_PRED_4x4_VL;
+        *pi_count = 4;
      }
      else
      {
          *mode++ = I_PRED_4x4_DC_128;
          *pi_count = 1;
      }
+}
  
-    if( b_t && b_tr )
+static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
+{
+    int i;
+
+    int i_max;
+    int predict_mode[9];
+
+    uint8_t *p_dstc[2], *p_srcc[2];
+    int      i_stride[2];
+
+    if( a->i_sad_i8x8chroma < COST_MAX )
+        return;
+
+    /* 8x8 prediction selection for chroma */
+    p_dstc[0] = h->mb.pic.p_fdec[1];
+    p_dstc[1] = h->mb.pic.p_fdec[2];
+    p_srcc[0] = h->mb.pic.p_fenc[1];
+    p_srcc[1] = h->mb.pic.p_fenc[2];
+
+    i_stride[0] = h->mb.pic.i_stride[1];
+    i_stride[1] = h->mb.pic.i_stride[2];
+
+    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    a->i_sad_i8x8chroma = COST_MAX;
+    for( i = 0; i < i_max; i++ )
      {
-        *mode++ = I_PRED_4x4_DDL;
-        *mode++ = I_PRED_4x4_VL;
-        (*pi_count) += 2;
+        int i_sad;
+        int i_mode;
+
+        i_mode = predict_mode[i];
+
+        /* we do the prediction */
+        h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
+        h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
+
+        /* we calculate the cost */
+        i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], i_stride[0],
+                                          p_srcc[0], i_stride[0] ) +
+                h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], i_stride[1],
+                                          p_srcc[1], i_stride[1] ) +
+                a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
+
+        /* if i_score is lower it is better */
+        if( a->i_sad_i8x8chroma > i_sad )
+        {
+            a->i_predict8x8chroma = i_mode;
+            a->i_sad_i8x8chroma   = i_sad;
+        }
      }
+
+    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
  }
  
-static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter )
+static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_inter )
  {
      const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
      const int i_stride = h->mb.pic.i_stride[0];
      uint8_t  *p_src = h->mb.pic.p_fenc[0];
      uint8_t  *p_dst = h->mb.pic.p_fdec[0];
+    int      f8_satd_rd_ratio = 0;
  
      int i, idx;
-
      int i_max;
      int predict_mode[9];
  
+    const int i_satd_thresh = a->i_best_satd * 5/4 + a->i_lambda * 10;
+
      /*---------------- Try all mode and calculate their score ---------------*/
  
      /* 16x16 prediction selection */
@@ -404,34 +479,45 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
          int i_mode;
  
          i_mode = predict_mode[i];
-
-        /* we do the prediction */
          h->predict_16x16[i_mode]( p_dst, i_stride );
  
-        /* we calculate the diff and get the square sum of the diff */
-        i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
-                res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
-        /* if i_score is lower it is better */
-        if( res->i_sad_i16x16 > i_sad )
+        i_sad = h->pixf.mbcmp[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
+                a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
+        if( a->i_sad_i16x16 > i_sad )
          {
-            res->i_predict16x16 = i_mode;
-            res->i_sad_i16x16     = i_sad;
+            a->i_predict16x16 = i_mode;
+            a->i_sad_i16x16   = i_sad;
          }
      }
-    /* cavlc mb type prefix */
-    if( h->sh.i_type == SLICE_TYPE_B )
-        res->i_sad_i16x16 += res->i_lambda * i_mb_b_cost_table[I_16x16];
  
-    if( res->b_fast_intra )
+    if( a->b_mbrd )
+    {
+        f8_satd_rd_ratio = ((unsigned)i_cost_inter << 8) / a->i_best_satd + 1;
+        x264_mb_analyse_intra_chroma( h, a );
+        if( h->mb.b_chroma_me )
+            a->i_sad_i16x16 += a->i_sad_i8x8chroma;
+        if( a->i_sad_i16x16 < i_satd_thresh )
+        {
+            h->mb.i_type = I_16x16;
+            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
+            a->i_sad_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+        }
+        else
+            a->i_sad_i16x16 = a->i_sad_i16x16 * f8_satd_rd_ratio >> 8;
+    }
+    else
      {
-        if( res->i_sad_i16x16 > 2*i_cost_inter )
+        if( h->sh.i_type == SLICE_TYPE_B )
+            /* cavlc mb type prefix */
+            a->i_sad_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
+        if( a->b_fast_intra && a->i_sad_i16x16 > 2*i_cost_inter )
              return;
      }
  
      /* 4x4 prediction selection */
      if( flags & X264_ANALYSE_I4x4 )
      {
-        res->i_sad_i4x4 = 0;
+        a->i_sad_i4x4 = 0;
          for( idx = 0; idx < 16; idx++ )
          {
              uint8_t *p_src_by;
@@ -449,46 +535,62 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
  
              i_best = COST_MAX;
              predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+
+            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
+                /* emulate missing topright samples */
+                *(uint32_t*) &p_dst_by[4 - i_stride] = p_dst_by[3 - i_stride] * 0x01010101U;
+
              for( i = 0; i < i_max; i++ )
              {
                  int i_sad;
                  int i_mode;
  
                  i_mode = predict_mode[i];
-
-                /* we do the prediction */
                  h->predict_4x4[i_mode]( p_dst_by, i_stride );
  
-                /* we calculate diff and get the square sum of the diff */
-                i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
-                                                 p_src_by, i_stride );
-
-                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
+                i_sad = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, i_stride,
+                                                  p_src_by, i_stride )
+                      + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
  
-                /* if i_score is lower it is better */
                  if( i_best > i_sad )
                  {
-                    res->i_predict4x4[x][y] = i_mode;
+                    a->i_predict4x4[x][y] = i_mode;
                      i_best = i_sad;
                  }
              }
-            res->i_sad_i4x4 += i_best;
+            a->i_sad_i4x4 += i_best;
  
              /* we need to encode this block now (for next ones) */
-            h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
-            x264_mb_encode_i4x4( h, idx, res->i_qp );
+            h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by, i_stride );
+            x264_mb_encode_i4x4( h, idx, a->i_qp );
  
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = res->i_predict4x4[x][y];
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
+        }
+
+        a->i_sad_i4x4 += a->i_lambda * 24;    /* from JVT (SATD0) */
+        if( a->b_mbrd )
+        {
+            if( h->mb.b_chroma_me )
+                a->i_sad_i4x4 += a->i_sad_i8x8chroma;
+            if( a->i_sad_i4x4 < i_satd_thresh )
+            {
+                h->mb.i_type = I_4x4;
+                a->i_sad_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
+            }
+            else
+                a->i_sad_i4x4 = a->i_sad_i4x4 * f8_satd_rd_ratio >> 8;
+        }
+        else
+        {
+            if( h->sh.i_type == SLICE_TYPE_B )
+                a->i_sad_i4x4 += a->i_lambda * i_mb_b_cost_table[I_4x4];
          }
-        res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
-        if( h->sh.i_type == SLICE_TYPE_B )
-            res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
      }
  
      /* 8x8 prediction selection */
      if( flags & X264_ANALYSE_I8x8 )
      {
-        res->i_sad_i8x8 = 0;
+        a->i_sad_i8x8 = 0;
          for( idx = 0; idx < 4; idx++ )
          {
              uint8_t *p_src_by;
@@ -514,82 +616,45 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
                  i_mode = predict_mode[i];
                  h->predict_8x8[i_mode]( p_dst_by, i_stride, h->mb.i_neighbour );
  
-                i_sad = h->pixf.satd[PIXEL_8x8]( p_dst_by, i_stride,
-                                                 p_src_by, i_stride );
-
-                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
+                /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
+                i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, i_stride,
+                                                  p_src_by, i_stride )
+                      + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
  
                  if( i_best > i_sad )
                  {
-                    res->i_predict8x8[x][y] = i_mode;
+                    a->i_predict8x8[x][y] = i_mode;
                      i_best = i_sad;
                  }
              }
-            res->i_sad_i8x8 += i_best;
+            a->i_sad_i8x8 += i_best;
  
              /* we need to encode this block now (for next ones) */
-            h->predict_8x8[res->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
-            x264_mb_encode_i8x8( h, idx, res->i_qp );
+            h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
+            x264_mb_encode_i8x8( h, idx, a->i_qp );
  
-            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, res->i_predict4x4[x][y] );
+            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
          }
-//      res->i_sad_i8x8 += res->i_lambda * something;    // FIXME
-        if( h->sh.i_type == SLICE_TYPE_B )
-            res->i_sad_i8x8 += res->i_lambda * i_mb_b_cost_table[I_8x8];
-    }
-}
-
-static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
-{
-    int i;
-
-    int i_max;
-    int predict_mode[9];
-
-    uint8_t *p_dstc[2], *p_srcc[2];
-    int      i_stride[2];
-
-    if( res->i_sad_i8x8chroma < COST_MAX )
-        return;
-
-    /* 8x8 prediction selection for chroma */
-    p_dstc[0] = h->mb.pic.p_fdec[1];
-    p_dstc[1] = h->mb.pic.p_fdec[2];
-    p_srcc[0] = h->mb.pic.p_fenc[1];
-    p_srcc[1] = h->mb.pic.p_fenc[2];
  
-    i_stride[0] = h->mb.pic.i_stride[1];
-    i_stride[1] = h->mb.pic.i_stride[2];
-
-    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    res->i_sad_i8x8chroma = COST_MAX;
-    for( i = 0; i < i_max; i++ )
-    {
-        int i_sad;
-        int i_mode;
-
-        i_mode = predict_mode[i];
-
-        /* we do the prediction */
-        h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
-        h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
-
-        /* we calculate the cost */
-        i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
-                                         p_srcc[0], i_stride[0] ) +
-                h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
-                                         p_srcc[1], i_stride[1] ) +
-                res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
-
-        /* if i_score is lower it is better */
-        if( res->i_sad_i8x8chroma > i_sad )
+        if( a->b_mbrd )
+        {
+            if( h->mb.b_chroma_me )
+                a->i_sad_i8x8 += a->i_sad_i8x8chroma;
+            if( a->i_sad_i8x8 < i_satd_thresh )
+            {
+                h->mb.i_type = I_8x8;
+                a->i_sad_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+            }
+            else
+                a->i_sad_i8x8 = a->i_sad_i8x8 * f8_satd_rd_ratio >> 8;
+        }
+        else
          {
-            res->i_predict8x8chroma = i_mode;
-            res->i_sad_i8x8chroma   = i_sad;
+            // FIXME some bias like in i4x4?
+            if( h->sh.i_type == SLICE_TYPE_B )
+                a->i_sad_i8x8 += a->i_lambda * i_mb_b_cost_table[I_8x8];
          }
      }
-
-    h->mb.i_chroma_pred_mode = res->i_predict8x8chroma;
  }
  
  #define LOAD_FENC( m, src, xoff, yoff) \
@@ -645,11 +710,22 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
          h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
      }
  
-    /* subtract ref cost, so we don't have to add it for the other P types */
-    a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
-
      /* Set global ref, needed for all others modes */
      x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+
+    if( a->b_mbrd )
+    {
+        a->i_best_satd = a->l0.me16x16.cost;
+        h->mb.i_type = P_L0;
+        h->mb.i_partition = D_16x16;
+        x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+        a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+    else
+    {
+        /* subtract ref cost, so we don't have to add it for the other P types */
+        a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
+    }
  }
  
  static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
@@ -692,7 +768,16 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
      }
  
      a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
-                   a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost8x8 )
+            a->i_best_satd = a->l0.i_cost8x8;
+        h->mb.i_type = P_8x8;
+        h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+        h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
+        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
  }
  
  static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
@@ -727,6 +812,13 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
      }
  
      a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost16x8 )
+            a->i_best_satd = a->l0.i_cost16x8;
+        h->mb.i_type = P_L0;
+        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
  }
  
  static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
@@ -761,6 +853,13 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
      }
  
      a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost8x16 )
+            a->i_best_satd = a->l0.i_cost8x16;
+        h->mb.i_type = P_L0;
+        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
  }
  
  static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
@@ -791,8 +890,8 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
          CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
      }
  
-    return h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
-         + h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
+    return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
+         + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
  }
  
  static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
@@ -927,7 +1026,7 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
          const int off = 8 * x8 + 8 * i_stride * y8;
          a->i_cost16x16direct +=
          a->i_cost8x8direct[i] =
-            h->pixf.satd[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
+            h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
  
          /* mb type cost */
          a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
@@ -939,10 +1038,10 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
  #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
      { \
          if( h->param.analyse.b_weighted_bipred ) \
-            h->pixf.avg_weight[size]( pix1, stride1, src2, stride2, \
+            h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
                      h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
          else \
-            h->pixf.avg[size]( pix1, stride1, src2, stride2 ); \
+            h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
      }
  
  static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
@@ -1052,12 +1151,12 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
      }
  
      if( h->param.analyse.b_weighted_bipred )
-        h->pixf.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
+        h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
                  h->mb.bipred_weight[pix1_ref][src2_ref] );
      else
-        h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
+        h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
  
-    a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
+    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
                       + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
                                       + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
                       + a->l0.me16x16.cost_mv
@@ -1069,6 +1168,36 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
      a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
  }
  
+/* Hand the motion vectors found for 8x8 quadrant i (0..3) to
+ * x264_macroblock_cache_mv, one call per sub-block, following the
+ * sub-partition type stored in h->mb.i_sub_partition[i].
+ *   h: encoder context (supplies the sub-partition type)
+ *   a: analysis results; only the a->l0.me* search results are read
+ *   i: quadrant index; i%2 selects the column and i/2 the row
+ * NOTE(review): the x/y/width/height arguments look like units of 4x4 blocks
+ * and the literal 0 like the reference list index (L0) - confirm against
+ * x264_macroblock_cache_mv, which is defined elsewhere. */
+static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
+{
+    /* origin of quadrant i: each quadrant is 2 units wide and 2 units high */
+    const int x = 2*(i%2);
+    const int y = 2*(i/2);
+
+    switch( h->mb.i_sub_partition[i] )
+    {
+        case D_L0_8x8:
+            /* a single vector covers the whole quadrant */
+            x264_macroblock_cache_mv( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
+            break;
+        case D_L0_8x4:
+            /* two vectors: upper half (y+0), then lower half (y+1) */
+            x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv[0], a->l0.me8x4[i][0].mv[1] );
+            x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv[0], a->l0.me8x4[i][1].mv[1] );
+            break;
+        case D_L0_4x8:
+            /* two vectors: left half (x+0), then right half (x+1) */
+            x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv[0], a->l0.me4x8[i][0].mv[1] );
+            x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv[0], a->l0.me4x8[i][1].mv[1] );
+            break;
+        case D_L0_4x4:
+            /* four vectors, visited row by row: (x,y) (x+1,y) (x,y+1) (x+1,y+1) */
+            x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv[0], a->l0.me4x4[i][0].mv[1] );
+            x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv[0], a->l0.me4x4[i][1].mv[1] );
+            x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv[0], a->l0.me4x4[i][2].mv[1] );
+            x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv[0], a->l0.me4x4[i][3].mv[1] );
+            break;
+        default:
+            /* any other sub-partition type is only logged; nothing is cached */
+            x264_log( h, X264_LOG_ERROR, "internal error\n" );
+            break;
+    }
+}
+
  #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
      if( x264_mb_partition_listX_table[0][part] ) \
      { \
@@ -1168,7 +1297,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
          }
  
          WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
-        i_part_cost_bi += h->pixf.satd[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
+        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
                          + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
          a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
          a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
@@ -1245,7 +1374,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
          }
  
          WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
-        i_part_cost_bi += h->pixf.satd[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
+        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
  
          i_part_cost = a->l0.me16x8[i].cost;
          a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
@@ -1315,7 +1444,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
          }
  
          WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
-        i_part_cost_bi += h->pixf.satd[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
+        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
  
          i_part_cost = a->l0.me8x16[i].cost;
          a->i_mb_partition8x16[i] = D_L0_8x8;
@@ -1342,6 +1471,52 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
      a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
  }
  
+/* Pick the transform size for the current macroblock without a full RD pass.
+ * 8x8 is only considered when the encoder parameter enables it, the current
+ * mb type is not intra, and x264_mb_transform_8x8_allowed() agrees; the
+ * outcome of that test is stored in h->mb.cache.b_transform_8x8_allowed.
+ * When 8x8 is not allowed, h->mb.b_transform_8x8 is left untouched. */
+static inline void x264_mb_analyse_transform( x264_t *h )
+{
+    h->mb.cache.b_transform_8x8_allowed =
+        h->param.analyse.b_transform_8x8
+        && !IS_INTRA( h->mb.i_type ) && x264_mb_transform_8x8_allowed( h );
+
+    if( h->mb.cache.b_transform_8x8_allowed )
+    {
+        int i_cost4, i_cost8;
+        /* FIXME only luma mc is needed */
+        x264_mb_mc( h );
+
+        /* score plane 0 of the source (p_fenc) against p_fdec with both metrics:
+         * sa8d stands in for the 8x8 transform, satd for the 4x4 one */
+        i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                             h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+        i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                             h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        /* 8x8 wins only when strictly cheaper; a tie keeps 4x4 */
+        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
+    }
+}
+
+/* RD-based transform size decision: try the opposite of the current
+ * h->mb.b_transform_8x8 setting and keep it if x264_rd_cost_mb() does not
+ * report a higher cost.
+ *   a:      analysis state; supplies i_lambda2, and i_best_satd is rescaled
+ *           when the toggled setting is kept
+ *   i_cost: in: cost of the macroblock with the current setting;
+ *           out: the lower of that and the cost with the toggled setting
+ * Unlike x264_mb_analyse_transform(), intra macroblock types are not
+ * excluded here. */
+static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_cost )
+{
+    h->mb.cache.b_transform_8x8_allowed =
+        h->param.analyse.b_transform_8x8 && x264_mb_transform_8x8_allowed( h );
+
+    if( h->mb.cache.b_transform_8x8_allowed )
+    {
+        int i_cost8;
+        /* NOTE(review): presumably stores the analysed mode in the mb cache so
+         * the RD call below measures it - x264_analyse_update_cache is defined elsewhere */
+        x264_analyse_update_cache( h, a );
+        /* flip the flag: despite its name, i_cost8 is the cost of whichever
+         * setting is the opposite of the one *i_cost was measured with */
+        h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+        /* FIXME only luma is needed, but the score for comparison already includes chroma */
+        i_cost8 = x264_rd_cost_mb( h, a->i_lambda2 );
+
+        if( *i_cost >= i_cost8 )
+        {
+            /* keep the toggled setting (ties included) and scale i_best_satd by
+             * the same ratio; the 64-bit product avoids int overflow and the
+             * guard avoids dividing by zero */
+            if( *i_cost > 0 )
+                a->i_best_satd = (int64_t)a->i_best_satd * i_cost8 / *i_cost;
+            *i_cost = i_cost8;
+        }
+        else
+            /* toggled setting is worse: restore the original flag */
+            h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+    }
+}
+
+
  /*****************************************************************************
   * x264_macroblock_analyse:
   *****************************************************************************/
@@ -1350,11 +1525,8 @@ void x264_macroblock_analyse( x264_t *h )
      x264_mb_analysis_t analysis;
      int i;
  
-    h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp(h);
-
-    /* prevent QP from varying too fast. FIXME what's a sane limit? */
-    h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->mb.qp[h->mb.i_mb_xy],
-                                          h->mb.i_last_qp - 12, h->mb.i_last_qp + 12 );
+    h->mb.i_qp =
+    h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp( h );
  
      /* init analysis */
      x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
@@ -1382,10 +1554,11 @@ void x264_macroblock_analyse( x264_t *h )
          int i_intra_cost, i_intra_type;
  
          /* Fast P_SKIP detection */
-        if( ( h->mb.i_mb_type_left == P_SKIP ) ||
+        if( !h->mb.b_lossless &&
+           (( h->mb.i_mb_type_left == P_SKIP ) ||
              ( h->mb.i_mb_type_top == P_SKIP ) ||
              ( h->mb.i_mb_type_topleft == P_SKIP ) ||
-            ( h->mb.i_mb_type_topright == P_SKIP ) )
+            ( h->mb.i_mb_type_topright == P_SKIP )))
          {
              b_skip = x264_macroblock_probe_pskip( h );
          }
@@ -1400,6 +1573,7 @@ void x264_macroblock_analyse( x264_t *h )
              const unsigned int flags = h->param.analyse.inter;
              int i_type;
              int i_partition;
+            int i_thresh16x8;
  
              x264_mb_analyse_load_costs( h, &analysis );
  
@@ -1419,46 +1593,64 @@ void x264_macroblock_analyse( x264_t *h )
  
                  i_type = P_8x8;
                  i_partition = D_8x8;
-                h->mb.i_sub_partition[0] = D_L0_8x8;
-                h->mb.i_sub_partition[1] = D_L0_8x8;
-                h->mb.i_sub_partition[2] = D_L0_8x8;
-                h->mb.i_sub_partition[3] = D_L0_8x8;
+                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
  
                  i_cost = analysis.l0.i_cost8x8;
  
                  /* Do sub 8x8 */
                  if( flags & X264_ANALYSE_PSUB8x8 )
                  {
+                    int i_cost_bak = i_cost;
+                    int b_sub8x8 = 0;
                      for( i = 0; i < 4; i++ )
                      {
                          x264_mb_analyse_inter_p4x4( h, &analysis, i );
                          if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
                          {
-                            int i_cost8x8;
-
+                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                              h->mb.i_sub_partition[i] = D_L0_4x4;
-                            i_cost8x8 = analysis.l0.i_cost4x4[i];
  
                              x264_mb_analyse_inter_p8x4( h, &analysis, i );
-                            if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
+                            if( analysis.l0.i_cost8x4[i] < i_cost8x8 )
                              {
                                  h->mb.i_sub_partition[i] = D_L0_8x4;
                                  i_cost8x8 = analysis.l0.i_cost8x4[i];
                              }
  
                              x264_mb_analyse_inter_p4x8( h, &analysis, i );
-                            if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
+                            if( analysis.l0.i_cost4x8[i] < i_cost8x8 )
                              {
                                  h->mb.i_sub_partition[i] = D_L0_4x8;
                                  i_cost8x8 = analysis.l0.i_cost4x8[i];
                              }
  
                              i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
+                            b_sub8x8 = 1;
+                        }
+                        x264_mb_cache_mv_p8x8( h, &analysis, i );
+                    }
+                    /* TODO: RD per subpartition */
+                    if( b_sub8x8 && analysis.b_mbrd )
+                    {
+                        i_cost = x264_rd_cost_mb( h, analysis.i_lambda2 );
+                        if( i_cost > i_cost_bak )
+                        {
+                            i_cost = i_cost_bak;
+                            h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+                            h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                          }
                      }
                  }
+            }
  
-                /* Now do sub 16x8/8x16 */
+            /* Now do 16x8/8x16 */
+            i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
+            if( analysis.b_mbrd )
+                i_thresh16x8 = i_thresh16x8 * analysis.i_lambda2 / analysis.i_lambda;
+            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
+                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
+            {
                  x264_mb_analyse_inter_p16x8( h, &analysis );
                  if( analysis.l0.i_cost16x8 < i_cost )
                  {
@@ -1476,28 +1668,33 @@ void x264_macroblock_analyse( x264_t *h )
                  }
              }
  
-            h->mb.i_type = i_type;
              h->mb.i_partition = i_partition;
  
              /* refine qpel */
-            if( h->mb.i_partition == D_16x16 )
+            //FIXME mb_type costs?
+            if( analysis.b_mbrd )
+            {
+                h->mb.i_type = i_type;
+                x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
+            }
+            else if( i_partition == D_16x16 )
              {
                  x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                  i_cost = analysis.l0.me16x16.cost;
              }
-            else if( h->mb.i_partition == D_16x8 )
+            else if( i_partition == D_16x8 )
              {
                  x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                  x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                  i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
              }
-            else if( h->mb.i_partition == D_8x16 )
+            else if( i_partition == D_8x16 )
              {
                  x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                  x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                  i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
              }
-            else if( h->mb.i_partition == D_8x8 )
+            else if( i_partition == D_8x8 )
              {
                  int i8x8;
                  i_cost = 0;
@@ -1540,18 +1737,25 @@ void x264_macroblock_analyse( x264_t *h )
              }
  
              x264_mb_analyse_intra( h, &analysis, i_cost );
-            if( h->mb.b_chroma_me &&
+            if( h->mb.b_chroma_me && !analysis.b_mbrd &&
                  ( analysis.i_sad_i16x16 < i_cost
-             || ( analysis.i_sad_i4x4 < i_cost )))
+               || analysis.i_sad_i8x8 < i_cost
+               || analysis.i_sad_i4x4 < i_cost ))
              {
                  x264_mb_analyse_intra_chroma( h, &analysis );
                  analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma;
+                analysis.i_sad_i8x8 += analysis.i_sad_i8x8chroma;
                  analysis.i_sad_i4x4 += analysis.i_sad_i8x8chroma;
              }
  
              i_intra_type = I_16x16;
              i_intra_cost = analysis.i_sad_i16x16;
  
+            if( analysis.i_sad_i8x8 < i_intra_cost )
+            {
+                i_intra_type = I_8x8;
+                i_intra_cost = analysis.i_sad_i8x8;
+            }
              if( analysis.i_sad_i4x4 < i_intra_cost )
              {
                  i_intra_type = I_4x4;
@@ -1560,10 +1764,11 @@ void x264_macroblock_analyse( x264_t *h )
  
              if( i_intra_cost < i_cost )
              {
-                h->mb.i_type = i_intra_type;
+                i_type = i_intra_type;
                  i_cost = i_intra_cost;
              }
  
+            h->mb.i_type = i_type;
              h->stat.frame.i_intra_cost += i_intra_cost;
              h->stat.frame.i_inter_cost += i_cost;
          }
@@ -1580,7 +1785,7 @@ void x264_macroblock_analyse( x264_t *h )
  
              /* Conditioning the probe on neighboring block types
               * doesn't seem to help speed or quality. */
-            b_skip = x264_macroblock_probe_bskip( h );
+            b_skip = !h->mb.b_lossless && x264_macroblock_probe_bskip( h );
          }
  
          if( !b_skip )
@@ -1740,6 +1945,11 @@ void x264_macroblock_analyse( x264_t *h )
                  h->mb.i_type = I_16x16;
                  i_cost = analysis.i_sad_i16x16;
              }
+            if( analysis.i_sad_i8x8 < i_cost )
+            {
+                h->mb.i_type = I_8x8;
+                i_cost = analysis.i_sad_i8x8;
+            }
              if( analysis.i_sad_i4x4 < i_cost )
              {
                  h->mb.i_type = I_4x4;
@@ -1748,48 +1958,56 @@ void x264_macroblock_analyse( x264_t *h )
          }
      }
  
-    /*-------------------- Update MB from the analysis ----------------------*/
-    h->mb.type[h->mb.i_mb_xy] = x264_mb_type_fix[h->mb.i_type];
+    x264_analyse_update_cache( h, &analysis );
+
+    if( !analysis.b_mbrd )
+        x264_mb_analyse_transform( h );
+}
+
+/*-------------------- Update MB from the analysis ----------------------*/
+static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
+{
+    int i;
+
      switch( h->mb.i_type )
      {
          case I_4x4:
              for( i = 0; i < 16; i++ )
              {
                  h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
-                    analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
+                    a->i_predict4x4[block_idx_x[i]][block_idx_y[i]];
              }
  
-            x264_mb_analyse_intra_chroma( h, &analysis );
+            x264_mb_analyse_intra_chroma( h, a );
              break;
          case I_8x8:
-            h->mb.b_transform_8x8 = 1;
              for( i = 0; i < 4; i++ )
                  x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1),
-                    analysis.i_predict8x8[i&1][i>>1] );
+                    a->i_predict8x8[i&1][i>>1] );
  
-            x264_mb_analyse_intra_chroma( h, &analysis );
+            x264_mb_analyse_intra_chroma( h, a );
              break;
          case I_16x16:
-            h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
-            x264_mb_analyse_intra_chroma( h, &analysis );
+            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
+            x264_mb_analyse_intra_chroma( h, a );
              break;
  
          case P_L0:
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
              switch( h->mb.i_partition )
              {
                  case D_16x16:
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
                      break;
  
                  case D_16x8:
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
                      break;
  
                  case D_8x16:
-                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
                      break;
  
                  default:
@@ -1799,36 +2017,9 @@ void x264_macroblock_analyse( x264_t *h )
              break;
  
          case P_8x8:
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
              for( i = 0; i < 4; i++ )
-            {
-                const int x = 2*(i%2);
-                const int y = 2*(i/2);
-
-                switch( h->mb.i_sub_partition[i] )
-                {
-                    case D_L0_8x8:
-                        x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
-                        break;
-                    case D_L0_8x4:
-                        x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
-                        x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
-                        break;
-                    case D_L0_4x8:
-                        x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
-                        x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
-                        break;
-                    case D_L0_4x4:
-                        x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
-                        x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
-                        x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
-                        x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
-                        break;
-                    default:
-                        x264_log( h, X264_LOG_ERROR, "internal error\n" );
-                        break;
-                }
-            }
+                x264_mb_cache_mv_p8x8( h, a, i );
              break;
  
          case P_SKIP:
@@ -1855,7 +2046,7 @@ void x264_macroblock_analyse( x264_t *h )
          case B_8x8:
              /* optimize: cache might not need to be rewritten */
              for( i = 0; i < 4; i++ )
-                x264_mb_cache_mv_b8x8( h, &analysis, i, 1 );
+                x264_mb_cache_mv_b8x8( h, a, i, 1 );
              break;
  
          default: /* the rest of the B types */
@@ -1865,8 +2056,8 @@ void x264_macroblock_analyse( x264_t *h )
                  switch( h->mb.i_type )
                  {
                  case B_L0_L0:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
  
                      x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                      x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
@@ -1877,25 +2068,25 @@ void x264_macroblock_analyse( x264_t *h )
                      x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
                      x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
  
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
                      break;
                  case B_BI_BI:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
  
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
                      break;
                  }
                  break;
              case D_16x8:
-                x264_mb_cache_mv_b16x8( h, &analysis, 0, 1 );
-                x264_mb_cache_mv_b16x8( h, &analysis, 1, 1 );
+                x264_mb_cache_mv_b16x8( h, a, 0, 1 );
+                x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                  break;
              case D_8x16:
-                x264_mb_cache_mv_b8x16( h, &analysis, 0, 1 );
-                x264_mb_cache_mv_b8x16( h, &analysis, 1, 1 );
+                x264_mb_cache_mv_b8x16( h, a, 0, 1 );
+                x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                  break;
              default:
                  x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );