Fix disabling of progress with --log-level

[x264] / encoder / analyse.c
diff --git a/encoder/analyse.c b/encoder/analyse.c
index f14a19b5340acee329eeef4d624fb41e1334e4e0..44543731ff1d5feb89a931e2a5dc9c8e770e37dc 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -134,25 +134,27 @@ typedef struct
  } x264_mb_analysis_t;
  
  /* lambda = pow(2,qp/6-2) */
-const uint8_t x264_lambda_tab[52] = {
-   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
-   1, 1, 1, 1,              /*  8-11 */
-   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
-   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
-   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
-  16,18,20,23,25,29,32,36,  /* 36-43 */
-  40,45,51,57,64,72,81,91   /* 44-51 */
+const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = {
+   1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
+   1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
+   2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
+   4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
+  10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
+  25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
+  64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
+ 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
  };
  
  /* lambda2 = pow(lambda,2) * .9 * 256 */
-const int x264_lambda2_tab[52] = {
-    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
-    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
-   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
-  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
- 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
-148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
-943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
+const int x264_lambda2_tab[QP_MAX_MAX+1] = {
+     14,     18,     22,      28,      36,      45,      57,      72, /*  0- 7 */
+     91,    115,    145,     182,     230,     290,     365,     460, /*  8-15 */
+    580,    731,    921,    1161,    1462,    1843,    2322,    2925, /* 16-23 */
+   3686,   4644,   5851,    7372,    9289,   11703,   14745,   18578, /* 24-31 */
+  23407,  29491,  37156,   46814,   58982,   74313,   93628,  117964, /* 32-39 */
+ 148626, 187257, 235929,  297252,  374514,  471859,  594505,  749029, /* 40-47 */
+ 943718,1189010,1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
+5992238,7549747,9512085,11984476,15099494,19024170,23968953,30198988, /* 56-63 */
  };
  
  const uint8_t x264_exp2_lut[64] = {
@@ -188,27 +190,31 @@ const float x264_log2_lz_lut[32] = {
  
  // should the intra and inter lambdas be different?
  // I'm just matching the behaviour of deadzone quant.
-static const int x264_trellis_lambda2_tab[2][52] = {
+static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = {
      // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    46,      58,      73,      92,     117,     147,
-        185,     233,     294,     370,     466,     587,
-        740,     932,    1174,    1480,    1864,    2349,
-       2959,    3728,    4697,    5918,    7457,    9395,
-      11837,   14914,   18790,   23674,   29828,   37581,
-      47349,   59656,   75163,   94699,  119313,  150326,
-     189399,  238627,  300652,  378798,  477255,  601304,
-     757596,  954511, 1202608, 1515192, 1909022, 2405217,
-    3030384, 3818045, 4810435, 6060769 },
+    {      46,      58,      73,      92,     117,     147,
+          185,     233,     294,     370,     466,     587,
+          740,     932,    1174,    1480,    1864,    2349,
+         2959,    3728,    4697,    5918,    7457,    9395,
+        11837,   14914,   18790,   23674,   29828,   37581,
+        47349,   59656,   75163,   94699,  119313,  150326,
+       189399,  238627,  300652,  378798,  477255,  601304,
+       757596,  954511, 1202608, 1515192, 1909022, 2405217,
+      3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
+     12121539,15272182,19241743,24243077,30544363,38483486,
+     48486154,61088726,76966972,96972308 },
      // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    27,      34,      43,      54,      68,      86,
-        108,     136,     172,     216,     273,     343,
-        433,     545,     687,     865,    1090,    1374,
-       1731,    2180,    2747,    3461,    4361,    5494,
-       6922,    8721,   10988,   13844,   17442,   21976,
-      27688,   34885,   43953,   55377,   69771,   87906,
-     110755,  139543,  175813,  221511,  279087,  351627,
-     443023,  558174,  703255,  886046, 1116348, 1406511,
-    1772093, 2232697, 2813022, 3544186 }
+    {      27,      34,      43,      54,      68,      86,
+          108,     136,     172,     216,     273,     343,
+          433,     545,     687,     865,    1090,    1374,
+         1731,    2180,    2747,    3461,    4361,    5494,
+         6922,    8721,   10988,   13844,   17442,   21976,
+        27688,   34885,   43953,   55377,   69771,   87906,
+       110755,  139543,  175813,  221511,  279087,  351627,
+       443023,  558174,  703255,  886046, 1116348, 1406511,
+      1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
+      7088374, 8930791,11252092,14176748,17861583,22504184,
+     28353495,35723165,45008368,56706990 }
  };
  
  static const uint16_t x264_chroma_lambda2_offset_tab[] = {
@@ -237,7 +243,7 @@ static const uint8_t i_sub_mb_p_cost_table[4] = {
  
  static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
  
-static uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[LAMBDA_MAX+1][3][33];
  static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
  
  int x264_analyse_init_costs( x264_t *h, int qp )
@@ -275,7 +281,7 @@ fail:
  
  void x264_analyse_free_costs( x264_t *h )
  {
-    for( int i = 0; i < 92; i++ )
+    for( int i = 0; i < LAMBDA_MAX+1; i++ )
      {
          if( h->cost_mv[i] )
              x264_free( h->cost_mv[i] - 2*4*2048 );
@@ -295,7 +301,7 @@ void x264_analyse_weight_frame( x264_t *h, int end )
              int width = frame->i_width[0] + 2*PADH;
              int i_padv = PADV << h->param.b_interlaced;
              int offset, height;
-            uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
+            pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
              height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
              offset = h->fenc->i_lines_weighted*frame->i_stride[0];
              h->fenc->i_lines_weighted += height;
@@ -303,7 +309,7 @@ void x264_analyse_weight_frame( x264_t *h, int end )
                  for( int k = j; k < h->i_ref0; k++ )
                      if( h->sh.weight[k][0].weightfn )
                      {
-                        uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                          x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                   src + offset, frame->i_stride[0],
                                                   width, height, &h->sh.weight[k][0] );
@@ -550,7 +556,7 @@ static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
  /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
  static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
  {
-    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
+    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
  
      if( do_both_dct || h->mb.b_transform_8x8 )
          h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
@@ -573,15 +579,13 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
  
  static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
  {
-    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
-
      if( a->i_satd_i8x8chroma < COST_MAX )
          return;
  
      const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
  
      /* 8x8 prediction selection for chroma */
-    if( predict_mode[3] >= 0 && b_merged_satd )
+    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
      {
          int satdu[4], satdv[4];
          h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
@@ -632,27 +636,43 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
  static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
  {
      const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
-    uint8_t  *p_src = h->mb.pic.p_fenc[0];
-    uint8_t  *p_dst = h->mb.pic.p_fdec[0];
+    pixel *p_src = h->mb.pic.p_fenc[0];
+    pixel *p_dst = h->mb.pic.p_fdec[0];
+    static const int8_t intra_analysis_shortcut[2][2][5] =
+    {{{I_PRED_4x4_HU, -1},
+      {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1}},
+     {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1},
+      {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}};
  
      int idx;
-    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
+    int lambda = a->i_lambda;
  
      /*---------------- Try all mode and calculate their score ---------------*/
  
      /* 16x16 prediction selection */
      const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
  
-    if( b_merged_satd && predict_mode[3] >= 0 )
+    /* Not heavily tuned */
+    static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
+    int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
+
+    if( !h->mb.b_lossless && predict_mode[3] >= 0 )
      {
          h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
-        h->predict_16x16[I_PRED_16x16_P]( p_dst );
-        a->i_satd_i16x16_dir[I_PRED_16x16_P] =
-            h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
-        for( int i = 0; i < 4; i++ )
+        a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
+        a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
+        a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
+        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
+        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
+        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
+
+        /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
+        if( a->i_satd_i16x16 <= i16x16_thresh )
          {
-            int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
-            COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
+            h->predict_16x16[I_PRED_16x16_P]( p_dst );
+            a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+            a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
+            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
          }
      }
      else
@@ -668,7 +688,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                  h->predict_16x16[i_mode]( p_dst );
  
              i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
-                    a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
+                     lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
              COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
              a->i_satd_i16x16_dir[i_mode] = i_satd;
          }
@@ -676,51 +696,56 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
  
      if( h->sh.i_type == SLICE_TYPE_B )
          /* cavlc mb type prefix */
-        a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
+        a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
  
-    /* Not heavily tuned */
-    const uint8_t i16x16_thresh[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
-    if( a->b_fast_intra && a->i_satd_i16x16 > (i16x16_thresh[h->mb.i_subpel_refine]*i_satd_inter)>>1 )
+    if( a->i_satd_i16x16 > i16x16_thresh )
          return;
  
      /* 8x8 prediction selection */
      if( flags & X264_ANALYSE_I8x8 )
      {
-        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
+        ALIGNED_ARRAY_16( pixel, edge,[33] );
          x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
          int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
  
          // FIXME some bias like in i4x4?
-        int i_cost = a->i_lambda * 4; /* base predmode costs */
+        int i_cost = lambda * 4; /* base predmode costs */
          h->mb.i_cbp_luma = 0;
-        b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
  
          if( h->sh.i_type == SLICE_TYPE_B )
-            i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
+            i_cost += lambda * i_mb_b_cost_table[I_8x8];
  
          for( idx = 0;; idx++ )
          {
              int x = idx&1;
              int y = idx>>1;
-            uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
-            uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
+            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
+            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
              int i_best = COST_MAX;
              int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
  
              predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
              h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
  
-            if( b_merged_satd && predict_mode[8] >= 0 )
+            if( !h->mb.b_lossless && predict_mode[5] >= 0 )
              {
                  int satd[9];
                  h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
-                satd[i_pred_mode] -= 3 * a->i_lambda;
+                int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
+                satd[i_pred_mode] -= 3 * lambda;
                  for( int i = 2; i >= 0; i-- )
                  {
-                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i];
+                    int cost = satd[i];
+                    a->i_satd_i8x8_dir[i][idx] = cost + 4 * lambda;
                      COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                  }
-                predict_mode += 3;
+
+                /* Take analysis shortcuts: don't analyse modes that are too
+                 * far away direction-wise from the favored mode. */
+                if( a->i_mbrd < 1 + a->b_fast_intra )
+                    predict_mode = intra_analysis_shortcut[predict_mode[8] >= 0][favor_vertical];
+                else
+                    predict_mode += 3;
              }
  
              for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
@@ -735,12 +760,12 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
  
                  i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                  if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
-                    i_satd -= 3 * a->i_lambda;
+                    i_satd -= 3 * lambda;
  
                  COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
-                a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * a->i_lambda;
+                a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * lambda;
              }
-            i_cost += i_best + 3 * a->i_lambda;
+            i_cost += i_best + 3 * lambda;
  
              if( idx == 3 || i_cost > i_satd_thresh )
                  break;
@@ -774,7 +799,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
              i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
          }
          /* Not heavily tuned */
-        const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
+        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
          if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
              return;
      }
@@ -782,20 +807,20 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
      /* 4x4 prediction selection */
      if( flags & X264_ANALYSE_I4x4 )
      {
-        int i_cost = a->i_lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
+        int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
          int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
          h->mb.i_cbp_luma = 0;
-        b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
+
          if( a->i_mbrd )
              i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
  
          if( h->sh.i_type == SLICE_TYPE_B )
-            i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
+            i_cost += lambda * i_mb_b_cost_table[I_4x4];
  
          for( idx = 0;; idx++ )
          {
-            uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
-            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
+            pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
+            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
              int i_best = COST_MAX;
              int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
  
@@ -803,16 +828,23 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
  
              if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                  /* emulate missing topright samples */
-                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
  
-            if( b_merged_satd && predict_mode[5] >= 0 )
+            if( !h->mb.b_lossless && predict_mode[5] >= 0 )
              {
                  int satd[9];
                  h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
-                satd[i_pred_mode] -= 3 * a->i_lambda;
+                int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
+                satd[i_pred_mode] -= 3 * lambda;
                  for( int i = 2; i >= 0; i-- )
                      COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
-                predict_mode += 3;
+
+                /* Take analysis shortcuts: don't analyse modes that are too
+                 * far away direction-wise from the favored mode. */
+                if( a->i_mbrd < 1 + a->b_fast_intra )
+                    predict_mode = intra_analysis_shortcut[predict_mode[8] >= 0][favor_vertical];
+                else
+                    predict_mode += 3;
              }
  
              if( i_best > 0 )
@@ -830,7 +862,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                      i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                      if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                      {
-                        i_satd -= a->i_lambda * 3;
+                        i_satd -= lambda * 3;
                          if( i_satd <= 0 )
                          {
                              i_best = i_satd;
@@ -842,7 +874,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                      COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
                  }
              }
-            i_cost += i_best + 3 * a->i_lambda;
+            i_cost += i_best + 3 * lambda;
  
              if( i_cost > i_satd_thresh || idx == 15 )
                  break;
@@ -906,9 +938,7 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
  
  static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  {
-    uint8_t  *p_dst = h->mb.pic.p_fdec[0];
-
-    int x, y;
+    pixel *p_dst = h->mb.pic.p_fdec[0];
      uint64_t i_satd, i_best;
      h->mb.i_skip_intra = 0;
  
@@ -975,18 +1005,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  
      if( h->mb.i_type == I_4x4 )
      {
-        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
+        pixel4 pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
          int i_nnz = 0;
          for( int idx = 0; idx < 16; idx++ )
          {
-            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
+            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
              i_best = COST_MAX64;
  
              predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
  
              if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                  /* emulate missing topright samples */
-                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
  
              for( ; *predict_mode >= 0; predict_mode++ )
              {
@@ -1001,18 +1031,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                  {
                      a->i_predict4x4[idx] = i_mode;
                      i_best = i_satd;
-                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
-                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
-                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
-                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
+                    pels[0] = MPIXEL_X4( p_dst_by+0*FDEC_STRIDE );
+                    pels[1] = MPIXEL_X4( p_dst_by+1*FDEC_STRIDE );
+                    pels[2] = MPIXEL_X4( p_dst_by+2*FDEC_STRIDE );
+                    pels[3] = MPIXEL_X4( p_dst_by+3*FDEC_STRIDE );
                      i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                  }
              }
  
-            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
-            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
-            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
-            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
+            MPIXEL_X4( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+            MPIXEL_X4( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+            MPIXEL_X4( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+            MPIXEL_X4( p_dst_by+3*FDEC_STRIDE ) = pels[3];
              h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
  
              h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
@@ -1020,19 +1050,20 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
      }
      else if( h->mb.i_type == I_8x8 )
      {
-        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
+        ALIGNED_ARRAY_16( pixel, edge,[33] );
          for( int idx = 0; idx < 4; idx++ )
          {
-            uint64_t pels_h = 0;
-            uint8_t pels_v[7];
+            pixel4 pels_h[2] = {0};
+            pixel pels_v[7] = {0};
              uint16_t i_nnz[2] = {0}; //shut up gcc
-            uint8_t *p_dst_by;
+            pixel *p_dst_by;
              int cbp_luma_new = 0;
              int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
  
              i_best = COST_MAX64;
-            x = idx&1;
-            y = idx>>1;
+            int x = idx&1;
+            int y = idx>>1;
+            int s8 = X264_SCAN8_0 + 2*x + 16*y;
  
              p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
              predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
@@ -1057,21 +1088,23 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                      cbp_luma_new = h->mb.i_cbp_luma;
                      i_best = i_satd;
  
-                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
+                    pels_h[0] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 );
+                    pels_h[1] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 );
                      if( !(idx&1) )
                          for( int j = 0; j < 7; j++ )
                              pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
-                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
-                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
+                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
+                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
                  }
              }
              a->i_cbp_i8x8_luma = cbp_luma_new;
-            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
+            MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 ) = pels_h[0];
+            MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 ) = pels_h[1];
              if( !(idx&1) )
                  for( int j = 0; j < 7; j++ )
                      p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
-            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
-            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
+            M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
+            M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
  
              x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
          }
@@ -1186,7 +1219,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
  static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
  {
      x264_me_t m;
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fenc = h->mb.pic.p_fenc;
      int i_maxref = h->mb.pic.i_fref[0]-1;
  
      h->mb.i_partition = D_8x8;
@@ -1219,8 +1252,8 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
      for( int i = 0; i < 4; i++ )
      {
          x264_me_t *l0m = &a->l0.me8x8[i];
-        const int x8 = i%2;
-        const int y8 = i/2;
+        int x8 = i&1;
+        int y8 = i>>1;
  
          m.i_pixel = PIXEL_8x8;
  
@@ -1282,7 +1315,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
       * don't bother analysing the dupes. */
      const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
      const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fenc = h->mb.pic.p_fenc;
      int i_mvc;
      int16_t (*mvc)[2] = a->l0.mvc[i_ref];
  
@@ -1295,8 +1328,8 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
      for( int i = 0; i < 4; i++ )
      {
          x264_me_t *m = &a->l0.me8x8[i];
-        const int x8 = i%2;
-        const int y8 = i/2;
+        int x8 = i&1;
+        int y8 = i>>1;
  
          m->i_pixel = PIXEL_8x8;
          m->i_ref_cost = i_ref_cost;
@@ -1334,7 +1367,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
  static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
  {
      x264_me_t m;
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fenc = h->mb.pic.p_fenc;
      ALIGNED_4( int16_t mvc[3][2] );
  
      /* XXX Needed for x264_mb_predict_mv */
@@ -1400,7 +1433,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i
  static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
  {
      x264_me_t m;
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fenc = h->mb.pic.p_fenc;
      ALIGNED_4( int16_t mvc[3][2] );
  
      /* XXX Needed for x264_mb_predict_mv */
@@ -1462,10 +1495,10 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
      a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
  }
  
-static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
+static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
  {
-    ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
-    uint8_t *pix2 = pix1+8;
+    ALIGNED_ARRAY_8( pixel, pix1,[16*8] );
+    pixel *pix2 = pix1+8;
      const int i_stride = h->mb.pic.i_stride[1];
      const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
      const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
@@ -1482,7 +1515,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
          weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
  
  
-    if( pixel == PIXEL_4x4 )
+    if( size == PIXEL_4x4 )
      {
          x264_me_t *m = a->l0.me4x4[i8x8];
          CHROMA4x4MC( 2,2, m[0], 0,0 );
@@ -1490,7 +1523,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
          CHROMA4x4MC( 2,2, m[2], 0,2 );
          CHROMA4x4MC( 2,2, m[3], 2,2 );
      }
-    else if( pixel == PIXEL_8x4 )
+    else if( size == PIXEL_8x4 )
      {
          x264_me_t *m = a->l0.me8x4[i8x8];
          CHROMA4x4MC( 4,2, m[0], 0,0 );
@@ -1509,8 +1542,8 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
  
  static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
  {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+    pixel **p_fenc = h->mb.pic.p_fenc;
      const int i_ref = a->l0.me8x8[i8x8].i_ref;
  
      /* XXX Needed for x264_mb_predict_mv */
@@ -1548,8 +1581,8 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
  
  static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
  {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+    pixel **p_fenc = h->mb.pic.p_fenc;
      const int i_ref = a->l0.me8x8[i8x8].i_ref;
  
      /* XXX Needed for x264_mb_predict_mv */
@@ -1584,8 +1617,8 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
  
  static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
  {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
-    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+    pixel **p_fenc = h->mb.pic.p_fenc;
      const int i_ref = a->l0.me8x8[i8x8].i_ref;
  
      /* XXX Needed for x264_mb_predict_mv */
@@ -1623,8 +1656,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
      /* Assumes that fdec still contains the results of
       * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
  
-    uint8_t *p_fenc = h->mb.pic.p_fenc[0];
-    uint8_t *p_fdec = h->mb.pic.p_fdec[0];
+    pixel *p_fenc = h->mb.pic.p_fenc[0];
+    pixel *p_fdec = h->mb.pic.p_fdec[0];
  
      a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
      if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
@@ -1645,9 +1678,9 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
  
  static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
  {
-    ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
-    ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
-    uint8_t *src0, *src1;
+    ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
+    ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
+    pixel *src0, *src1;
      int stride0 = 16, stride1 = 16;
      int i_ref, i_mvc;
      ALIGNED_4( int16_t mvc[9][2] );
@@ -1776,8 +1809,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
  
  static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
  {
-    const int x = 2*(i%2);
-    const int y = 2*(i/2);
+    int x = 2*(i&1);
+    int y = i&2;
  
      switch( h->mb.i_sub_partition[i] )
      {
@@ -1806,8 +1839,8 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
  
  static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
  {
-    const int x = 2*(idx&1);
-    const int y = 2*(idx>>1);
+    int x = 2*(idx&1);
+    int y = idx&2;
      x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
      x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
      x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
@@ -1842,8 +1875,8 @@ static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
  
  static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
  {
-    int x = (i%2)*2;
-    int y = (i/2)*2;
+    int x = 2*(i&1);
+    int y = i&2;
      if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
      {
          x264_mb_load_mv_direct8x8( h, i );
@@ -1871,7 +1904,7 @@ static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int
  
  static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
  {
-    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
+    ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
      int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
  
      /* early termination: if 16x16 chose ref 0, then evalute no refs older
@@ -1906,12 +1939,12 @@ static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
  
      for( int i = 0; i < 4; i++ )
      {
-        int x8 = i%2;
-        int y8 = i/2;
+        int x8 = i&1;
+        int y8 = i>>1;
          int i_part_cost;
          int i_part_cost_bi;
          int stride[2] = {8,8};
-        uint8_t *src[2];
+        pixel *src[2];
          x264_me_t m;
          m.i_pixel = PIXEL_8x8;
          LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
@@ -1976,10 +2009,10 @@ static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
  
  static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
  {
-    uint8_t **p_fref[2] =
+    pixel **p_fref[2] =
          { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
            h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
-    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
+    ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
  
      /* XXX Needed for x264_mb_predict_mv */
      h->mb.i_partition = D_8x8;
@@ -1988,12 +2021,12 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
  
      for( int i = 0; i < 4; i++ )
      {
-        const int x8 = i%2;
-        const int y8 = i/2;
+        int x8 = i&1;
+        int y8 = i>>1;
          int i_part_cost;
          int i_part_cost_bi = 0;
          int stride[2] = {8,8};
-        uint8_t *src[2];
+        pixel *src[2];
  
          for( int l = 0; l < 2; l++ )
          {
@@ -2046,7 +2079,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
  
  static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
  {
-    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
+    ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
      ALIGNED_4( int16_t mvc[3][2] );
  
      h->mb.i_partition = D_16x8;
@@ -2057,7 +2090,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i
          int i_part_cost;
          int i_part_cost_bi = 0;
          int stride[2] = {16,16};
-        uint8_t *src[2];
+        pixel *src[2];
          x264_me_t m;
          m.i_pixel = PIXEL_16x8;
          LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
@@ -2137,7 +2170,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i
  
  static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
  {
-    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
+    ALIGNED_ARRAY_8( pixel, pix,[2],[8*16] );
      ALIGNED_4( int16_t mvc[3][2] );
  
      h->mb.i_partition = D_8x16;
@@ -2148,7 +2181,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i
          int i_part_cost;
          int i_part_cost_bi = 0;
          int stride[2] = {8,8};
-        uint8_t *src[2];
+        pixel *src[2];
          x264_me_t m;
          m.i_pixel = PIXEL_8x16;
          LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
@@ -2637,9 +2670,8 @@ intra_analysis:
              h->mb.i_partition = D_16x16;
              assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
              /* Set up MVs for future predictors */
-            if( b_skip )
-                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
-                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
+            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
+                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
          }
          else
          {